diff --git a/.asf.yaml b/.asf.yaml index 40b961dc6e885..a1c6434587703 100644 --- a/.asf.yaml +++ b/.asf.yaml @@ -21,10 +21,12 @@ github: collaborators: - anjakefala - benibus - - danepitkin - davisusanibar + - jbonofre - js8544 - vibhatha + - zanmato1984 + - ZhangHuiGui notifications: commits: commits@arrow.apache.org diff --git a/.env b/.env index ab2e4b4fbe7fb..27474b2c73199 100644 --- a/.env +++ b/.env @@ -86,9 +86,6 @@ ARROW_R_DEV=TRUE R_PRUNE_DEPS=FALSE TZ=UTC -# Any non-empty string will install devtoolset-${DEVTOOLSET_VERSION} -DEVTOOLSET_VERSION= - # Used through docker-compose.yml and serves as the default version for the # ci/scripts/install_vcpkg.sh script. Prefer to use short SHAs to keep the # docker tags more readable. diff --git a/.github/dependabot.yml b/.github/dependabot.yml index e96cb8d2eb1e3..7d9ff2f42e887 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -23,30 +23,35 @@ updates: interval: "weekly" commit-message: prefix: "MINOR: [CI] " + open-pull-requests-limit: 10 - package-ecosystem: "gomod" directory: "/go/" schedule: interval: "weekly" commit-message: prefix: "MINOR: [Go] " + open-pull-requests-limit: 10 - package-ecosystem: "maven" directory: "/java/" schedule: interval: "weekly" commit-message: prefix: "MINOR: [Java] " + open-pull-requests-limit: 10 - package-ecosystem: "npm" directory: "/js/" schedule: interval: "monthly" commit-message: prefix: "MINOR: [JS] " + open-pull-requests-limit: 10 - package-ecosystem: "nuget" directory: "/csharp/" schedule: interval: "weekly" commit-message: prefix: "MINOR: [C#] " + open-pull-requests-limit: 10 ignore: - dependency-name: "Microsoft.Extensions.*" update-types: diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index fe49e275d908d..36a0dc014db8d 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -32,12 +32,12 @@ env: jobs: complete: - name: AMD64 Ubuntu 22.04 Complete Documentation + name: AMD64 Debian 12 Complete Documentation runs-on: ubuntu-latest if: ${{ !contains(github.event.pull_request.title, 'WIP') }} timeout-minutes: 150 env: - UBUNTU: "22.04" + JDK: 17 steps: - name: Checkout Arrow uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0 @@ -50,8 +50,8 @@ jobs: uses: actions/cache@13aacd865c20de90d75de3b17ebe84f7a17d57d2 # v4.0.0 with: path: .docker - key: ubuntu-docs-${{ hashFiles('cpp/**') }} - restore-keys: ubuntu-docs- + key: debian-docs-${{ hashFiles('cpp/**') }} + restore-keys: debian-docs- - name: Setup Python uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: @@ -62,7 +62,8 @@ jobs: env: ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} - run: archery docker run ubuntu-docs + JDK: 17 + run: archery docker run debian-docs - name: Docker Push if: >- success() && @@ -73,4 +74,4 @@ jobs: ARCHERY_DOCKER_USER: ${{ secrets.DOCKERHUB_USER }} ARCHERY_DOCKER_PASSWORD: ${{ secrets.DOCKERHUB_TOKEN }} continue-on-error: true - run: archery docker push ubuntu-docs + run: archery docker push debian-docs diff --git a/.github/workflows/docs_light.yml b/.github/workflows/docs_light.yml index 376c87651d2d0..947e2ac21b83c 100644 --- a/.github/workflows/docs_light.yml +++ b/.github/workflows/docs_light.yml @@ -31,7 +31,7 @@ on: permissions: contents: read - + env: ARCHERY_DEBUG: 1 ARCHERY_USE_DOCKER_CLI: 1 diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index 6e09ad61480a6..f53f4aeb505d2 100644 --- 
a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -75,6 +75,11 @@ jobs: with: repository: apache/arrow-rs path: rust + - name: Checkout Arrow nanoarrow + uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0 + with: + repository: apache/arrow-nanoarrow + path: nanoarrow - name: Free up disk space run: | ci/scripts/util_free_space.sh @@ -97,6 +102,7 @@ jobs: run: > archery docker run \ -e ARCHERY_DEFAULT_BRANCH=${{ github.event.repository.default_branch }} \ + -e ARCHERY_INTEGRATION_WITH_NANOARROW=1 \ -e ARCHERY_INTEGRATION_WITH_RUST=1 \ conda-integration - name: Docker Push diff --git a/.github/workflows/matlab.yml b/.github/workflows/matlab.yml index aa3692e587961..ca8280927f4a5 100644 --- a/.github/workflows/matlab.yml +++ b/.github/workflows/matlab.yml @@ -70,7 +70,7 @@ jobs: - name: Install MATLAB uses: matlab-actions/setup-matlab@v2 with: - release: R2023a + release: R2024a - name: Install ccache run: sudo apt-get install ccache - name: Setup ccache @@ -96,10 +96,18 @@ jobs: uses: matlab-actions/run-tests@v2 with: select-by-folder: matlab/test + strict: true macos: - name: AMD64 macOS 12 MATLAB - runs-on: macos-12 + name: ${{ matrix.architecture }} macOS ${{ matrix.macos-version }} MATLAB + runs-on: macos-${{ matrix.macos-version }} if: ${{ !contains(github.event.pull_request.title, 'WIP') }} + strategy: + matrix: + include: + - architecture: AMD64 + macos-version: "12" + - architecture: ARM64 + macos-version: "14" steps: - name: Check out repository uses: actions/checkout@v4 @@ -110,7 +118,7 @@ jobs: - name: Install MATLAB uses: matlab-actions/setup-matlab@v2 with: - release: R2023a + release: R2024a - name: Install ccache run: brew install ccache - name: Setup ccache @@ -135,7 +143,8 @@ jobs: MATLABPATH: matlab/install/arrow_matlab uses: matlab-actions/run-tests@v2 with: - select-by-folder: matlab/test + select-by-folder: matlab/test + strict: true windows: name: AMD64 Windows 2022 MATLAB runs-on: windows-2022 @@ -148,7 +157,7 @@ jobs: - name: Install MATLAB uses: matlab-actions/setup-matlab@v2 with: - release: R2023a + release: R2024a - name: Download Timezone Database shell: bash run: ci/scripts/download_tz_database.sh @@ -181,4 +190,5 @@ jobs: MATLABPATH: matlab/install/arrow_matlab uses: matlab-actions/run-tests@v2 with: - select-by-folder: matlab/test + select-by-folder: matlab/test + strict: true diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 15056961f8cf4..a568f8346e7fc 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -182,6 +182,19 @@ jobs: python -m pip install \ -r python/requirements-build.txt \ -r python/requirements-test.txt + - name: Setup ccache + shell: bash + run: ci/scripts/ccache_setup.sh + - name: ccache info + id: ccache-info + shell: bash + run: echo "cache-dir=$(ccache --get-config cache_dir)" >> $GITHUB_OUTPUT + - name: Cache ccache + uses: actions/cache@v4 + with: + path: ${{ steps.ccache-info.outputs.cache-dir }} + key: python-ccache-macos-${{ matrix.macos-version }}-${{ hashFiles('cpp/**', 'python/**') }} + restore-keys: python-ccache-macos-${{ matrix.macos-version }}- - name: Build shell: bash run: | diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index 8228aaad7ce37..aba77347659cd 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -192,12 +192,11 @@ jobs: fail-fast: false matrix: config: - - { org: "rhub", image: "ubuntu-gcc12", tag: "latest", devtoolset: "" } + - { org: "rhub", image: "ubuntu-gcc12", tag: 
"latest" } env: R_ORG: ${{ matrix.config.org }} R_IMAGE: ${{ matrix.config.image }} R_TAG: ${{ matrix.config.tag }} - DEVTOOLSET_VERSION: ${{ matrix.config.devtoolset }} steps: - name: Checkout Arrow uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2e598e0a95064..7dcc1c9816d12 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -132,3 +132,15 @@ repos: ?^cpp/cmake_modules/UseCython\.cmake$| ?^cpp/src/arrow/util/config\.h\.cmake$| ) + - repo: https://github.com/sphinx-contrib/sphinx-lint + rev: v0.9.1 + hooks: + - id: sphinx-lint + files: ^docs/source + exclude: ^docs/source/python/generated + args: [ + '--enable', + 'all', + '--disable', + 'dangling-hyphen,line-too-long', + ] diff --git a/c_glib/meson.build b/c_glib/meson.build index 16a5ea7ccb432..08a9cd182e02e 100644 --- a/c_glib/meson.build +++ b/c_glib/meson.build @@ -26,8 +26,6 @@ project('arrow-glib', 'c', 'cpp', # Debian: # https://packages.debian.org/search?keywords=meson # - # * bullseye: 0.56.2 - # * bullseye-backports:1.0.0 # * bookworm: 1.0.0 # # Ubuntu: diff --git a/ci/appveyor-cpp-build.bat b/ci/appveyor-cpp-build.bat index 8cfa67c437264..f688fbb63a9ad 100644 --- a/ci/appveyor-cpp-build.bat +++ b/ci/appveyor-cpp-build.bat @@ -129,7 +129,6 @@ set PYARROW_WITH_ORC=%ARROW_ORC% set PYARROW_WITH_PARQUET=ON set PYARROW_WITH_PARQUET_ENCRYPTION=ON set PYARROW_WITH_S3=%ARROW_S3% -set PYARROW_WITH_STATIC_BOOST=ON set PYARROW_WITH_SUBSTRAIT=ON set ARROW_HOME=%CONDA_PREFIX%\Library diff --git a/ci/conda_env_sphinx.txt b/ci/conda_env_sphinx.txt index 0a356d5722c42..4665a32e24bbe 100644 --- a/ci/conda_env_sphinx.txt +++ b/ci/conda_env_sphinx.txt @@ -26,7 +26,9 @@ pydata-sphinx-theme=0.14 sphinx-autobuild sphinx-design sphinx-copybutton +sphinx-lint sphinxcontrib-jquery +sphinxcontrib-mermaid sphinx==6.2 # Requirement for doctest-cython # Needs upper pin of 0.3.0, see: diff --git a/ci/docker/debian-12-cpp.dockerfile b/ci/docker/debian-12-cpp.dockerfile index 7036ddf27d52a..d7a6f9df2c2ee 100644 --- a/ci/docker/debian-12-cpp.dockerfile +++ b/ci/docker/debian-12-cpp.dockerfile @@ -119,7 +119,6 @@ ENV ARROW_ACERO=ON \ ARROW_GANDIVA=ON \ ARROW_GCS=ON \ ARROW_HOME=/usr/local \ - ARROW_NO_DEPRECATED_API=ON \ ARROW_ORC=ON \ ARROW_PARQUET=ON \ ARROW_S3=ON \ diff --git a/ci/docker/linux-apt-docs.dockerfile b/ci/docker/linux-apt-docs.dockerfile index ec424b4e6eaa0..1c916840e071b 100644 --- a/ci/docker/linux-apt-docs.dockerfile +++ b/ci/docker/linux-apt-docs.dockerfile @@ -21,18 +21,34 @@ FROM ${base} ARG r=4.4 ARG jdk=8 -# See R install instructions at https://cloud.r-project.org/bin/linux/ubuntu/ +ENV PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium + +# See R install instructions at https://cloud.r-project.org/bin/linux/ RUN apt-get update -y && \ apt-get install -y \ - dirmngr \ apt-transport-https \ - software-properties-common && \ - wget -qO- https://cloud.r-project.org/bin/linux/ubuntu/marutter_pubkey.asc | \ - tee -a /etc/apt/trusted.gpg.d/cran_ubuntu_key.asc && \ - add-apt-repository 'deb https://cloud.r-project.org/bin/linux/ubuntu '$(lsb_release -cs)'-cran40/' && \ + dirmngr \ + gpg \ + lsb-release && \ + gpg --keyserver keyserver.ubuntu.com \ + --recv-key 95C0FAF38DB3CCAD0C080A7BDC78B2DDEABC47B7 && \ + gpg --export 95C0FAF38DB3CCAD0C080A7BDC78B2DDEABC47B7 | \ + gpg --no-default-keyring \ + --keyring /usr/share/keyrings/cran.gpg \ + --import - && \ + echo "deb [signed-by=/usr/share/keyrings/cran.gpg] 
https://cloud.r-project.org/bin/linux/$(lsb_release -is | tr 'A-Z' 'a-z') $(lsb_release -cs)-cran40/" | \ + tee /etc/apt/sources.list.d/cran.list && \ + if [ -f /etc/apt/sources.list.d/debian.sources ]; then \ + sed -i \ + -e 's/main$/main contrib non-free non-free-firmware/g' \ + /etc/apt/sources.list.d/debian.sources; \ + fi && \ + apt-get update -y && \ apt-get install -y --no-install-recommends \ autoconf-archive \ automake \ + chromium \ + chromium-sandbox \ curl \ doxygen \ gi-docgen \ @@ -48,6 +64,8 @@ RUN apt-get update -y && \ libxml2-dev \ meson \ ninja-build \ + nodejs \ + npm \ nvidia-cuda-toolkit \ openjdk-${jdk}-jdk-headless \ pandoc \ @@ -55,9 +73,12 @@ RUN apt-get update -y && \ r-base=${r}* \ rsync \ ruby-dev \ + sudo \ wget && \ apt-get clean && \ - rm -rf /var/lib/apt/lists/* + rm -rf /var/lib/apt/lists/* && \ + PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true \ + npm install -g yarn @mermaid-js/mermaid-cli ENV JAVA_HOME=/usr/lib/jvm/java-${jdk}-openjdk-amd64 @@ -68,20 +89,6 @@ RUN /arrow/ci/scripts/util_download_apache.sh \ ENV PATH=/opt/apache-maven-${maven}/bin:$PATH RUN mvn -version -ARG node=16 -RUN apt-get purge -y npm && \ - apt-get autoremove -y --purge && \ - wget -q -O - https://deb.nodesource.com/setup_${node}.x | bash - && \ - apt-get install -y nodejs && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* && \ - npm install -g yarn - -COPY docs/requirements.txt /arrow/docs/ -RUN python3 -m venv ${ARROW_PYTHON_VENV} && \ - . ${ARROW_PYTHON_VENV}/bin/activate && \ - pip install -r arrow/docs/requirements.txt - COPY c_glib/Gemfile /arrow/c_glib/ RUN gem install --no-document bundler && \ bundle install --gemfile /arrow/c_glib/Gemfile @@ -98,6 +105,17 @@ COPY r/DESCRIPTION /arrow/r/ RUN /arrow/ci/scripts/r_deps.sh /arrow && \ R -e "install.packages('pkgdown')" +RUN useradd --user-group --create-home --groups audio,video arrow +RUN echo "arrow ALL=(ALL:ALL) NOPASSWD:ALL" | \ + EDITOR=tee visudo -f /etc/sudoers.d/arrow +USER arrow + +COPY docs/requirements.txt /arrow/docs/ +RUN sudo chown -R arrow: ${ARROW_PYTHON_VENV} && \ + python3 -m venv ${ARROW_PYTHON_VENV} && \ + . 
${ARROW_PYTHON_VENV}/bin/activate && \ + pip install -r arrow/docs/requirements.txt + ENV ARROW_ACERO=ON \ ARROW_AZURE=OFF \ ARROW_BUILD_STATIC=OFF \ diff --git a/ci/docker/linux-apt-r.dockerfile b/ci/docker/linux-apt-r.dockerfile index a68354e3abf8d..630b96e1007b9 100644 --- a/ci/docker/linux-apt-r.dockerfile +++ b/ci/docker/linux-apt-r.dockerfile @@ -113,7 +113,6 @@ ENV \ ARROW_GANDIVA=OFF \ ARROW_HDFS=OFF \ ARROW_JSON=ON \ - ARROW_NO_DEPRECATED_API=ON \ ARROW_ORC=OFF \ ARROW_PARQUET=ON \ ARROW_S3=ON \ diff --git a/ci/docker/linux-r.dockerfile b/ci/docker/linux-r.dockerfile index d368a6629c587..7b7e989adc0d1 100644 --- a/ci/docker/linux-r.dockerfile +++ b/ci/docker/linux-r.dockerfile @@ -27,9 +27,6 @@ ENV R_BIN=${r_bin} ARG r_dev=FALSE ENV ARROW_R_DEV=${r_dev} -ARG devtoolset_version= -ENV DEVTOOLSET_VERSION=${devtoolset_version} - ARG r_prune_deps=FALSE ENV R_PRUNE_DEPS=${r_prune_deps} diff --git a/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile b/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile index ae2ba9421cd55..e17c0306f115d 100644 --- a/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile +++ b/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile @@ -85,7 +85,6 @@ ENV ARROW_ACERO=ON \ ARROW_HDFS=ON \ ARROW_HOME=/usr/local \ ARROW_INSTALL_NAME_RPATH=OFF \ - ARROW_NO_DEPRECATED_API=ON \ ARROW_ORC=ON \ ARROW_PARQUET=ON \ ARROW_S3=ON \ diff --git a/ci/docker/ubuntu-20.04-cpp.dockerfile b/ci/docker/ubuntu-20.04-cpp.dockerfile index 124256378b287..d78c7a99cf4d6 100644 --- a/ci/docker/ubuntu-20.04-cpp.dockerfile +++ b/ci/docker/ubuntu-20.04-cpp.dockerfile @@ -158,7 +158,6 @@ ENV absl_SOURCE=BUNDLED \ ARROW_HDFS=ON \ ARROW_HOME=/usr/local \ ARROW_INSTALL_NAME_RPATH=OFF \ - ARROW_NO_DEPRECATED_API=ON \ ARROW_ORC=ON \ ARROW_PARQUET=ON \ ARROW_S3=ON \ diff --git a/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile b/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile index dd887a6d00ceb..341d8a87e8661 100644 --- a/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile +++ b/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile @@ -85,7 +85,6 @@ ENV ARROW_ACERO=ON \ ARROW_HDFS=ON \ ARROW_HOME=/usr/local \ ARROW_INSTALL_NAME_RPATH=OFF \ - ARROW_NO_DEPRECATED_API=ON \ ARROW_ORC=ON \ ARROW_PARQUET=ON \ ARROW_S3=ON \ diff --git a/ci/docker/ubuntu-22.04-cpp.dockerfile b/ci/docker/ubuntu-22.04-cpp.dockerfile index eb189841cd344..f12e7456add8e 100644 --- a/ci/docker/ubuntu-22.04-cpp.dockerfile +++ b/ci/docker/ubuntu-22.04-cpp.dockerfile @@ -196,7 +196,6 @@ ENV absl_SOURCE=BUNDLED \ ARROW_HDFS=ON \ ARROW_HOME=/usr/local \ ARROW_INSTALL_NAME_RPATH=OFF \ - ARROW_NO_DEPRECATED_API=ON \ ARROW_ORC=ON \ ARROW_PARQUET=ON \ ARROW_S3=ON \ diff --git a/ci/docker/ubuntu-24.04-cpp.dockerfile b/ci/docker/ubuntu-24.04-cpp.dockerfile index 4a37818f94396..ecfb5e2f5096d 100644 --- a/ci/docker/ubuntu-24.04-cpp.dockerfile +++ b/ci/docker/ubuntu-24.04-cpp.dockerfile @@ -178,7 +178,6 @@ ENV ARROW_ACERO=ON \ ARROW_HDFS=ON \ ARROW_HOME=/usr/local \ ARROW_INSTALL_NAME_RPATH=OFF \ - ARROW_NO_DEPRECATED_API=ON \ ARROW_ORC=ON \ ARROW_PARQUET=ON \ ARROW_S3=ON \ diff --git a/ci/scripts/PKGBUILD b/ci/scripts/PKGBUILD index b0905886dd50f..f6bbc78be710e 100644 --- a/ci/scripts/PKGBUILD +++ b/ci/scripts/PKGBUILD @@ -18,7 +18,7 @@ _realname=arrow pkgbase=mingw-w64-${_realname} pkgname="${MINGW_PACKAGE_PREFIX}-${_realname}" -pkgver=16.0.0.9000 +pkgver=16.1.0.9000 pkgrel=8000 pkgdesc="Apache Arrow is a cross-language development platform for in-memory data (mingw-w64)" arch=("any") diff --git a/ci/scripts/c_glib_build.sh b/ci/scripts/c_glib_build.sh index 
c4d2c4fdb5617..6a6295e4ff0bd 100755 --- a/ci/scripts/c_glib_build.sh +++ b/ci/scripts/c_glib_build.sh @@ -30,9 +30,6 @@ with_doc=$([ "${BUILD_DOCS_C_GLIB}" == "ON" ] && echo "true" || echo "false") export PKG_CONFIG_PATH=${ARROW_HOME}/lib/pkgconfig -export CFLAGS="-DARROW_NO_DEPRECATED_API" -export CXXFLAGS="-DARROW_NO_DEPRECATED_API" - mkdir -p ${build_dir} # Build with Meson diff --git a/ci/scripts/cpp_build.sh b/ci/scripts/cpp_build.sh index e28ceae8801f0..a1f40fc360e2f 100755 --- a/ci/scripts/cpp_build.sh +++ b/ci/scripts/cpp_build.sh @@ -152,7 +152,6 @@ else -DARROW_JSON=${ARROW_JSON:-ON} \ -DARROW_LARGE_MEMORY_TESTS=${ARROW_LARGE_MEMORY_TESTS:-OFF} \ -DARROW_MIMALLOC=${ARROW_MIMALLOC:-OFF} \ - -DARROW_NO_DEPRECATED_API=${ARROW_NO_DEPRECATED_API:-OFF} \ -DARROW_ORC=${ARROW_ORC:-OFF} \ -DARROW_PARQUET=${ARROW_PARQUET:-OFF} \ -DARROW_RUNTIME_SIMD_LEVEL=${ARROW_RUNTIME_SIMD_LEVEL:-MAX} \ @@ -229,12 +228,17 @@ find . -name "*.o" -delete popd if [ -x "$(command -v ldconfig)" ]; then - ldconfig ${ARROW_HOME}/${CMAKE_INSTALL_LIBDIR:-lib} + if [ -x "$(command -v sudo)" ]; then + SUDO=sudo + else + SUDO= + fi + ${SUDO} ldconfig ${ARROW_HOME}/${CMAKE_INSTALL_LIBDIR:-lib} fi if [ "${ARROW_USE_CCACHE}" == "ON" ]; then - echo -e "===\n=== ccache statistics after build\n===" - ccache -sv 2>/dev/null || ccache -s + echo -e "===\n=== ccache statistics after build\n===" + ccache -sv 2>/dev/null || ccache -s fi if command -v sccache &> /dev/null; then @@ -244,6 +248,6 @@ fi if [ "${BUILD_DOCS_CPP}" == "ON" ]; then pushd ${source_dir}/apidoc - doxygen + OUTPUT_DIRECTORY=${build_dir}/apidoc doxygen popd fi diff --git a/ci/scripts/install_azurite.sh b/ci/scripts/install_azurite.sh index 2e7008360fdc3..dda5e99405b7f 100755 --- a/ci/scripts/install_azurite.sh +++ b/ci/scripts/install_azurite.sh @@ -19,17 +19,18 @@ set -e +# Pin azurite to 3.29.0 due to https://github.com/apache/arrow/issues/41505 case "$(uname)" in Darwin) - npm install -g azurite + npm install -g azurite@v3.29.0 which azurite ;; MINGW*) choco install nodejs.install - npm install -g azurite + npm install -g azurite@v3.29.0 ;; Linux) - npm install -g azurite + npm install -g azurite@v3.29.0 which azurite ;; esac diff --git a/ci/scripts/integration_arrow.sh b/ci/scripts/integration_arrow.sh index a5a012ad2c5c4..2eb58e8dc75ec 100755 --- a/ci/scripts/integration_arrow.sh +++ b/ci/scripts/integration_arrow.sh @@ -40,6 +40,8 @@ if [ "${ARROW_INTEGRATION_JAVA}" == "ON" ]; then pip install jpype1 fi +export ARROW_BUILD_ROOT=${build_dir} + # Get more detailed context on crashes export PYTHONFAULTHANDLER=1 diff --git a/ci/scripts/integration_arrow_build.sh b/ci/scripts/integration_arrow_build.sh index e5c31527aedff..9b54049a2b803 100755 --- a/ci/scripts/integration_arrow_build.sh +++ b/ci/scripts/integration_arrow_build.sh @@ -30,6 +30,8 @@ build_dir=${2} ${arrow_dir}/ci/scripts/rust_build.sh ${arrow_dir} ${build_dir} +${arrow_dir}/ci/scripts/nanoarrow_build.sh ${arrow_dir} ${build_dir} + if [ "${ARROW_INTEGRATION_CPP}" == "ON" ]; then ${arrow_dir}/ci/scripts/cpp_build.sh ${arrow_dir} ${build_dir} fi diff --git a/ci/scripts/java_build.sh b/ci/scripts/java_build.sh index 2103f0329baec..0fa1edab429c0 100755 --- a/ci/scripts/java_build.sh +++ b/ci/scripts/java_build.sh @@ -75,7 +75,16 @@ fi # Use `2 * ncores` threads mvn="${mvn} -T 2C" -pushd ${source_dir} +# https://github.com/apache/arrow/issues/41429 +# TODO: We want to out-of-source build. This is a workaround. 
We copy +# all needed files to the build directory from the source directory +# and build in the build directory. +mkdir -p ${build_dir} +rm -rf ${build_dir}/format +cp -aL ${arrow_dir}/format ${build_dir}/ +rm -rf ${build_dir}/java +cp -aL ${source_dir} ${build_dir}/ +pushd ${build_dir}/java if [ "${ARROW_JAVA_SHADE_FLATBUFFERS}" == "ON" ]; then mvn="${mvn} -Pshade-flatbuffers" @@ -95,7 +104,7 @@ if [ "${BUILD_DOCS_JAVA}" == "ON" ]; then # HTTP pooling is turned of to avoid download issues https://issues.apache.org/jira/browse/ARROW-11633 mkdir -p ${build_dir}/docs/java/reference ${mvn} -Dcheckstyle.skip=true -Dhttp.keepAlive=false -Dmaven.wagon.http.pool=false clean install site - rsync -a ${arrow_dir}/java/target/site/apidocs/ ${build_dir}/docs/java/reference + rsync -a target/site/apidocs/ ${build_dir}/docs/java/reference fi popd diff --git a/ci/scripts/java_cdata_integration.sh b/ci/scripts/java_cdata_integration.sh index 86ea7cf155350..0ee5d3026aa09 100755 --- a/ci/scripts/java_cdata_integration.sh +++ b/ci/scripts/java_cdata_integration.sh @@ -20,9 +20,9 @@ set -ex arrow_dir=${1} -export ARROW_SOURCE_DIR=${arrow_dir} +build_dir=${2} -pushd ${arrow_dir}/java/c/src/test/python +pushd ${build_dir}/java/c/src/test/python python integration_tests.py diff --git a/ci/scripts/java_jni_manylinux_build.sh b/ci/scripts/java_jni_manylinux_build.sh index da4987d307ce4..6f3769751af42 100755 --- a/ci/scripts/java_jni_manylinux_build.sh +++ b/ci/scripts/java_jni_manylinux_build.sh @@ -58,7 +58,7 @@ export ARROW_ORC : ${VCPKG_ROOT:=/opt/vcpkg} : ${VCPKG_FEATURE_FLAGS:=-manifests} : ${VCPKG_TARGET_TRIPLET:=${VCPKG_DEFAULT_TRIPLET:-x64-linux-static-${CMAKE_BUILD_TYPE}}} -: ${GANDIVA_CXX_FLAGS:=-isystem;${devtoolset_include_cpp};-isystem;${devtoolset_include_cpp}/x86_64-redhat-linux;-isystem;-lpthread} +: ${GANDIVA_CXX_FLAGS:=-isystem;${devtoolset_include_cpp};-isystem;${devtoolset_include_cpp}/x86_64-redhat-linux;-lpthread} if [ "${ARROW_USE_CCACHE}" == "ON" ]; then echo "=== ccache statistics before build ===" diff --git a/ci/scripts/js_build.sh b/ci/scripts/js_build.sh index d61f74f0b7ca1..196539ee0f101 100755 --- a/ci/scripts/js_build.sh +++ b/ci/scripts/js_build.sh @@ -25,7 +25,16 @@ build_dir=${2} : ${BUILD_DOCS_JS:=OFF} -pushd ${source_dir} +# https://github.com/apache/arrow/issues/41429 +# TODO: We want to out-of-source build. This is a workaround. We copy +# all needed files to the build directory from the source directory +# and build in the build directory. +rm -rf ${build_dir}/js +mkdir -p ${build_dir} +cp -aL ${arrow_dir}/LICENSE.txt ${build_dir}/ +cp -aL ${arrow_dir}/NOTICE.txt ${build_dir}/ +cp -aL ${source_dir} ${build_dir}/js +pushd ${build_dir}/js yarn --immutable yarn lint:ci @@ -34,18 +43,18 @@ yarn build if [ "${BUILD_DOCS_JS}" == "ON" ]; then # If apache or upstream are defined use those as remote. # Otherwise use origin which could be a fork on PRs. 
- if [ "$(git config --get remote.apache.url)" == "git@github.com:apache/arrow.git" ]; then + if [ "$(git -C ${arrow_dir} config --get remote.apache.url)" == "git@github.com:apache/arrow.git" ]; then yarn doc --gitRemote apache - elif [[ "$(git config --get remote.upstream.url)" =~ "https://github.com/apache/arrow" ]]; then + elif [[ "$(git -C ${arrow_dir}config --get remote.upstream.url)" =~ "https://github.com/apache/arrow" ]]; then yarn doc --gitRemote upstream - elif [[ "$(basename -s .git $(git config --get remote.origin.url))" == "arrow" ]]; then + elif [[ "$(basename -s .git $(git -C ${arrow_dir} config --get remote.origin.url))" == "arrow" ]]; then yarn doc else echo "Failed to build docs because the remote is not set correctly. Please set the origin or upstream remote to https://github.com/apache/arrow.git or the apache remote to git@github.com:apache/arrow.git." exit 0 fi mkdir -p ${build_dir}/docs/js - rsync -a ${arrow_dir}/js/doc/ ${build_dir}/docs/js + rsync -a doc/ ${build_dir}/docs/js fi popd diff --git a/ci/scripts/js_test.sh b/ci/scripts/js_test.sh index 40de974ede161..863b1c3d34613 100755 --- a/ci/scripts/js_test.sh +++ b/ci/scripts/js_test.sh @@ -20,8 +20,9 @@ set -ex source_dir=${1}/js +build_dir=${2}/js -pushd ${source_dir} +pushd ${build_dir} yarn lint yarn test diff --git a/ci/scripts/nanoarrow_build.sh b/ci/scripts/nanoarrow_build.sh new file mode 100755 index 0000000000000..1612b9a2d0102 --- /dev/null +++ b/ci/scripts/nanoarrow_build.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -e + +arrow_dir=${1} +source_dir=${1}/nanoarrow +build_dir=${2}/nanoarrow + +# This file is used to build the nanoarrow binaries needed for the archery +# integration tests. Testing of the nanoarrow implementation in normal CI is handled +# by github workflows in the arrow-nanoarrow repository. + +if [ "${ARCHERY_INTEGRATION_WITH_NANOARROW}" -eq "0" ]; then + echo "=====================================================================" + echo "Not building nanoarrow" + echo "=====================================================================" + exit 0; +elif [ ! -d "${source_dir}" ]; then + echo "=====================================================================" + echo "The nanoarrow source is missing. Please clone the arrow-nanoarrow repository" + echo "to arrow/nanoarrow before running the integration tests:" + echo " git clone https://github.com/apache/arrow-nanoarrow.git path/to/arrow/nanoarrow" + echo "=====================================================================" + exit 1; +fi + +set -x + +mkdir -p ${build_dir} +pushd ${build_dir} + +cmake ${source_dir} -DNANOARROW_BUILD_INTEGRATION_TESTS=ON +cmake --build . 
+ +popd diff --git a/ci/scripts/python_build.sh b/ci/scripts/python_build.sh index 99153cdf75539..9455baf353633 100755 --- a/ci/scripts/python_build.sh +++ b/ci/scripts/python_build.sh @@ -78,17 +78,42 @@ export PYARROW_PARALLEL=${n_jobs} export CMAKE_PREFIX_PATH export LD_LIBRARY_PATH=${ARROW_HOME}/lib:${LD_LIBRARY_PATH} -pushd ${source_dir} +# https://github.com/apache/arrow/issues/41429 +# TODO: We want to out-of-source build. This is a workaround. We copy +# all needed files to the build directory from the source directory +# and build in the build directory. +rm -rf ${python_build_dir} +cp -aL ${source_dir} ${python_build_dir} +pushd ${python_build_dir} # - Cannot call setup.py as it may install in the wrong directory # on Debian/Ubuntu (ARROW-15243). # - Cannot use build isolation as we want to use specific dependency versions # (e.g. Numpy, Pandas) on some CI jobs. ${PYTHON:-python} -m pip install --no-deps --no-build-isolation -vv . -# Remove build artifacts from source directory -find build/ -user root -delete popd if [ "${BUILD_DOCS_PYTHON}" == "ON" ]; then + # https://github.com/apache/arrow/issues/41429 + # TODO: We want to out-of-source build. This is a workaround. + # + # Copy docs/source because the "autosummary_generate = True" + # configuration generates files to docs/source/python/generated/. + rm -rf ${python_build_dir}/docs/source + mkdir -p ${python_build_dir}/docs + cp -a ${arrow_dir}/docs/source ${python_build_dir}/docs/ + rm -rf ${python_build_dir}/format + cp -a ${arrow_dir}/format ${python_build_dir}/ + rm -rf ${python_build_dir}/cpp/examples + mkdir -p ${python_build_dir}/cpp + cp -a ${arrow_dir}/cpp/examples ${python_build_dir}/cpp/ + rm -rf ${python_build_dir}/ci + cp -a ${arrow_dir}/ci/ ${python_build_dir}/ ncpus=$(python -c "import os; print(os.cpu_count())") - sphinx-build -b html -j ${ncpus} ${arrow_dir}/docs/source ${build_dir}/docs + export ARROW_CPP_DOXYGEN_XML=${build_dir}/cpp/apidoc/xml + pushd ${build_dir} + sphinx-build \ + -b html \ + ${python_build_dir}/docs/source \ + ${build_dir}/docs + popd fi diff --git a/ci/scripts/r_build.sh b/ci/scripts/r_build.sh index 38b54e4434036..f4dc5a5781c6e 100755 --- a/ci/scripts/r_build.sh +++ b/ci/scripts/r_build.sh @@ -24,15 +24,29 @@ build_dir=${2} : ${BUILD_DOCS_R:=OFF} -pushd ${source_dir} +# https://github.com/apache/arrow/issues/41429 +# TODO: We want to out-of-source build. This is a workaround. We copy +# all needed files to the build directory from the source directory +# and build in the build directory. +rm -rf ${build_dir}/r +cp -aL ${source_dir} ${build_dir}/r +pushd ${build_dir}/r # build first so that any stray compiled files in r/src are ignored ${R_BIN} CMD build . 
-${R_BIN} CMD INSTALL ${INSTALL_ARGS} arrow*.tar.gz +if [ -x "$(command -v sudo)" ]; then + SUDO=sudo +else + SUDO= +fi +${SUDO} \ + env \ + PKG_CONFIG_PATH=${ARROW_HOME}/lib/pkgconfig:${PKG_CONFIG_PATH} \ + ${R_BIN} CMD INSTALL ${INSTALL_ARGS} arrow*.tar.gz if [ "${BUILD_DOCS_R}" == "ON" ]; then ${R_BIN} -e "pkgdown::build_site(install = FALSE)" - rsync -a ${source_dir}/docs/ ${build_dir}/docs/r + rsync -a docs/ ${build_dir}/docs/r fi popd diff --git a/ci/scripts/r_docker_configure.sh b/ci/scripts/r_docker_configure.sh index 52db2e6df6611..8a962fe576cbb 100755 --- a/ci/scripts/r_docker_configure.sh +++ b/ci/scripts/r_docker_configure.sh @@ -67,26 +67,6 @@ sloppiness = include_file_ctime hash_dir = false" >> ~/.ccache/ccache.conf fi -# Special hacking to try to reproduce quirks on centos using non-default build -# tooling. -if [[ -n "$DEVTOOLSET_VERSION" ]]; then - $PACKAGE_MANAGER install -y centos-release-scl - $PACKAGE_MANAGER install -y "devtoolset-$DEVTOOLSET_VERSION" - - # Enable devtoolset here so that `which gcc` finds the right compiler below - source /opt/rh/devtoolset-${DEVTOOLSET_VERSION}/enable - - # Build images which require the devtoolset don't have CXX17 variables - # set as the system compiler doesn't support C++17 - if [ ! "`{R_BIN} CMD config CXX17`" ]; then - mkdir -p ~/.R - echo "CC = $(which gcc) -fPIC" >> ~/.R/Makevars - echo "CXX17 = $(which g++) -fPIC" >> ~/.R/Makevars - echo "CXX17STD = -std=c++17" >> ~/.R/Makevars - echo "CXX17FLAGS = ${CXX11FLAGS}" >> ~/.R/Makevars - fi -fi - if [ -f "${ARROW_SOURCE_HOME}/ci/scripts/r_install_system_dependencies.sh" ]; then "${ARROW_SOURCE_HOME}/ci/scripts/r_install_system_dependencies.sh" fi diff --git a/ci/scripts/r_sanitize.sh b/ci/scripts/r_sanitize.sh index f7ed07f0c864b..fb3e9a5836387 100755 --- a/ci/scripts/r_sanitize.sh +++ b/ci/scripts/r_sanitize.sh @@ -46,6 +46,8 @@ unset ARROW_R_DEV export ARROW_R_VERBOSE_TEST=TRUE export UBSAN_OPTIONS="print_stacktrace=1,suppressions=/arrow/r/tools/ubsan.supp" +# From the old rhub image https://github.com/r-hub/rhub-linux-builders/blob/master/fedora-clang-devel-san/Dockerfile +export ASAN_OPTIONS="alloc_dealloc_mismatch=0:detect_leaks=0:detect_odr_violation=0" # run tests pushd tests diff --git a/ci/scripts/r_test.sh b/ci/scripts/r_test.sh index 95a49ee83a79b..e13da45e2d296 100755 --- a/ci/scripts/r_test.sh +++ b/ci/scripts/r_test.sh @@ -48,9 +48,6 @@ if [ "$ARROW_USE_PKG_CONFIG" != "false" ]; then fi export _R_CHECK_COMPILATION_FLAGS_KNOWN_="${_R_CHECK_COMPILATION_FLAGS_KNOWN_} ${ARROW_R_CXXFLAGS}" -# These should generally be picked up, but are slightly wrong in rhub's containers it appears -# https://github.com/r-hub/containers/pull/63 -export _R_CHECK_COMPILATION_FLAGS_KNOWN_="${_R_CHECK_COMPILATION_FLAGS_KNOWN_} -Wno-parentheses -Werror=format-security -Wp,-D_FORTIFY_SOURCE=3" if [ "$ARROW_R_DEV" = "TRUE" ]; then # These are sometimes used in the Arrow C++ build and are not a problem diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index df83f56dd2f70..679842c31e0b1 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -431,10 +431,6 @@ endif() # Compiler flags # -if(ARROW_NO_DEPRECATED_API) - add_definitions(-DARROW_NO_DEPRECATED_API) -endif() - if(ARROW_EXTRA_ERROR_CONTEXT) add_definitions(-DARROW_EXTRA_ERROR_CONTEXT) endif() diff --git a/cpp/cmake_modules/DefineOptions.cmake b/cpp/cmake_modules/DefineOptions.cmake index dc0e5da63adb7..41466a1c22404 100644 --- a/cpp/cmake_modules/DefineOptions.cmake +++ b/cpp/cmake_modules/DefineOptions.cmake @@ 
-158,8 +158,6 @@ if(ARROW_DEFINE_OPTIONS) define_option_string(ARROW_GIT_DESCRIPTION "The Arrow git commit description (if any)" "") - define_option(ARROW_NO_DEPRECATED_API "Exclude deprecated APIs from build" OFF) - define_option(ARROW_POSITION_INDEPENDENT_CODE "Whether to create position-independent target" ON) diff --git a/cpp/cmake_modules/FindProtobufAlt.cmake b/cpp/cmake_modules/FindProtobufAlt.cmake index f343b42f2b762..703e05c4731b6 100644 --- a/cpp/cmake_modules/FindProtobufAlt.cmake +++ b/cpp/cmake_modules/FindProtobufAlt.cmake @@ -31,6 +31,11 @@ endif() find_package(protobuf CONFIG ${find_package_args}) set(ProtobufAlt_FOUND ${protobuf_FOUND}) if(ProtobufAlt_FOUND) + if(Protobuf_PROTOC_EXECUTABLE) + # work around https://github.com/protocolbuffers/protobuf/issues/14576 + set_target_properties(protobuf::protoc PROPERTIES IMPORTED_LOCATION_RELEASE + "${Protobuf_PROTOC_EXECUTABLE}") + endif() set(ProtobufAlt_VERSION ${protobuf_VERSION}) set(ProtobufAlt_VERSION_MAJOR ${protobuf_VERSION_MAJOR}) set(ProtobufAlt_VERSION_MINOR ${protobuf_VERSION_MINOR}) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index ddea1c399cbba..c24442dcb8749 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -1263,7 +1263,7 @@ endif() # - S3FS and Flight benchmarks need Boost at runtime. if(ARROW_BUILD_INTEGRATION OR ARROW_BUILD_TESTS - OR (ARROW_FLIGHT AND ARROW_BUILD_BENCHMARKS) + OR (ARROW_FLIGHT AND (ARROW_TESTING OR ARROW_BUILD_BENCHMARKS)) OR (ARROW_S3 AND ARROW_BUILD_BENCHMARKS)) set(ARROW_USE_BOOST TRUE) set(ARROW_BOOST_REQUIRE_LIBRARY TRUE) @@ -4522,7 +4522,7 @@ macro(build_orc) "-DSNAPPY_HOME=${ORC_SNAPPY_ROOT}" "-DSNAPPY_LIBRARY=$" "-DLZ4_LIBRARY=$" - "-DLZ4_STATIC_LIBRARY=$" + "-DLZ4_STATIC_LIB=$" "-DLZ4_INCLUDE_DIR=${ORC_LZ4_ROOT}/include" "-DSNAPPY_INCLUDE_DIR=${ORC_SNAPPY_INCLUDE_DIR}" "-DZSTD_HOME=${ORC_ZSTD_ROOT}" diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 2ef82dd614f84..0f4824ec99daa 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -501,6 +501,7 @@ set(ARROW_UTIL_SRCS util/decimal.cc util/delimiting.cc util/dict_util.cc + util/fixed_width_internal.cc util/float16.cc util/formatting.cc util/future.cc @@ -715,7 +716,8 @@ set(ARROW_COMPUTE_SRCS compute/row/compare_internal.cc compute/row/grouper.cc compute/row/row_internal.cc - compute/util.cc) + compute/util.cc + compute/util_internal.cc) append_runtime_avx2_src(ARROW_COMPUTE_SRCS compute/key_hash_internal_avx2.cc) append_runtime_avx2_bmi2_src(ARROW_COMPUTE_SRCS compute/key_map_internal_avx2.cc) diff --git a/cpp/src/arrow/acero/asof_join_node.cc b/cpp/src/arrow/acero/asof_join_node.cc index 48cc83dd3d6a9..1d94467df9ee2 100644 --- a/cpp/src/arrow/acero/asof_join_node.cc +++ b/cpp/src/arrow/acero/asof_join_node.cc @@ -548,8 +548,10 @@ class InputState { // true when the queue is empty and, when memo may have future entries (the case of a // positive tolerance), when the memo is empty. // used when checking whether RHS is up to date with LHS. - bool CurrentEmpty() const { - return memo_.no_future_ ? Empty() : memo_.times_.empty() && Empty(); + // NOTE: The emptiness must be decided by a single call to Empty() in caller, due to the + // potential race with Push(), see GH-41614. + bool CurrentEmpty(bool empty) const { + return memo_.no_future_ ? 
empty : (memo_.times_.empty() && empty); } // in case memo may not have future entries (the case of a non-positive tolerance), @@ -650,13 +652,15 @@ class InputState { // timestamp, update latest_time and latest_ref_row to the value that immediately pass // the horizon. Update the memo-store with any entries or future entries so observed. // Returns true if updates were made, false if not. - Result AdvanceAndMemoize(OnType ts) { + // NOTE: The emptiness must be decided by a single call to Empty() in caller, due to the + // potential race with Push(), see GH-41614. + Result AdvanceAndMemoize(OnType ts, bool empty) { // Advance the right side row index until we reach the latest right row (for each key) // for the given left timestamp. DEBUG_SYNC(node_, "Advancing input ", index_, DEBUG_MANIP(std::endl)); // Check if already updated for TS (or if there is no latest) - if (Empty()) { // can't advance if empty and no future entries + if (empty) { // can't advance if empty and no future entries return memo_.no_future_ ? false : memo_.RemoveEntriesWithLesserTime(ts); } @@ -918,34 +922,46 @@ class CompositeTableBuilder { // guaranteeing this probability is below 1 in a billion. The fix is 128-bit hashing. // See ARROW-17653 class AsofJoinNode : public ExecNode { - // Advances the RHS as far as possible to be up to date for the current LHS timestamp - Result UpdateRhs() { + // A simple wrapper for the result of a single call to UpdateRhs(), identifying: + // 1) If any RHS has advanced. + // 2) If all RHS are up to date with LHS. + struct RhsUpdateState { + bool any_advanced; + bool all_up_to_date_with_lhs; + }; + // Advances the RHS as far as possible to be up to date for the current LHS timestamp, + // and checks if all RHS are up to date with LHS. The reason they have to be performed + // together is that they both depend on the emptiness of the RHS, which can be changed + // by Push() executing in another thread. + Result UpdateRhs() { auto& lhs = *state_.at(0); auto lhs_latest_time = lhs.GetLatestTime(); - bool any_updated = false; - for (size_t i = 1; i < state_.size(); ++i) { - ARROW_ASSIGN_OR_RAISE(bool advanced, state_[i]->AdvanceAndMemoize(lhs_latest_time)); - any_updated |= advanced; - } - return any_updated; - } - - // Returns false if RHS not up to date for LHS - bool IsUpToDateWithLhsRow() const { - auto& lhs = *state_[0]; - if (lhs.Empty()) return false; // can't proceed if nothing on the LHS - OnType lhs_ts = lhs.GetLatestTime(); + RhsUpdateState update_state{/*any_advanced=*/false, /*all_up_to_date_with_lhs=*/true}; for (size_t i = 1; i < state_.size(); ++i) { auto& rhs = *state_[i]; - if (!rhs.Finished()) { + + // Obtain RHS emptiness once for subsequent AdvanceAndMemoize() and CurrentEmpty(). + bool rhs_empty = rhs.Empty(); + // Obtain RHS current time here because AdvanceAndMemoize() can change the + // emptiness. + OnType rhs_current_time = rhs_empty ? 
OnType{} : rhs.GetLatestTime(); + + ARROW_ASSIGN_OR_RAISE(bool advanced, + rhs.AdvanceAndMemoize(lhs_latest_time, rhs_empty)); + update_state.any_advanced |= advanced; + + if (update_state.all_up_to_date_with_lhs && !rhs.Finished()) { // If RHS is finished, then we know it's up to date - if (rhs.CurrentEmpty()) - return false; // RHS isn't finished, but is empty --> not up to date - if (lhs_ts > rhs.GetCurrentTime()) - return false; // RHS isn't up to date (and not finished) + if (rhs.CurrentEmpty(rhs_empty)) { + // RHS isn't finished, but is empty --> not up to date + update_state.all_up_to_date_with_lhs = false; + } else if (lhs_latest_time > rhs_current_time) { + // RHS isn't up to date (and not finished) + update_state.all_up_to_date_with_lhs = false; + } } } - return true; + return update_state; } Result> ProcessInner() { @@ -963,20 +979,19 @@ class AsofJoinNode : public ExecNode { // If LHS is finished or empty then there's nothing we can do here if (lhs.Finished() || lhs.Empty()) break; - // Advance each of the RHS as far as possible to be up to date for the LHS timestamp - ARROW_ASSIGN_OR_RAISE(bool any_rhs_advanced, UpdateRhs()); + ARROW_ASSIGN_OR_RAISE(auto rhs_update_state, UpdateRhs()); // If we have received enough inputs to produce the next output batch // (decided by IsUpToDateWithLhsRow), we will perform the join and // materialize the output batch. The join is done by advancing through // the LHS and adding joined row to rows_ (done by Emplace). Finally, // input batches that are no longer needed are removed to free up memory. - if (IsUpToDateWithLhsRow()) { + if (rhs_update_state.all_up_to_date_with_lhs) { dst.Emplace(state_, tolerance_); ARROW_ASSIGN_OR_RAISE(bool advanced, lhs.Advance()); if (!advanced) break; // if we can't advance LHS, we're done for this batch } else { - if (!any_rhs_advanced) break; // need to wait for new data + if (!rhs_update_state.any_advanced) break; // need to wait for new data } } diff --git a/cpp/src/arrow/acero/asof_join_node_test.cc b/cpp/src/arrow/acero/asof_join_node_test.cc index d95d2aaad3643..051e280a4c53c 100644 --- a/cpp/src/arrow/acero/asof_join_node_test.cc +++ b/cpp/src/arrow/acero/asof_join_node_test.cc @@ -1678,5 +1678,59 @@ TEST(AsofJoinTest, BackpressureWithBatchesGen) { /*slow_r0=*/false); } +// Reproduction of GH-40675: A logical race between Process() and Push() that can be more +// easily observed with single small batch. 
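// Illustrative sketch only (not part of this patch): the essence of the race fix
// above is that the consumer thread reads the queue's emptiness exactly once per
// iteration and feeds that single snapshot to every subsequent decision, so a
// concurrent Push() cannot make two checks disagree mid-iteration. The names
// below (InputQueue, ProcessOnce) are hypothetical, not Arrow APIs.
#include <atomic>
#include <cstddef>

class InputQueue {
 public:
  bool Empty() const { return size_.load(std::memory_order_acquire) == 0; }
  void Push() { size_.fetch_add(1, std::memory_order_release); }

  // Both decisions consume the caller's snapshot instead of re-reading Empty().
  bool CanAdvance(bool empty_snapshot) const { return !empty_snapshot; }
  bool UpToDate(bool empty_snapshot) const { return !empty_snapshot; }

 private:
  std::atomic<std::size_t> size_{0};
};

bool ProcessOnce(const InputQueue& queue) {
  const bool empty = queue.Empty();  // single load; reused by both checks below
  return queue.CanAdvance(empty) && queue.UpToDate(empty);
}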
+TEST(AsofJoinTest, RhsEmptinessRace) { + auto left_batch = ExecBatchFromJSON( + {int64(), utf8()}, R"([[1, "a"], [1, "b"], [5, "a"], [6, "b"], [7, "f"]])"); + auto right_batch = ExecBatchFromJSON( + {int64(), utf8(), float64()}, R"([[2, "a", 1.0], [9, "b", 3.0], [15, "g", 5.0]])"); + + Declaration left{ + "exec_batch_source", + ExecBatchSourceNodeOptions(schema({field("colA", int64()), field("col2", utf8())}), + {std::move(left_batch)})}; + Declaration right{ + "exec_batch_source", + ExecBatchSourceNodeOptions(schema({field("colB", int64()), field("col3", utf8()), + field("colC", float64())}), + {std::move(right_batch)})}; + AsofJoinNodeOptions asof_join_opts({{{"colA"}, {{"col2"}}}, {{"colB"}, {{"col3"}}}}, 1); + Declaration asof_join{ + "asofjoin", {std::move(left), std::move(right)}, std::move(asof_join_opts)}; + + ASSERT_OK_AND_ASSIGN(auto result, DeclarationToExecBatches(std::move(asof_join))); + + auto exp_batch = ExecBatchFromJSON( + {int64(), utf8(), float64()}, + R"([[1, "a", 1.0], [1, "b", null], [5, "a", null], [6, "b", null], [7, "f", null]])"); + AssertExecBatchesEqualIgnoringOrder(result.schema, {exp_batch}, result.batches); +} + +// Reproduction of GH-41149: Another case of the same root cause as GH-40675, but with +// empty "by" columns. +TEST(AsofJoinTest, RhsEmptinessRaceEmptyBy) { + auto left_batch = ExecBatchFromJSON({int64()}, R"([[1], [2], [3]])"); + auto right_batch = + ExecBatchFromJSON({utf8(), int64()}, R"([["Z", 2], ["B", 3], ["A", 4]])"); + + Declaration left{"exec_batch_source", + ExecBatchSourceNodeOptions(schema({field("on", int64())}), + {std::move(left_batch)})}; + Declaration right{ + "exec_batch_source", + ExecBatchSourceNodeOptions(schema({field("colVals", utf8()), field("on", int64())}), + {std::move(right_batch)})}; + AsofJoinNodeOptions asof_join_opts({{{"on"}, {}}, {{"on"}, {}}}, 1); + Declaration asof_join{ + "asofjoin", {std::move(left), std::move(right)}, std::move(asof_join_opts)}; + + ASSERT_OK_AND_ASSIGN(auto result, DeclarationToExecBatches(std::move(asof_join))); + + auto exp_batch = + ExecBatchFromJSON({int64(), utf8()}, R"([[1, "Z"], [2, "Z"], [3, "B"]])"); + AssertExecBatchesEqualIgnoringOrder(result.schema, {exp_batch}, result.batches); +} + } // namespace acero } // namespace arrow diff --git a/cpp/src/arrow/acero/exec_plan.cc b/cpp/src/arrow/acero/exec_plan.cc index 97119726d4b17..d9fb1942fccd8 100644 --- a/cpp/src/arrow/acero/exec_plan.cc +++ b/cpp/src/arrow/acero/exec_plan.cc @@ -128,7 +128,7 @@ struct ExecPlanImpl : public ExecPlan { Future<> scheduler_finished = arrow::util::AsyncTaskScheduler::Make( [this](arrow::util::AsyncTaskScheduler* async_scheduler) { QueryContext* ctx = query_context(); - RETURN_NOT_OK(ctx->Init(ctx->max_concurrency(), async_scheduler)); + RETURN_NOT_OK(ctx->Init(async_scheduler)); #ifdef ARROW_WITH_OPENTELEMETRY if (HasMetadata()) { diff --git a/cpp/src/arrow/acero/hash_aggregate_test.cc b/cpp/src/arrow/acero/hash_aggregate_test.cc index 2626fd50379dd..d529f443319b9 100644 --- a/cpp/src/arrow/acero/hash_aggregate_test.cc +++ b/cpp/src/arrow/acero/hash_aggregate_test.cc @@ -592,6 +592,12 @@ void TestSegments(std::unique_ptr& segmenter, const ExecSpan& batc ASSERT_EQ(expected_segment, segment); offset = segment.offset + segment.length; } + // Assert next is the last (empty) segment. 
+ ASSERT_OK_AND_ASSIGN(auto segment, segmenter->GetNextSegment(batch, offset)); + ASSERT_GE(segment.offset, batch.length); + ASSERT_EQ(segment.length, 0); + ASSERT_TRUE(segment.is_open); + ASSERT_TRUE(segment.extends); } Result> MakeGrouper(const std::vector& key_types) { @@ -682,48 +688,142 @@ TEST(RowSegmenter, Basics) { } TEST(RowSegmenter, NonOrdered) { - std::vector types = {int32()}; - auto batch = ExecBatchFromJSON(types, "[[1], [1], [2], [1], [2]]"); - ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types)); - TestSegments(segmenter, ExecSpan(batch), - {{0, 2, false, true}, - {2, 1, false, false}, - {3, 1, false, false}, - {4, 1, true, false}, - {5, 0, true, true}}); + { + std::vector types = {int32()}; + auto batch = ExecBatchFromJSON(types, "[[1], [1], [2], [1], [2]]"); + ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types)); + TestSegments(segmenter, ExecSpan(batch), + {{0, 2, false, true}, + {2, 1, false, false}, + {3, 1, false, false}, + {4, 1, true, false}, + {5, 0, true, true}}); + } + { + std::vector types = {int32(), int32()}; + auto batch = ExecBatchFromJSON(types, "[[1, 1], [1, 1], [2, 2], [1, 2], [2, 2]]"); + ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types)); + TestSegments(segmenter, ExecSpan(batch), + {{0, 2, false, true}, + {2, 1, false, false}, + {3, 1, false, false}, + {4, 1, true, false}, + {5, 0, true, true}}); + } } TEST(RowSegmenter, EmptyBatches) { - std::vector types = {int32()}; - std::vector batches = { - ExecBatchFromJSON(types, "[]"), ExecBatchFromJSON(types, "[]"), - ExecBatchFromJSON(types, "[[1]]"), ExecBatchFromJSON(types, "[]"), - ExecBatchFromJSON(types, "[[1]]"), ExecBatchFromJSON(types, "[]"), - ExecBatchFromJSON(types, "[[2], [2]]"), ExecBatchFromJSON(types, "[]"), - }; - ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types)); - TestSegments(segmenter, ExecSpan(batches[0]), {}); - TestSegments(segmenter, ExecSpan(batches[1]), {}); - TestSegments(segmenter, ExecSpan(batches[2]), {{0, 1, true, true}}); - TestSegments(segmenter, ExecSpan(batches[3]), {}); - TestSegments(segmenter, ExecSpan(batches[4]), {{0, 1, true, true}}); - TestSegments(segmenter, ExecSpan(batches[5]), {}); - TestSegments(segmenter, ExecSpan(batches[6]), {{0, 2, true, false}}); - TestSegments(segmenter, ExecSpan(batches[7]), {}); + { + std::vector types = {int32()}; + std::vector batches = { + ExecBatchFromJSON(types, "[]"), ExecBatchFromJSON(types, "[]"), + ExecBatchFromJSON(types, "[[1]]"), ExecBatchFromJSON(types, "[]"), + ExecBatchFromJSON(types, "[[1]]"), ExecBatchFromJSON(types, "[]"), + ExecBatchFromJSON(types, "[[2], [2]]"), ExecBatchFromJSON(types, "[]"), + }; + ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types)); + TestSegments(segmenter, ExecSpan(batches[0]), {}); + TestSegments(segmenter, ExecSpan(batches[1]), {}); + TestSegments(segmenter, ExecSpan(batches[2]), {{0, 1, true, true}}); + TestSegments(segmenter, ExecSpan(batches[3]), {}); + TestSegments(segmenter, ExecSpan(batches[4]), {{0, 1, true, true}}); + TestSegments(segmenter, ExecSpan(batches[5]), {}); + TestSegments(segmenter, ExecSpan(batches[6]), {{0, 2, true, false}}); + TestSegments(segmenter, ExecSpan(batches[7]), {}); + } + { + std::vector types = {int32(), int32()}; + std::vector batches = { + ExecBatchFromJSON(types, "[]"), + ExecBatchFromJSON(types, "[]"), + ExecBatchFromJSON(types, "[[1, 1]]"), + ExecBatchFromJSON(types, "[]"), + ExecBatchFromJSON(types, "[[1, 1]]"), + ExecBatchFromJSON(types, "[]"), + ExecBatchFromJSON(types, "[[2, 2], [2, 2]]"), + 
ExecBatchFromJSON(types, "[]"), + }; + ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types)); + TestSegments(segmenter, ExecSpan(batches[0]), {}); + TestSegments(segmenter, ExecSpan(batches[1]), {}); + TestSegments(segmenter, ExecSpan(batches[2]), {{0, 1, true, true}}); + TestSegments(segmenter, ExecSpan(batches[3]), {}); + TestSegments(segmenter, ExecSpan(batches[4]), {{0, 1, true, true}}); + TestSegments(segmenter, ExecSpan(batches[5]), {}); + TestSegments(segmenter, ExecSpan(batches[6]), {{0, 2, true, false}}); + TestSegments(segmenter, ExecSpan(batches[7]), {}); + } } TEST(RowSegmenter, MultipleSegments) { - std::vector types = {int32()}; - auto batch = ExecBatchFromJSON(types, "[[1], [1], [2], [5], [3], [3], [5], [5], [4]]"); - ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types)); - TestSegments(segmenter, ExecSpan(batch), - {{0, 2, false, true}, - {2, 1, false, false}, - {3, 1, false, false}, - {4, 2, false, false}, - {6, 2, false, false}, - {8, 1, true, false}, - {9, 0, true, true}}); + { + std::vector types = {int32()}; + auto batch = + ExecBatchFromJSON(types, "[[1], [1], [2], [5], [3], [3], [5], [5], [4]]"); + ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types)); + TestSegments(segmenter, ExecSpan(batch), + {{0, 2, false, true}, + {2, 1, false, false}, + {3, 1, false, false}, + {4, 2, false, false}, + {6, 2, false, false}, + {8, 1, true, false}, + {9, 0, true, true}}); + } + { + std::vector types = {int32(), int32()}; + auto batch = ExecBatchFromJSON( + types, + "[[1, 1], [1, 1], [2, 2], [5, 5], [3, 3], [3, 3], [5, 5], [5, 5], [4, 4]]"); + ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types)); + TestSegments(segmenter, ExecSpan(batch), + {{0, 2, false, true}, + {2, 1, false, false}, + {3, 1, false, false}, + {4, 2, false, false}, + {6, 2, false, false}, + {8, 1, true, false}, + {9, 0, true, true}}); + } +} + +TEST(RowSegmenter, MultipleSegmentsMultipleBatches) { + { + std::vector types = {int32()}; + std::vector batches = { + ExecBatchFromJSON(types, "[[1]]"), ExecBatchFromJSON(types, "[[1], [2]]"), + ExecBatchFromJSON(types, "[[5], [3]]"), + ExecBatchFromJSON(types, "[[3], [5], [5]]"), ExecBatchFromJSON(types, "[[4]]")}; + + ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types)); + TestSegments(segmenter, ExecSpan(batches[0]), {{0, 1, true, true}}); + TestSegments(segmenter, ExecSpan(batches[1]), + {{0, 1, false, true}, {1, 1, true, false}}); + TestSegments(segmenter, ExecSpan(batches[2]), + {{0, 1, false, false}, {1, 1, true, false}}); + TestSegments(segmenter, ExecSpan(batches[3]), + {{0, 1, false, true}, {1, 2, true, false}}); + TestSegments(segmenter, ExecSpan(batches[4]), {{0, 1, true, false}}); + } + { + std::vector types = {int32(), int32()}; + std::vector batches = { + ExecBatchFromJSON(types, "[[1, 1]]"), + ExecBatchFromJSON(types, "[[1, 1], [2, 2]]"), + ExecBatchFromJSON(types, "[[5, 5], [3, 3]]"), + ExecBatchFromJSON(types, "[[3, 3], [5, 5], [5, 5]]"), + ExecBatchFromJSON(types, "[[4, 4]]")}; + + ASSERT_OK_AND_ASSIGN(auto segmenter, MakeRowSegmenter(types)); + TestSegments(segmenter, ExecSpan(batches[0]), {{0, 1, true, true}}); + TestSegments(segmenter, ExecSpan(batches[1]), + {{0, 1, false, true}, {1, 1, true, false}}); + TestSegments(segmenter, ExecSpan(batches[2]), + {{0, 1, false, false}, {1, 1, true, false}}); + TestSegments(segmenter, ExecSpan(batches[3]), + {{0, 1, false, true}, {1, 2, true, false}}); + TestSegments(segmenter, ExecSpan(batches[4]), {{0, 1, true, false}}); + } } namespace { diff --git 
a/cpp/src/arrow/acero/hash_join_node.cc b/cpp/src/arrow/acero/hash_join_node.cc index b49364300dac8..06405f16c8d4c 100644 --- a/cpp/src/arrow/acero/hash_join_node.cc +++ b/cpp/src/arrow/acero/hash_join_node.cc @@ -497,11 +497,11 @@ struct BloomFilterPushdownContext { using BuildFinishedCallback = std::function; using FiltersReceivedCallback = std::function; using FilterFinishedCallback = std::function; - void Init(HashJoinNode* owner, size_t num_threads, - RegisterTaskGroupCallback register_task_group_callback, - StartTaskGroupCallback start_task_group_callback, - FiltersReceivedCallback on_bloom_filters_received, bool disable_bloom_filter, - bool use_sync_execution); + Status Init(HashJoinNode* owner, size_t num_threads, + RegisterTaskGroupCallback register_task_group_callback, + StartTaskGroupCallback start_task_group_callback, + FiltersReceivedCallback on_bloom_filters_received, + bool disable_bloom_filter, bool use_sync_execution); Status StartProducing(size_t thread_index); @@ -559,8 +559,7 @@ struct BloomFilterPushdownContext { std::vector hashes(batch.length); std::vector bv(bit_vector_bytes); - ARROW_ASSIGN_OR_RAISE(arrow::util::TempVectorStack * stack, - ctx_->GetTempStack(thread_index)); + arrow::util::TempVectorStack* stack = &tld_[thread_index].stack; // Start with full selection for the current batch memset(selected.data(), 0xff, bit_vector_bytes); @@ -654,7 +653,17 @@ struct BloomFilterPushdownContext { FiltersReceivedCallback all_received_callback_; FilterFinishedCallback on_finished_; } eval_; + + static constexpr auto kTempStackUsage = + Hashing32::kHashBatchTempStackUsage + + (sizeof(uint32_t) + /*extra=*/1) * arrow::util::MiniBatch::kMiniBatchLength; + + struct ThreadLocalData { + arrow::util::TempVectorStack stack; + }; + std::vector tld_; }; + bool HashJoinSchema::HasDictionaries() const { for (int side = 0; side <= 1; ++side) { for (int icol = 0; icol < proj_maps[side].num_cols(HashJoinProjection::INPUT); @@ -930,7 +939,7 @@ class HashJoinNode : public ExecNode, public TracedNode { // we will change it back to just the CPU's thread pool capacity. 
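// Illustrative sketch only (not part of this patch): the thread-local-data change
// above moves the temp vector stack from a shared per-query resource to one
// preallocated stack per node and per thread, sized from that node's own
// worst-case usage, so deeply nested joins cannot overflow a single shared stack.
// The names below (ScratchStack, PerNodeScratch) are hypothetical, not Arrow APIs,
// and the size arithmetic only loosely mirrors kTempStackUsage above.
#include <cstddef>
#include <cstdint>
#include <vector>

struct ScratchStack {
  void Reserve(std::size_t bytes) { buffer_.resize(bytes); }
  std::vector<std::uint8_t> buffer_;
  std::size_t top_ = 0;  // next free byte
};

class PerNodeScratch {
 public:
  // Worst case for this node only: one 32-bit hash plus one padding byte per
  // row of a mini batch.
  static constexpr std::size_t kMiniBatchLength = 1024;
  static constexpr std::size_t kTempStackUsage =
      (sizeof(std::uint32_t) + /*extra=*/1) * kMiniBatchLength;

  void Init(std::size_t num_threads) {
    tld_.resize(num_threads);
    for (auto& local : tld_) local.stack.Reserve(kTempStackUsage);
  }

  ScratchStack* stack(std::size_t thread_index) { return &tld_[thread_index].stack; }

 private:
  struct ThreadLocalData {
    ScratchStack stack;
  };
  std::vector<ThreadLocalData> tld_;
};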
size_t num_threads = (GetCpuThreadPoolCapacity() + io::GetIOThreadPoolCapacity() + 1); - pushdown_context_.Init( + RETURN_NOT_OK(pushdown_context_.Init( this, num_threads, [ctx](std::function fn, std::function on_finished) { @@ -940,7 +949,7 @@ class HashJoinNode : public ExecNode, public TracedNode { return ctx->StartTaskGroup(task_group_id, num_tasks); }, [this](size_t thread_index) { return OnFiltersReceived(thread_index); }, - disable_bloom_filter_, use_sync_execution); + disable_bloom_filter_, use_sync_execution)); RETURN_NOT_OK(impl_->Init( ctx, join_type_, num_threads, &(schema_mgr_->proj_maps[0]), @@ -1037,7 +1046,7 @@ class HashJoinNode : public ExecNode, public TracedNode { BloomFilterPushdownContext pushdown_context_; }; -void BloomFilterPushdownContext::Init( +Status BloomFilterPushdownContext::Init( HashJoinNode* owner, size_t num_threads, RegisterTaskGroupCallback register_task_group_callback, StartTaskGroupCallback start_task_group_callback, @@ -1074,6 +1083,12 @@ void BloomFilterPushdownContext::Init( return eval_.on_finished_(thread_index, std::move(eval_.batches_)); }); start_task_group_callback_ = std::move(start_task_group_callback); + tld_.resize(num_threads); + for (auto& local_data : tld_) { + RETURN_NOT_OK(local_data.stack.Init(ctx_->memory_pool(), kTempStackUsage)); + } + + return Status::OK(); } Status BloomFilterPushdownContext::StartProducing(size_t thread_index) { @@ -1124,8 +1139,7 @@ Status BloomFilterPushdownContext::BuildBloomFilter_exec_task(size_t thread_inde } ARROW_ASSIGN_OR_RAISE(ExecBatch key_batch, ExecBatch::Make(std::move(key_columns))); - ARROW_ASSIGN_OR_RAISE(arrow::util::TempVectorStack * stack, - ctx_->GetTempStack(thread_index)); + arrow::util::TempVectorStack* stack = &tld_[thread_index].stack; arrow::util::TempVectorHolder hash_holder( stack, arrow::util::MiniBatch::kMiniBatchLength); uint32_t* hashes = hash_holder.mutable_data(); diff --git a/cpp/src/arrow/acero/hash_join_node_test.cc b/cpp/src/arrow/acero/hash_join_node_test.cc index 9c3dbc176ff4f..215b1e4d21125 100644 --- a/cpp/src/arrow/acero/hash_join_node_test.cc +++ b/cpp/src/arrow/acero/hash_join_node_test.cc @@ -28,6 +28,7 @@ #include "arrow/api.h" #include "arrow/compute/kernels/row_encoder_internal.h" #include "arrow/compute/kernels/test_util.h" +#include "arrow/compute/light_array_internal.h" #include "arrow/testing/extension_type.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/matchers.h" @@ -41,6 +42,7 @@ namespace arrow { using compute::call; using compute::default_exec_context; +using compute::ExecBatchBuilder; using compute::ExecSpan; using compute::field_ref; using compute::SortIndices; @@ -3201,5 +3203,55 @@ TEST(HashJoin, ChainedIntegerHashJoins) { } } +// Test that a large number of joins don't overflow the temp vector stack, like GH-39582 +// and GH-39951. +TEST(HashJoin, ManyJoins) { + // The idea of this case is to create many nested join nodes that may possibly cause + // recursive usage of temp vector stack. To make sure that the recursion happens: + // 1. A left-deep join tree is created so that the left-most (the final probe side) + // table will go through all the hash tables from the right side. + // 2. Left-outer join is used so that every join will increase the cardinality. + // 3. The left-most table contains rows of unique integers from 0 to N. + // 4. Each right table at level i contains two rows of integer i, so that the probing of + // each level will increase the result by one row. + // 5. 
The left-most table is a single batch of enough rows, so that at each level, the + // probing will accumulate enough result rows to have to output to the subsequent level + // before finishing the current batch (releasing the buffer allocated on the temp vector + // stack), which is essentially the recursive usage of the temp vector stack. + + // A fair number of joins to guarantee temp vector stack overflow before GH-41335. + const int num_joins = 64; + + // `ExecBatchBuilder::num_rows_max()` is the number of rows for swiss join to accumulate + // before outputting. + const int num_left_rows = ExecBatchBuilder::num_rows_max(); + ASSERT_OK_AND_ASSIGN( + auto left_batches, + MakeIntegerBatches({[](int row_id) -> int64_t { return row_id; }}, + schema({field("l_key", int32())}), + /*num_batches=*/1, /*batch_size=*/num_left_rows)); + Declaration root{"exec_batch_source", + ExecBatchSourceNodeOptions(std::move(left_batches.schema), + std::move(left_batches.batches))}; + + HashJoinNodeOptions join_opts(JoinType::LEFT_OUTER, /*left_keys=*/{"l_key"}, + /*right_keys=*/{"r_key"}); + + for (int i = 0; i < num_joins; ++i) { + ASSERT_OK_AND_ASSIGN(auto right_batches, + MakeIntegerBatches({[i](int) -> int64_t { return i; }}, + schema({field("r_key", int32())}), + /*num_batches=*/1, /*batch_size=*/2)); + Declaration table{"exec_batch_source", + ExecBatchSourceNodeOptions(std::move(right_batches.schema), + std::move(right_batches.batches))}; + + Declaration new_root{"hashjoin", {std::move(root), std::move(table)}, join_opts}; + root = std::move(new_root); + } + + ASSERT_OK_AND_ASSIGN(std::ignore, DeclarationToTable(std::move(root))); +} + } // namespace acero } // namespace arrow diff --git a/cpp/src/arrow/acero/query_context.cc b/cpp/src/arrow/acero/query_context.cc index a27397d12079d..18beb19ab7f8b 100644 --- a/cpp/src/arrow/acero/query_context.cc +++ b/cpp/src/arrow/acero/query_context.cc @@ -40,8 +40,7 @@ QueryContext::QueryContext(QueryOptions opts, ExecContext exec_context) const CpuInfo* QueryContext::cpu_info() const { return CpuInfo::GetInstance(); } int64_t QueryContext::hardware_flags() const { return cpu_info()->hardware_flags(); } -Status QueryContext::Init(size_t max_num_threads, util::AsyncTaskScheduler* scheduler) { - tld_.resize(max_num_threads); +Status QueryContext::Init(util::AsyncTaskScheduler* scheduler) { async_scheduler_ = scheduler; return Status::OK(); } @@ -50,15 +49,6 @@ size_t QueryContext::GetThreadIndex() { return thread_indexer_(); } size_t QueryContext::max_concurrency() const { return thread_indexer_.Capacity(); } -Result QueryContext::GetTempStack(size_t thread_index) { - if (!tld_[thread_index].is_init) { - RETURN_NOT_OK(tld_[thread_index].stack.Init( - memory_pool(), 32 * util::MiniBatch::kMiniBatchLength * sizeof(uint64_t))); - tld_[thread_index].is_init = true; - } - return &tld_[thread_index].stack; -} - Result> QueryContext::BeginExternalTask(std::string_view name) { Future<> completion_future = Future<>::Make(); if (async_scheduler_->AddSimpleTask([completion_future] { return completion_future; }, diff --git a/cpp/src/arrow/acero/query_context.h b/cpp/src/arrow/acero/query_context.h index 9ea11679cba05..3eff299439828 100644 --- a/cpp/src/arrow/acero/query_context.h +++ b/cpp/src/arrow/acero/query_context.h @@ -38,7 +38,7 @@ class ARROW_ACERO_EXPORT QueryContext { QueryContext(QueryOptions opts = {}, ExecContext exec_context = *default_exec_context()); - Status Init(size_t max_num_threads, arrow::util::AsyncTaskScheduler* scheduler); + Status 
Init(arrow::util::AsyncTaskScheduler* scheduler); const ::arrow::internal::CpuInfo* cpu_info() const; int64_t hardware_flags() const; @@ -52,7 +52,6 @@ class ARROW_ACERO_EXPORT QueryContext { size_t GetThreadIndex(); size_t max_concurrency() const; - Result GetTempStack(size_t thread_index); /// \brief Start an external task /// @@ -145,11 +144,6 @@ class ARROW_ACERO_EXPORT QueryContext { std::unique_ptr task_scheduler_ = TaskScheduler::Make(); ThreadIndexer thread_indexer_; - struct ThreadLocalData { - bool is_init = false; - arrow::util::TempVectorStack stack; - }; - std::vector tld_; std::atomic in_flight_bytes_to_disk_{0}; }; diff --git a/cpp/src/arrow/acero/sink_node.cc b/cpp/src/arrow/acero/sink_node.cc index 4ab6b4537de02..66f447aa87f11 100644 --- a/cpp/src/arrow/acero/sink_node.cc +++ b/cpp/src/arrow/acero/sink_node.cc @@ -423,6 +423,7 @@ class ConsumingSinkNode : public ExecNode, std::atomic backpressure_counter_ = 0; std::unique_ptr sequencer_; }; + static Result MakeTableConsumingSinkNode(ExecPlan* plan, std::vector inputs, const ExecNodeOptions& options) { diff --git a/cpp/src/arrow/acero/swiss_join.cc b/cpp/src/arrow/acero/swiss_join.cc index 542e943c4a82b..17c5212697339 100644 --- a/cpp/src/arrow/acero/swiss_join.cc +++ b/cpp/src/arrow/acero/swiss_join.cc @@ -2470,6 +2470,8 @@ Status JoinProbeProcessor::OnFinished() { class SwissJoin : public HashJoinImpl { public: + static constexpr auto kTempStackUsage = 64 * arrow::util::MiniBatch::kMiniBatchLength; + Status Init(QueryContext* ctx, JoinType join_type, size_t num_threads, const HashJoinProjectionMaps* proj_map_left, const HashJoinProjectionMaps* proj_map_right, @@ -2513,6 +2515,7 @@ class SwissJoin : public HashJoinImpl { local_states_.resize(num_threads_); for (int i = 0; i < num_threads_; ++i) { + RETURN_NOT_OK(local_states_[i].stack.Init(pool_, kTempStackUsage)); local_states_[i].hash_table_ready = false; local_states_[i].num_output_batches = 0; local_states_[i].materialize.Init(pool_, proj_map_left, proj_map_right); @@ -2566,8 +2569,7 @@ class SwissJoin : public HashJoinImpl { ExecBatch keypayload_batch; ARROW_ASSIGN_OR_RAISE(keypayload_batch, KeyPayloadFromInput(/*side=*/0, &batch)); - ARROW_ASSIGN_OR_RAISE(arrow::util::TempVectorStack * temp_stack, - ctx_->GetTempStack(thread_index)); + arrow::util::TempVectorStack* temp_stack = &local_states_[thread_index].stack; return CancelIfNotOK( probe_processor_.OnNextBatch(thread_index, keypayload_batch, temp_stack, @@ -2679,8 +2681,7 @@ class SwissJoin : public HashJoinImpl { input_batch.values[schema->num_cols(HashJoinProjection::KEY) + icol]; } } - ARROW_ASSIGN_OR_RAISE(arrow::util::TempVectorStack * temp_stack, - ctx_->GetTempStack(thread_id)); + arrow::util::TempVectorStack* temp_stack = &local_states_[thread_id].stack; RETURN_NOT_OK(CancelIfNotOK(hash_table_build_.PushNextBatch( static_cast(thread_id), key_batch, no_payload ? 
nullptr : &payload_batch, temp_stack))); @@ -2715,8 +2716,7 @@ class SwissJoin : public HashJoinImpl { Status MergeFinished(size_t thread_id) { RETURN_NOT_OK(status()); - ARROW_ASSIGN_OR_RAISE(arrow::util::TempVectorStack * temp_stack, - ctx_->GetTempStack(thread_id)); + arrow::util::TempVectorStack* temp_stack = &local_states_[thread_id].stack; hash_table_build_.FinishPrtnMerge(temp_stack); return CancelIfNotOK(OnBuildHashTableFinished(static_cast(thread_id))); } @@ -2771,8 +2771,7 @@ class SwissJoin : public HashJoinImpl { std::min((task_id + 1) * kNumRowsPerScanTask, hash_table_.num_rows()); // Get thread index and related temp vector stack // - ARROW_ASSIGN_OR_RAISE(arrow::util::TempVectorStack * temp_stack, - ctx_->GetTempStack(thread_id)); + arrow::util::TempVectorStack* temp_stack = &local_states_[thread_id].stack; // Split into mini-batches // @@ -2949,6 +2948,7 @@ class SwissJoin : public HashJoinImpl { FinishedCallback finished_callback_; struct ThreadLocalState { + arrow::util::TempVectorStack stack; JoinResultMaterialize materialize; std::vector temp_column_arrays; int64_t num_output_batches; diff --git a/cpp/src/arrow/array/array_list_test.cc b/cpp/src/arrow/array/array_list_test.cc index 18afcc90d71f8..e79ce6fe172b2 100644 --- a/cpp/src/arrow/array/array_list_test.cc +++ b/cpp/src/arrow/array/array_list_test.cc @@ -1287,7 +1287,7 @@ TEST_F(TestMapArray, ValidateErrorNullKey) { } TEST_F(TestMapArray, FromArrays) { - std::shared_ptr offsets1, offsets2, offsets3, offsets4, keys, items; + std::shared_ptr offsets1, offsets2, offsets3, offsets4, offsets5, keys, items; std::vector offsets_is_valid3 = {true, false, true, true}; std::vector offsets_is_valid4 = {true, true, false, true}; @@ -1342,6 +1342,20 @@ TEST_F(TestMapArray, FromArrays) { // Zero-length offsets ASSERT_RAISES(Invalid, MapArray::FromArrays(offsets1->Slice(0, 0), keys, items, pool_)); + // Offseted offsets + ASSERT_OK_AND_ASSIGN(auto map5, + MapArray::FromArrays(offsets1->Slice(1), keys, items, pool_)); + ASSERT_OK(map5->Validate()); + + AssertArraysEqual(*expected1.Slice(1), *map5); + + std::vector offset5_values = {2, 2, 6}; + ArrayFromVector(offset5_values, &offsets5); + ASSERT_OK_AND_ASSIGN(auto map6, MapArray::FromArrays(offsets5, keys, items, pool_)); + ASSERT_OK(map6->Validate()); + + AssertArraysEqual(*map5, *map6); + // Offsets not the right type ASSERT_RAISES(TypeError, MapArray::FromArrays(keys, offsets1, items, pool_)); diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc index 24e0dfb7081ac..1be771d8228d9 100644 --- a/cpp/src/arrow/array/array_nested.cc +++ b/cpp/src/arrow/array/array_nested.cc @@ -790,7 +790,7 @@ MapArray::MapArray(const std::shared_ptr& type, int64_t length, const std::shared_ptr& items, int64_t null_count, int64_t offset) { auto pair_data = ArrayData::Make(type->fields()[0]->type(), keys->data()->length, - {nullptr}, {keys->data(), items->data()}, 0, offset); + {nullptr}, {keys->data(), items->data()}, 0); auto map_data = ArrayData::Make(type, length, std::move(buffers), {pair_data}, null_count, offset); SetData(map_data); diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index af64908b59582..7e25ad61fa2ea 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -827,6 +827,9 @@ TEST_F(TestArray, TestFillFromScalar) { // GH-40069: Data-race when concurrent calling ArraySpan::FillFromScalar of the same // scalar instance. 
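// [Illustrative sketch, not part of this patch] The test added below guards against
// the GH-40069 data race by exercising the same object from many threads at once so
// a thread sanitizer can flag unsynchronized access.  This minimal standalone sketch
// shows the general shape of such a regression test; SharedScalar and ReadBack are
// hypothetical stand-ins, not Arrow types.
#include <atomic>
#include <thread>
#include <vector>

struct SharedScalar { int value = 42; };              // shared, nominally read-only
int ReadBack(const SharedScalar& s) { return s.value; }  // function under test

int main() {
  SharedScalar scalar;
  std::atomic<int> sum{0};
  std::vector<std::thread> threads;
  for (int i = 0; i < 8; ++i) {
    threads.emplace_back([&] {
      // Concurrent readers of the same instance; TSAN reports any hidden mutation.
      for (int j = 0; j < 1000; ++j) sum += ReadBack(scalar);
    });
  }
  for (auto& t : threads) t.join();
  return sum.load() == 8 * 1000 * 42 ? 0 : 1;
}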
TEST_F(TestArray, TestConcurrentFillFromScalar) { +#ifndef ARROW_ENABLE_THREADING + GTEST_SKIP() << "Test requires threading support"; +#endif for (auto type : TestArrayUtilitiesAgainstTheseTypes()) { ARROW_SCOPED_TRACE("type = ", type->ToString()); for (auto seed : {0u, 0xdeadbeef, 42u}) { diff --git a/cpp/src/arrow/chunk_resolver.cc b/cpp/src/arrow/chunk_resolver.cc index 29bccb52658f8..55eec53ced1c7 100644 --- a/cpp/src/arrow/chunk_resolver.cc +++ b/cpp/src/arrow/chunk_resolver.cc @@ -19,14 +19,14 @@ #include #include +#include #include #include #include "arrow/array.h" #include "arrow/record_batch.h" -namespace arrow { -namespace internal { +namespace arrow::internal { namespace { template @@ -54,6 +54,51 @@ inline std::vector MakeChunksOffsets(const std::vector& chunks) { offsets[chunks.size()] = offset; return offsets; } + +/// \pre all the pre-conditions of ChunkResolver::ResolveMany() +/// \pre num_offsets - 1 <= std::numeric_limits::max() +template +void ResolveManyInline(size_t num_offsets, const int64_t* signed_offsets, + int64_t n_indices, const IndexType* logical_index_vec, + IndexType* out_chunk_index_vec, IndexType chunk_hint, + IndexType* out_index_in_chunk_vec) { + auto* offsets = reinterpret_cast(signed_offsets); + const auto num_chunks = static_cast(num_offsets - 1); + // chunk_hint in [0, num_offsets) per the precondition. + for (int64_t i = 0; i < n_indices; i++) { + const auto index = static_cast(logical_index_vec[i]); + if (index >= offsets[chunk_hint] && + (chunk_hint == num_chunks || index < offsets[chunk_hint + 1])) { + out_chunk_index_vec[i] = chunk_hint; // hint is correct! + continue; + } + // lo < hi is guaranteed by `num_offsets = chunks.size() + 1` + auto chunk_index = + ChunkResolver::Bisect(index, offsets, /*lo=*/0, /*hi=*/num_offsets); + chunk_hint = static_cast(chunk_index); + out_chunk_index_vec[i] = chunk_hint; + } + if (out_index_in_chunk_vec != NULLPTR) { + for (int64_t i = 0; i < n_indices; i++) { + auto logical_index = logical_index_vec[i]; + auto chunk_index = out_chunk_index_vec[i]; + // chunk_index is in [0, chunks.size()] no matter what the + // value of logical_index is, so it's always safe to dereference + // offset_ as it contains chunks.size()+1 values. + out_index_in_chunk_vec[i] = + logical_index - static_cast(offsets[chunk_index]); +#if defined(ARROW_VALGRIND) || defined(ADDRESS_SANITIZER) + // Make it more likely that Valgrind/ASAN can catch an invalid memory + // access by poisoning out_index_in_chunk_vec[i] when the logical + // index is out-of-bounds. 
+ if (chunk_index == num_chunks) { + out_index_in_chunk_vec[i] = std::numeric_limits::max(); + } +#endif + } + } +} + } // namespace ChunkResolver::ChunkResolver(const ArrayVector& chunks) noexcept @@ -84,5 +129,32 @@ ChunkResolver& ChunkResolver::operator=(const ChunkResolver& other) noexcept { return *this; } -} // namespace internal -} // namespace arrow +void ChunkResolver::ResolveManyImpl(int64_t n_indices, const uint8_t* logical_index_vec, + uint8_t* out_chunk_index_vec, uint8_t chunk_hint, + uint8_t* out_index_in_chunk_vec) const { + ResolveManyInline(offsets_.size(), offsets_.data(), n_indices, logical_index_vec, + out_chunk_index_vec, chunk_hint, out_index_in_chunk_vec); +} + +void ChunkResolver::ResolveManyImpl(int64_t n_indices, const uint32_t* logical_index_vec, + uint32_t* out_chunk_index_vec, uint32_t chunk_hint, + uint32_t* out_index_in_chunk_vec) const { + ResolveManyInline(offsets_.size(), offsets_.data(), n_indices, logical_index_vec, + out_chunk_index_vec, chunk_hint, out_index_in_chunk_vec); +} + +void ChunkResolver::ResolveManyImpl(int64_t n_indices, const uint16_t* logical_index_vec, + uint16_t* out_chunk_index_vec, uint16_t chunk_hint, + uint16_t* out_index_in_chunk_vec) const { + ResolveManyInline(offsets_.size(), offsets_.data(), n_indices, logical_index_vec, + out_chunk_index_vec, chunk_hint, out_index_in_chunk_vec); +} + +void ChunkResolver::ResolveManyImpl(int64_t n_indices, const uint64_t* logical_index_vec, + uint64_t* out_chunk_index_vec, uint64_t chunk_hint, + uint64_t* out_index_in_chunk_vec) const { + ResolveManyInline(offsets_.size(), offsets_.data(), n_indices, logical_index_vec, + out_chunk_index_vec, chunk_hint, out_index_in_chunk_vec); +} + +} // namespace arrow::internal diff --git a/cpp/src/arrow/chunk_resolver.h b/cpp/src/arrow/chunk_resolver.h index c5dad1a17b18e..a2a3d5a864243 100644 --- a/cpp/src/arrow/chunk_resolver.h +++ b/cpp/src/arrow/chunk_resolver.h @@ -20,6 +20,8 @@ #include #include #include +#include +#include #include #include "arrow/type_fwd.h" @@ -27,6 +29,8 @@ namespace arrow::internal { +struct ChunkResolver; + struct ChunkLocation { /// \brief Index of the chunk in the array of chunks /// @@ -36,8 +40,17 @@ struct ChunkLocation { /// \brief Index of the value in the chunk /// - /// The value is undefined if chunk_index >= chunks.size() + /// The value is UNDEFINED if chunk_index >= chunks.size() int64_t index_in_chunk = 0; + + ChunkLocation() = default; + + ChunkLocation(int64_t chunk_index, int64_t index_in_chunk) + : chunk_index(chunk_index), index_in_chunk(index_in_chunk) {} + + bool operator==(ChunkLocation other) const { + return chunk_index == other.chunk_index && index_in_chunk == other.index_in_chunk; + } }; /// \brief An utility that incrementally resolves logical indices into @@ -60,12 +73,35 @@ struct ARROW_EXPORT ChunkResolver { explicit ChunkResolver(const std::vector& chunks) noexcept; explicit ChunkResolver(const RecordBatchVector& batches) noexcept; + /// \brief Construct a ChunkResolver from a vector of chunks.size() + 1 offsets. + /// + /// The first offset must be 0 and the last offset must be the logical length of the + /// chunked array. Each offset before the last represents the starting logical index of + /// the corresponding chunk. 
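// [Illustrative sketch, not part of this patch] The constructor documented above takes
// a chunks.size()+1 offsets vector.  This standalone example shows the invariant it
// asserts and how a logical index maps back to (chunk, index-in-chunk) with the same
// upper-bound style search that Bisect() performs; OffsetsFromChunkLengths and
// ResolveLogicalIndex are hypothetical helpers, not Arrow API.
#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

std::vector<int64_t> OffsetsFromChunkLengths(const std::vector<int64_t>& lengths) {
  std::vector<int64_t> offsets = {0};  // first offset is always 0
  for (int64_t len : lengths) offsets.push_back(offsets.back() + len);
  return offsets;                      // last offset is the total logical length
}

std::pair<int64_t, int64_t> ResolveLogicalIndex(const std::vector<int64_t>& offsets,
                                                int64_t logical_index) {
  int64_t lo = 0, hi = static_cast<int64_t>(offsets.size());
  while (hi - lo > 1) {
    const int64_t mid = lo + (hi - lo) / 2;
    if (logical_index >= offsets[mid]) lo = mid; else hi = mid;
  }
  return {lo, logical_index - offsets[lo]};
}

int main() {
  // Chunks of lengths {2, 1, 7}  ->  offsets {0, 2, 3, 10}.
  const auto offsets = OffsetsFromChunkLengths({2, 1, 7});
  assert(offsets == (std::vector<int64_t>{0, 2, 3, 10}));
  assert(ResolveLogicalIndex(offsets, 0) == (std::pair<int64_t, int64_t>{0, 0}));
  assert(ResolveLogicalIndex(offsets, 2) == (std::pair<int64_t, int64_t>{1, 0}));
  assert(ResolveLogicalIndex(offsets, 9) == (std::pair<int64_t, int64_t>{2, 6}));
  assert(ResolveLogicalIndex(offsets, 10).first == 3);  // out of bounds -> chunks.size()
  return 0;
}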
+ explicit ChunkResolver(std::vector offsets) noexcept + : offsets_(std::move(offsets)), cached_chunk_(0) { +#ifndef NDEBUG + assert(offsets_.size() >= 1); + assert(offsets_[0] == 0); + for (size_t i = 1; i < offsets_.size(); i++) { + assert(offsets_[i] >= offsets_[i - 1]); + } +#endif + } + ChunkResolver(ChunkResolver&& other) noexcept; ChunkResolver& operator=(ChunkResolver&& other) noexcept; ChunkResolver(const ChunkResolver& other) noexcept; ChunkResolver& operator=(const ChunkResolver& other) noexcept; + int64_t logical_array_length() const { return offsets_.back(); } + int64_t num_chunks() const { return static_cast(offsets_.size()) - 1; } + + int64_t chunk_length(int64_t chunk_index) const { + return offsets_[chunk_index + 1] - offsets_[chunk_index]; + } + /// \brief Resolve a logical index to a ChunkLocation. /// /// The returned ChunkLocation contains the chunk index and the within-chunk index @@ -81,7 +117,7 @@ struct ARROW_EXPORT ChunkResolver { const auto cached_chunk = cached_chunk_.load(std::memory_order_relaxed); const auto chunk_index = ResolveChunkIndex(index, cached_chunk); - return {chunk_index, index - offsets_[chunk_index]}; + return ChunkLocation{chunk_index, index - offsets_[chunk_index]}; } /// \brief Resolve a logical index to a ChunkLocation. @@ -97,12 +133,70 @@ struct ARROW_EXPORT ChunkResolver { /// \return ChunkLocation with a valid chunk_index if index is within /// bounds, or with chunk_index == chunks.size() if logical index is /// `>= chunked_array.length()`. - inline ChunkLocation ResolveWithChunkIndexHint(int64_t index, - ChunkLocation hint) const { + inline ChunkLocation ResolveWithHint(int64_t index, ChunkLocation hint) const { assert(hint.chunk_index < static_cast(offsets_.size())); const auto chunk_index = ResolveChunkIndex(index, hint.chunk_index); - return {chunk_index, index - offsets_[chunk_index]}; + return ChunkLocation{chunk_index, index - offsets_[chunk_index]}; + } + + /// \brief Resolve `n_indices` logical indices to chunk indices. + /// + /// \pre 0 <= logical_index_vec[i] < logical_array_length() + /// (for well-defined and valid chunk index results) + /// \pre out_chunk_index_vec has space for `n_indices` + /// \pre chunk_hint in [0, chunks.size()] + /// \post out_chunk_index_vec[i] in [0, chunks.size()] for i in [0, n) + /// \post if logical_index_vec[i] >= chunked_array.length(), then + /// out_chunk_index_vec[i] == chunks.size() + /// and out_index_in_chunk_vec[i] is UNDEFINED (can be out-of-bounds) + /// \post if logical_index_vec[i] < 0, then both out_chunk_index_vec[i] and + /// out_index_in_chunk_vec[i] are UNDEFINED + /// + /// \param n_indices The number of logical indices to resolve + /// \param logical_index_vec The logical indices to resolve + /// \param out_chunk_index_vec The output array where the chunk indices will be written + /// \param chunk_hint 0 or the last chunk_index produced by ResolveMany + /// \param out_index_in_chunk_vec If not NULLPTR, the output array where the + /// within-chunk indices will be written + /// \return false iff chunks.size() > std::numeric_limits::max() + template + [[nodiscard]] bool ResolveMany(int64_t n_indices, const IndexType* logical_index_vec, + IndexType* out_chunk_index_vec, IndexType chunk_hint = 0, + IndexType* out_index_in_chunk_vec = NULLPTR) const { + if constexpr (sizeof(IndexType) < sizeof(uint64_t)) { + // The max value returned by Bisect is `offsets.size() - 1` (= chunks.size()). 
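// [Illustrative sketch, not part of this patch] The narrowing check that follows exists
// because the largest value ResolveMany can write is chunks.size(), and a chunked array
// with many empty chunks can have more chunks than logical elements.  A minimal
// standalone illustration of that check (ChunkIndexFits is a hypothetical helper):
#include <cstdint>
#include <cstdio>
#include <limits>

template <typename IndexType>
bool ChunkIndexFits(uint64_t num_chunks) {
  return num_chunks <= static_cast<uint64_t>(std::numeric_limits<IndexType>::max());
}

int main() {
  // 300 (mostly empty) chunks: chunk indices up to 300 cannot be stored in uint8_t,
  // so a ResolveMany-style API must report failure instead of writing wrapped values.
  std::printf("%d\n", ChunkIndexFits<uint8_t>(300));   // 0 -> must return false
  std::printf("%d\n", ChunkIndexFits<uint16_t>(300));  // 1 -> fits
  return 0;
}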
+ constexpr uint64_t kMaxIndexTypeValue = std::numeric_limits::max(); + // A ChunkedArray with enough empty chunks can make the index of a chunk + // exceed the logical index and thus the maximum value of IndexType. + const bool chunk_index_fits_on_type = + static_cast(offsets_.size() - 1) <= kMaxIndexTypeValue; + if (ARROW_PREDICT_FALSE(!chunk_index_fits_on_type)) { + return false; + } + // Since an index-in-chunk cannot possibly exceed the logical index being + // queried, we don't have to worry about these values not fitting on IndexType. + } + if constexpr (std::is_signed_v) { + // We interpret signed integers as unsigned and avoid having to generate double + // the amount of binary code to handle each integer width. + // + // Negative logical indices can become large values when cast to unsigned, and + // they are gracefully handled by ResolveManyImpl, but both the chunk index + // and the index in chunk values will be undefined in these cases. This + // happend because int8_t(-1) == uint8_t(255) and 255 could be a valid + // logical index in the chunked array. + using U = std::make_unsigned_t; + ResolveManyImpl(n_indices, reinterpret_cast(logical_index_vec), + reinterpret_cast(out_chunk_index_vec), + static_cast(chunk_hint), + reinterpret_cast(out_index_in_chunk_vec)); + } else { + static_assert(std::is_unsigned_v); + ResolveManyImpl(n_indices, logical_index_vec, out_chunk_index_vec, chunk_hint, + out_index_in_chunk_vec); + } + return true; } private: @@ -130,17 +224,33 @@ struct ARROW_EXPORT ChunkResolver { return chunk_index; } + /// \pre all the pre-conditions of ChunkResolver::ResolveMany() + /// \pre num_offsets - 1 <= std::numeric_limits::max() + void ResolveManyImpl(int64_t, const uint8_t*, uint8_t*, uint8_t, uint8_t*) const; + void ResolveManyImpl(int64_t, const uint16_t*, uint16_t*, uint16_t, uint16_t*) const; + void ResolveManyImpl(int64_t, const uint32_t*, uint32_t*, uint32_t, uint32_t*) const; + void ResolveManyImpl(int64_t, const uint64_t*, uint64_t*, uint64_t, uint64_t*) const; + + public: /// \brief Find the index of the chunk that contains the logical index. /// /// Any non-negative index is accepted. When `hi=num_offsets`, the largest /// possible return value is `num_offsets-1` which is equal to - /// `chunks.size()`. The is returned when the logical index is out-of-bounds. + /// `chunks.size()`. Which is returned when the logical index is greater or + /// equal the logical length of the chunked array. /// - /// \pre index >= 0 + /// \pre index >= 0 (otherwise, when index is negative, hi-1 is returned) /// \pre lo < hi /// \pre lo >= 0 && hi <= offsets_.size() static inline int64_t Bisect(int64_t index, const int64_t* offsets, int64_t lo, int64_t hi) { + return Bisect(static_cast(index), + reinterpret_cast(offsets), static_cast(lo), + static_cast(hi)); + } + + static inline int64_t Bisect(uint64_t index, const uint64_t* offsets, uint64_t lo, + uint64_t hi) { // Similar to std::upper_bound(), but slightly different as our offsets // array always starts with 0. auto n = hi - lo; @@ -148,8 +258,8 @@ struct ARROW_EXPORT ChunkResolver { // (lo < hi is guaranteed by the precondition). 
assert(n > 1 && "lo < hi is a precondition of Bisect"); do { - const int64_t m = n >> 1; - const int64_t mid = lo + m; + const uint64_t m = n >> 1; + const uint64_t mid = lo + m; if (index >= offsets[mid]) { lo = mid; n -= m; diff --git a/cpp/src/arrow/chunked_array_test.cc b/cpp/src/arrow/chunked_array_test.cc index 6ca52ab46ca68..e9cc283b53cd5 100644 --- a/cpp/src/arrow/chunked_array_test.cc +++ b/cpp/src/arrow/chunked_array_test.cc @@ -23,6 +23,7 @@ #include #include +#include "arrow/chunk_resolver.h" #include "arrow/scalar.h" #include "arrow/status.h" #include "arrow/testing/builder.h" @@ -34,6 +35,9 @@ namespace arrow { +using internal::ChunkLocation; +using internal::ChunkResolver; + class TestChunkedArray : public ::testing::Test { protected: virtual void Construct() { @@ -310,4 +314,200 @@ TEST_F(TestChunkedArray, GetScalar) { ASSERT_RAISES(IndexError, carr.GetScalar(7)); } +// ChunkResolver tests + +using IndexTypes = ::testing::Types; + +TEST(TestChunkResolver, Resolve) { + ChunkResolver empty(std::vector({0})); // [] + // ChunkLocation::index_in_chunk is undefined when chunk_index==chunks.size(), + // so only chunk_index is compared in these cases. + ASSERT_EQ(empty.Resolve(0).chunk_index, 0); + ASSERT_EQ(empty.Resolve(0).chunk_index, 0); + + ChunkResolver one(std::vector({0, 1})); // [[0]] + ASSERT_EQ(one.Resolve(1).chunk_index, 1); + ASSERT_EQ(one.Resolve(0), (ChunkLocation(0, 0))); + ASSERT_EQ(one.Resolve(1).chunk_index, 1); + + ChunkResolver one_and_empty(std::vector({0, 1, 1, 1})); // [[0], [], []] + ASSERT_EQ(one_and_empty.Resolve(3).chunk_index, 3); + ASSERT_EQ(one_and_empty.Resolve(2).chunk_index, 3); + ASSERT_EQ(one_and_empty.Resolve(1).chunk_index, 3); + ASSERT_EQ(one_and_empty.Resolve(0), (ChunkLocation(0, 0))); + ASSERT_EQ(one_and_empty.Resolve(1).chunk_index, 3); + ASSERT_EQ(one_and_empty.Resolve(2).chunk_index, 3); + ASSERT_EQ(one_and_empty.Resolve(3).chunk_index, 3); + + ChunkResolver one_one_one(std::vector({0, 1, 2, 3})); // [[0], [1], [2]] + ASSERT_EQ(one_one_one.Resolve(3).chunk_index, 3); + ASSERT_EQ(one_one_one.Resolve(2), (ChunkLocation(2, 0))); + ASSERT_EQ(one_one_one.Resolve(1), (ChunkLocation(1, 0))); + ASSERT_EQ(one_one_one.Resolve(0), (ChunkLocation(0, 0))); + ASSERT_EQ(one_one_one.Resolve(1), (ChunkLocation(1, 0))); + ASSERT_EQ(one_one_one.Resolve(2), (ChunkLocation(2, 0))); + ASSERT_EQ(one_one_one.Resolve(3).chunk_index, 3); + + ChunkResolver resolver(std::vector({0, 2, 3, 10})); // [[0, 1], [2], [3..9]] + ASSERT_EQ(resolver.Resolve(10).chunk_index, 3); + ASSERT_EQ(resolver.Resolve(9), (ChunkLocation(2, 6))); + ASSERT_EQ(resolver.Resolve(8), (ChunkLocation(2, 5))); + ASSERT_EQ(resolver.Resolve(4), (ChunkLocation(2, 1))); + ASSERT_EQ(resolver.Resolve(3), (ChunkLocation(2, 0))); + ASSERT_EQ(resolver.Resolve(2), (ChunkLocation(1, 0))); + ASSERT_EQ(resolver.Resolve(1), (ChunkLocation(0, 1))); + ASSERT_EQ(resolver.Resolve(0), (ChunkLocation(0, 0))); + ASSERT_EQ(resolver.Resolve(1), (ChunkLocation(0, 1))); + ASSERT_EQ(resolver.Resolve(2), (ChunkLocation(1, 0))); + ASSERT_EQ(resolver.Resolve(3), (ChunkLocation(2, 0))); + ASSERT_EQ(resolver.Resolve(4), (ChunkLocation(2, 1))); + ASSERT_EQ(resolver.Resolve(8), (ChunkLocation(2, 5))); + ASSERT_EQ(resolver.Resolve(9), (ChunkLocation(2, 6))); + ASSERT_EQ(resolver.Resolve(10).chunk_index, 3); +} + +template +class TestChunkResolverMany : public ::testing::Test { + public: + using IndexType = T; + + Result> ResolveMany( + const ChunkResolver& resolver, const std::vector& logical_index_vec) { + const size_t n = 
logical_index_vec.size(); + std::vector chunk_index_vec; + chunk_index_vec.resize(n); + std::vector index_in_chunk_vec; + index_in_chunk_vec.resize(n); + bool valid = resolver.ResolveMany( + static_cast(n), logical_index_vec.data(), chunk_index_vec.data(), 0, + index_in_chunk_vec.data()); + if (ARROW_PREDICT_FALSE(!valid)) { + return Status::Invalid("index type doesn't fit possible chunk indexes"); + } + std::vector locations; + locations.reserve(n); + for (size_t i = 0; i < n; i++) { + auto chunk_index = static_cast(chunk_index_vec[i]); + auto index_in_chunk = static_cast(index_in_chunk_vec[i]); + locations.emplace_back(chunk_index, index_in_chunk); + } + return locations; + } + + void CheckResolveMany(const ChunkResolver& resolver, + const std::vector& logical_index_vec) { + ASSERT_OK_AND_ASSIGN(auto locations, ResolveMany(resolver, logical_index_vec)); + EXPECT_EQ(logical_index_vec.size(), locations.size()); + for (size_t i = 0; i < logical_index_vec.size(); i++) { + IndexType logical_index = logical_index_vec[i]; + const auto expected = resolver.Resolve(logical_index); + ASSERT_LE(expected.chunk_index, resolver.num_chunks()); + if (expected.chunk_index == resolver.num_chunks()) { + // index_in_chunk is undefined in this case + ASSERT_EQ(locations[i].chunk_index, expected.chunk_index); + } else { + ASSERT_EQ(locations[i], expected); + } + } + } + + void TestBasics() { + std::vector logical_index_vec; + + ChunkResolver empty(std::vector({0})); // [] + logical_index_vec = {0, 0}; + CheckResolveMany(empty, logical_index_vec); + + ChunkResolver one(std::vector({0, 1})); // [[0]] + logical_index_vec = {1, 0, 1}; + CheckResolveMany(one, logical_index_vec); + + ChunkResolver one_and_empty(std::vector({0, 1, 1, 1})); // [[0], [], []] + logical_index_vec = {3, 2, 1, 0, 1, 2, 3}; + CheckResolveMany(one_and_empty, logical_index_vec); + + ChunkResolver one_one_one(std::vector({0, 1, 2, 3})); // [[0], [1], [2]] + logical_index_vec = {3, 2, 1, 0, 1, 2, 3}; + CheckResolveMany(one_one_one, logical_index_vec); + + ChunkResolver resolver(std::vector({0, 2, 3, 10})); // [[0, 1], [2], [3..9]] + logical_index_vec = {10, 9, 8, 4, 3, 2, 1, 0, 1, 2, 3, 4, 8, 9, 10}; + CheckResolveMany(resolver, logical_index_vec); + } + + void TestOutOfBounds() { + ChunkResolver resolver(std::vector({0, 2, 3, 10})); // [[0, 1], [2], [3..9]] + + std::vector logical_index_vec = {10, 11, 12, 13, 14, 13, 11, 10}; + ASSERT_OK_AND_ASSIGN(auto locations, ResolveMany(resolver, logical_index_vec)); + EXPECT_EQ(logical_index_vec.size(), locations.size()); + for (size_t i = 0; i < logical_index_vec.size(); i++) { + ASSERT_EQ(locations[i].chunk_index, resolver.num_chunks()); + } + + if constexpr (std::is_signed_v) { + std::vector logical_index_vec = {-1, -2, -3, -4, INT8_MIN}; + + ChunkResolver resolver(std::vector({0, 2, 128})); // [[0, 1], [2..127]] + ASSERT_OK_AND_ASSIGN(auto locations, ResolveMany(resolver, logical_index_vec)); + EXPECT_EQ(logical_index_vec.size(), locations.size()); + for (size_t i = 0; i < logical_index_vec.size(); i++) { + // All the negative indices are greater than resolver.logical_array_length()-1 + // when cast to uint8_t. 
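// [Illustrative sketch, not part of this patch] The comment above relies on how negative
// signed indices look once reinterpreted as unsigned, which is why they resolve "past the
// end" for short arrays but are merely undefined in general.  A two-line standalone
// demonstration of that reinterpretation:
#include <cstdint>
#include <cstdio>

int main() {
  std::printf("%u\n", static_cast<uint8_t>(int8_t{-1}));    // 255: out of range for length 128
  std::printf("%u\n", static_cast<uint8_t>(int8_t{-128}));  // 128: could be a valid index for length 256
  return 0;
}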
+ ASSERT_EQ(locations[i].chunk_index, resolver.num_chunks()); + } + + if constexpr (sizeof(IndexType) == 1) { + ChunkResolver resolver(std::vector( + {0, 2, 128, 129, 256})); // [[0, 1], [2..127], [128], [129, 255]] + ASSERT_OK_AND_ASSIGN(auto locations, ResolveMany(resolver, logical_index_vec)); + EXPECT_EQ(logical_index_vec.size(), locations.size()); + for (size_t i = 0; i < logical_index_vec.size(); i++) { + if constexpr (sizeof(IndexType) == 1) { + // All the negative 8-bit indices are SMALLER than + // resolver.logical_array_length()=256 when cast to 8-bit unsigned integers. + // So the resolved locations might look valid, but they should not be trusted. + ASSERT_LT(locations[i].chunk_index, resolver.num_chunks()); + } else { + // All the negative indices are greater than resolver.logical_array_length() + // when cast to 16/32/64-bit unsigned integers. + ASSERT_EQ(locations[i].chunk_index, resolver.num_chunks()); + } + } + } + } + } + + void TestOverflow() { + const int64_t kMaxIndex = std::is_signed_v ? 127 : 255; + std::vector logical_index_vec = {0, 1, 2, + static_cast(kMaxIndex)}; + + // Overflows are rare because to make them possible, we need more chunks + // than logical elements in the ChunkedArray. That requires at least one + // empty chunk. + std::vector offsets; + for (int64_t i = 0; i <= kMaxIndex; i++) { + offsets.push_back(i); + } + ChunkResolver resolver{offsets}; + ASSERT_OK(ResolveMany(resolver, logical_index_vec)); + + offsets.push_back(kMaxIndex); // adding an empty chunk + ChunkResolver resolver_with_empty{offsets}; + if (sizeof(IndexType) == 1) { + ASSERT_NOT_OK(ResolveMany(resolver_with_empty, logical_index_vec)); + } else { + ASSERT_OK(ResolveMany(resolver_with_empty, logical_index_vec)); + } + } +}; + +TYPED_TEST_SUITE(TestChunkResolverMany, IndexTypes); + +TYPED_TEST(TestChunkResolverMany, Basics) { this->TestBasics(); } +TYPED_TEST(TestChunkResolverMany, OutOfBounds) { this->TestOutOfBounds(); } +TYPED_TEST(TestChunkResolverMany, Overflow) { this->TestOverflow(); } + } // namespace arrow diff --git a/cpp/src/arrow/compute/CMakeLists.txt b/cpp/src/arrow/compute/CMakeLists.txt index badcf4f2f26ac..fb778be113029 100644 --- a/cpp/src/arrow/compute/CMakeLists.txt +++ b/cpp/src/arrow/compute/CMakeLists.txt @@ -90,7 +90,8 @@ add_arrow_test(internals_test light_array_test.cc registry_test.cc key_hash_test.cc - row/compare_test.cc) + row/compare_test.cc + row/grouper_test.cc) add_arrow_compute_test(expression_test SOURCES expression_test.cc) diff --git a/cpp/src/arrow/compute/api_vector.cc b/cpp/src/arrow/compute/api_vector.cc index d47ee42ebf239..f0d5c0fcc3d72 100644 --- a/cpp/src/arrow/compute/api_vector.cc +++ b/cpp/src/arrow/compute/api_vector.cc @@ -153,6 +153,8 @@ static auto kRankOptionsType = GetFunctionOptionsType( DataMember("tiebreaker", &RankOptions::tiebreaker)); static auto kPairwiseOptionsType = GetFunctionOptionsType( DataMember("periods", &PairwiseOptions::periods)); +static auto kListFlattenOptionsType = GetFunctionOptionsType( + DataMember("recursive", &ListFlattenOptions::recursive)); } // namespace } // namespace internal @@ -224,6 +226,10 @@ PairwiseOptions::PairwiseOptions(int64_t periods) : FunctionOptions(internal::kPairwiseOptionsType), periods(periods) {} constexpr char PairwiseOptions::kTypeName[]; +ListFlattenOptions::ListFlattenOptions(bool recursive) + : FunctionOptions(internal::kListFlattenOptionsType), recursive(recursive) {} +constexpr char ListFlattenOptions::kTypeName[]; + namespace internal { void 
RegisterVectorOptions(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunctionOptionsType(kFilterOptionsType)); @@ -237,6 +243,7 @@ void RegisterVectorOptions(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunctionOptionsType(kCumulativeOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kRankOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kPairwiseOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kListFlattenOptionsType)); } } // namespace internal diff --git a/cpp/src/arrow/compute/api_vector.h b/cpp/src/arrow/compute/api_vector.h index 919572f16ee69..e5bcc37329661 100644 --- a/cpp/src/arrow/compute/api_vector.h +++ b/cpp/src/arrow/compute/api_vector.h @@ -245,6 +245,18 @@ class ARROW_EXPORT PairwiseOptions : public FunctionOptions { int64_t periods = 1; }; +/// \brief Options for list_flatten function +class ARROW_EXPORT ListFlattenOptions : public FunctionOptions { + public: + explicit ListFlattenOptions(bool recursive = false); + static constexpr char const kTypeName[] = "ListFlattenOptions"; + static ListFlattenOptions Defaults() { return ListFlattenOptions(); } + + /// \brief If true, the list is flattened recursively until a non-list + /// array is formed. + bool recursive = false; +}; + /// @} /// \brief Filter with a boolean selection filter diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.cc b/cpp/src/arrow/compute/kernels/codegen_internal.cc index 00a833742f957..0fd9cae7a8d71 100644 --- a/cpp/src/arrow/compute/kernels/codegen_internal.cc +++ b/cpp/src/arrow/compute/kernels/codegen_internal.cc @@ -23,6 +23,7 @@ #include #include +#include "arrow/compute/api_vector.h" #include "arrow/type_fwd.h" namespace arrow { @@ -56,9 +57,23 @@ Result LastType(KernelContext*, const std::vector& types return types.back(); } -Result ListValuesType(KernelContext*, const std::vector& args) { - const auto& list_type = checked_cast(*args[0].type); - return list_type.value_type().get(); +Result ListValuesType(KernelContext* ctx, + const std::vector& args) { + auto list_type = checked_cast(args[0].type); + auto value_type = list_type->value_type().get(); + + auto recursive = + ctx->state() ? 
OptionsWrapper::Get(ctx).recursive : false; + if (!recursive) { + return value_type; + } + + for (auto value_kind = value_type->id(); + is_list(value_kind) || is_list_view(value_kind); value_kind = value_type->id()) { + list_type = checked_cast(list_type->value_type().get()); + value_type = list_type->value_type().get(); + } + return value_type; } void EnsureDictionaryDecoded(std::vector* types) { diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.h b/cpp/src/arrow/compute/kernels/codegen_internal.h index 097ee1de45b6a..9e46a21887f8c 100644 --- a/cpp/src/arrow/compute/kernels/codegen_internal.h +++ b/cpp/src/arrow/compute/kernels/codegen_internal.h @@ -423,7 +423,8 @@ static void VisitTwoArrayValuesInline(const ArraySpan& arr0, const ArraySpan& ar Result FirstType(KernelContext*, const std::vector& types); Result LastType(KernelContext*, const std::vector& types); -Result ListValuesType(KernelContext*, const std::vector& types); +Result ListValuesType(KernelContext* ctx, + const std::vector& types); // ---------------------------------------------------------------------- // Helpers for iterating over common DataType instances for adding kernels to diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc index 3a8352a9b870f..dc3fe29a3dfae 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_string.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_string.cc @@ -340,10 +340,15 @@ BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* ou if (input.offset == output->offset) { output->buffers[0] = input.GetBuffer(0); } else { - ARROW_ASSIGN_OR_RAISE( - output->buffers[0], - arrow::internal::CopyBitmap(ctx->memory_pool(), input.buffers[0].data, - input.offset, input.length)); + // When the offsets are different (e.g., due to slice operation), we need to check if + // the null bitmap buffer is not null before copying it. The null bitmap buffer can be + // null if the input array value does not contain any null value. 
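// [Illustrative sketch, not part of this patch] The guard added below only copies the
// validity bitmap of a sliced input when one is actually present: an input with no nulls
// carries no bitmap at all, so dereferencing it would crash.  A standalone sketch of that
// defensive pattern with illustrative names (MaybeCopyBitmap is not Arrow's API); bits are
// stored LSB-first as in Arrow validity bitmaps.
#include <cstdint>
#include <optional>
#include <vector>

std::optional<std::vector<uint8_t>> MaybeCopyBitmap(const uint8_t* bitmap,
                                                    int64_t offset, int64_t length) {
  if (bitmap == nullptr) return std::nullopt;  // all-valid input: nothing to copy
  std::vector<uint8_t> out((length + 7) / 8, 0);
  for (int64_t i = 0; i < length; ++i) {
    const int64_t j = offset + i;                       // position in the source bitmap
    const bool bit = (bitmap[j / 8] >> (j % 8)) & 1;    // read at the sliced offset
    if (bit) out[i / 8] |= static_cast<uint8_t>(1u << (i % 8));  // re-align to offset 0
  }
  return out;
}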
+ if (input.buffers[0].data != NULLPTR) { + ARROW_ASSIGN_OR_RAISE( + output->buffers[0], + arrow::internal::CopyBitmap(ctx->memory_pool(), input.buffers[0].data, + input.offset, input.length)); + } } // This buffer is preallocated diff --git a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc index af62b4da2caa5..a6d7f6097b59b 100644 --- a/cpp/src/arrow/compute/kernels/scalar_cast_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_cast_test.cc @@ -2196,6 +2196,33 @@ TEST(Cast, BinaryOrStringToFixedSizeBinary) { } } +TEST(Cast, FixedSizeBinaryToBinaryOrString) { + for (auto out_type : {utf8(), large_utf8(), binary(), large_binary()}) { + auto valid_input = ArrayFromJSON(fixed_size_binary(3), R"(["foo", null, "bar", + "baz", "quu"])"); + + CheckCast(valid_input, ArrayFromJSON(out_type, R"(["foo", null, "bar", "baz", + "quu"])")); + + auto empty_input = ArrayFromJSON(fixed_size_binary(3), "[]"); + CheckCast(empty_input, ArrayFromJSON(out_type, "[]")); + } +} + +TEST(Cast, FixedSizeBinaryToBinaryOrStringWithSlice) { + for (auto out_type : {utf8(), large_utf8(), binary(), large_binary()}) { + auto valid_input = ArrayFromJSON(fixed_size_binary(3), R"(["foo", null, "bar", + "baz", "quu"])"); + auto sliced = valid_input->Slice(1, 3); + CheckCast(sliced, ArrayFromJSON(out_type, R"([null, "bar", "baz"])")); + + auto valid_input_without_null = ArrayFromJSON(fixed_size_binary(3), R"(["foo", "bar", + "baz", "quu"])"); + auto sliced_without_null = valid_input_without_null->Slice(1, 3); + CheckCast(sliced_without_null, ArrayFromJSON(out_type, R"(["bar", "baz", "quu"])")); + } +} + TEST(Cast, IntToString) { for (auto string_type : {utf8(), large_utf8()}) { CheckCast(ArrayFromJSON(int8(), "[0, 1, 127, -128, null]"), diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else.cc b/cpp/src/arrow/compute/kernels/scalar_if_else.cc index ee181c053c053..6368ef525ff9c 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else.cc @@ -1309,9 +1309,10 @@ void AddFixedWidthIfElseKernel(const std::shared_ptr& scalar_fun } void AddNestedIfElseKernels(const std::shared_ptr& scalar_function) { - for (const auto type_id : {Type::LIST, Type::LARGE_LIST, Type::LIST_VIEW, - Type::LARGE_LIST_VIEW, Type::FIXED_SIZE_LIST, Type::STRUCT, - Type::DENSE_UNION, Type::SPARSE_UNION, Type::DICTIONARY}) { + for (const auto type_id : + {Type::LIST, Type::LARGE_LIST, Type::LIST_VIEW, Type::LARGE_LIST_VIEW, + Type::FIXED_SIZE_LIST, Type::MAP, Type::STRUCT, Type::DENSE_UNION, + Type::SPARSE_UNION, Type::DICTIONARY}) { ScalarKernel kernel({boolean(), InputType(type_id), InputType(type_id)}, LastType, NestedIfElseExec::Exec); kernel.null_handling = NullHandling::COMPUTED_NO_PREALLOCATE; @@ -1482,39 +1483,27 @@ Status ExecScalarCaseWhen(KernelContext* ctx, const ExecSpan& batch, ExecResult* result = temp.get(); } - // TODO(wesm): clean this up to have less duplication - if (out->is_array_data()) { - ArrayData* output = out->array_data().get(); - if (is_dictionary_type::value) { - const ExecValue& dict_from = has_result ? 
result : batch[1]; - if (dict_from.is_scalar()) { - output->dictionary = checked_cast(*dict_from.scalar) - .value.dictionary->data(); - } else { - output->dictionary = dict_from.array.ToArrayData()->dictionary; - } - } - CopyValues(result, /*in_offset=*/0, batch.length, - output->GetMutableValues(0, 0), - output->GetMutableValues(1, 0), output->offset); - } else { - // ArraySpan - ArraySpan* output = out->array_span_mutable(); - if (is_dictionary_type::value) { - const ExecValue& dict_from = has_result ? result : batch[1]; - output->child_data.resize(1); - if (dict_from.is_scalar()) { - output->child_data[0].SetMembers( - *checked_cast(*dict_from.scalar) - .value.dictionary->data()); - } else { - output->child_data[0] = dict_from.array; - } + // Only input types of non-fixed length (which cannot be pre-allocated) + // will save the output data in ArrayData. And make sure the FixedLength + // types must be output in ArraySpan. + static_assert(is_fixed_width(Type::type_id)); + DCHECK(out->is_array_span()); + + ArraySpan* output = out->array_span_mutable(); + if (is_dictionary_type::value) { + const ExecValue& dict_from = has_result ? result : batch[1]; + output->child_data.resize(1); + if (dict_from.is_scalar()) { + output->child_data[0].SetMembers( + *checked_cast(*dict_from.scalar) + .value.dictionary->data()); + } else { + output->child_data[0] = dict_from.array; } - CopyValues(result, /*in_offset=*/0, batch.length, - output->GetValues(0, 0), output->GetValues(1, 0), - output->offset); } + CopyValues(result, /*in_offset=*/0, batch.length, + output->GetValues(0, 0), output->GetValues(1, 0), + output->offset); return Status::OK(); } @@ -1847,6 +1836,48 @@ struct CaseWhenFunctor> { } }; +// TODO(GH-41453): a more efficient implementation for list-views is possible +template +struct CaseWhenFunctor> { + using offset_type = typename Type::offset_type; + using BuilderType = typename TypeTraits::BuilderType; + static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + /// TODO(wesm): should this be a DCHECK? 
Or checked elsewhere + if (batch[0].null_count() > 0) { + return Status::Invalid("cond struct must not have outer nulls"); + } + if (batch[0].is_scalar()) { + return ExecVarWidthScalarCaseWhen(ctx, batch, out); + } + return ExecArray(ctx, batch, out); + } + + static Status ExecArray(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + return ExecVarWidthArrayCaseWhen( + ctx, batch, out, + // ReserveData + [&](ArrayBuilder* raw_builder) { + auto builder = checked_cast(raw_builder); + auto child_builder = builder->value_builder(); + + int64_t reservation = 0; + for (int arg = 1; arg < batch.num_values(); arg++) { + const ExecValue& source = batch[arg]; + if (!source.is_array()) { + const auto& scalar = checked_cast(*source.scalar); + if (!scalar.value) continue; + reservation = + std::max(reservation, batch.length * scalar.value->length()); + } else { + const ArraySpan& array = source.array; + reservation = std::max(reservation, array.child_data[0].length); + } + } + return child_builder->Reserve(reservation); + }); + } +}; + // No-op reserve function, pulled out to avoid apparent miscompilation on MinGW Status ReserveNoData(ArrayBuilder*) { return Status::OK(); } @@ -2712,6 +2743,25 @@ void AddBinaryCaseWhenKernels(const std::shared_ptr& scalar_fu } } +template +void AddNestedCaseWhenKernel(const std::shared_ptr& scalar_function) { + AddCaseWhenKernel(scalar_function, ArrowNestedType::type_id, + CaseWhenFunctor::Exec); +} + +void AddNestedCaseWhenKernels(const std::shared_ptr& scalar_function) { + AddNestedCaseWhenKernel(scalar_function); + AddNestedCaseWhenKernel(scalar_function); + AddNestedCaseWhenKernel(scalar_function); + AddNestedCaseWhenKernel(scalar_function); + AddNestedCaseWhenKernel(scalar_function); + AddNestedCaseWhenKernel(scalar_function); + AddNestedCaseWhenKernel(scalar_function); + AddNestedCaseWhenKernel(scalar_function); + AddNestedCaseWhenKernel(scalar_function); + AddNestedCaseWhenKernel(scalar_function); +} + void AddCoalesceKernel(const std::shared_ptr& scalar_function, detail::GetTypeId get_id, ArrayKernelExec exec) { ScalarKernel kernel(KernelSignature::Make({InputType(get_id.id)}, FirstType, @@ -2731,6 +2781,25 @@ void AddPrimitiveCoalesceKernels(const std::shared_ptr& scalar_f } } +template +void AddNestedCoalesceKernel(const std::shared_ptr& scalar_function) { + AddCoalesceKernel(scalar_function, ArrowNestedType::type_id, + CoalesceFunctor::Exec); +} + +void AddNestedCoalesceKernels(const std::shared_ptr& scalar_function) { + AddNestedCoalesceKernel(scalar_function); + AddNestedCoalesceKernel(scalar_function); + AddNestedCoalesceKernel(scalar_function); + AddNestedCoalesceKernel(scalar_function); + AddNestedCoalesceKernel(scalar_function); + AddNestedCoalesceKernel(scalar_function); + AddNestedCoalesceKernel(scalar_function); + AddNestedCoalesceKernel(scalar_function); + AddNestedCoalesceKernel(scalar_function); + AddNestedCoalesceKernel(scalar_function); +} + void AddChooseKernel(const std::shared_ptr& scalar_function, detail::GetTypeId get_id, ArrayKernelExec exec) { ScalarKernel kernel(KernelSignature::Make({Type::INT64, InputType(get_id.id)}, LastType, @@ -2822,15 +2891,7 @@ void RegisterScalarIfElse(FunctionRegistry* registry) { AddCaseWhenKernel(func, Type::DECIMAL128, CaseWhenFunctor::Exec); AddCaseWhenKernel(func, Type::DECIMAL256, CaseWhenFunctor::Exec); AddBinaryCaseWhenKernels(func, BaseBinaryTypes()); - AddCaseWhenKernel(func, Type::FIXED_SIZE_LIST, - CaseWhenFunctor::Exec); - AddCaseWhenKernel(func, Type::LIST, CaseWhenFunctor::Exec); - 
AddCaseWhenKernel(func, Type::LARGE_LIST, CaseWhenFunctor::Exec); - AddCaseWhenKernel(func, Type::MAP, CaseWhenFunctor::Exec); - AddCaseWhenKernel(func, Type::STRUCT, CaseWhenFunctor::Exec); - AddCaseWhenKernel(func, Type::DENSE_UNION, CaseWhenFunctor::Exec); - AddCaseWhenKernel(func, Type::SPARSE_UNION, CaseWhenFunctor::Exec); - AddCaseWhenKernel(func, Type::DICTIONARY, CaseWhenFunctor::Exec); + AddNestedCaseWhenKernels(func); DCHECK_OK(registry->AddFunction(std::move(func))); } { @@ -2848,15 +2909,7 @@ void RegisterScalarIfElse(FunctionRegistry* registry) { for (const auto& ty : BaseBinaryTypes()) { AddCoalesceKernel(func, ty, GenerateTypeAgnosticVarBinaryBase(ty)); } - AddCoalesceKernel(func, Type::FIXED_SIZE_LIST, - CoalesceFunctor::Exec); - AddCoalesceKernel(func, Type::LIST, CoalesceFunctor::Exec); - AddCoalesceKernel(func, Type::LARGE_LIST, CoalesceFunctor::Exec); - AddCoalesceKernel(func, Type::MAP, CoalesceFunctor::Exec); - AddCoalesceKernel(func, Type::STRUCT, CoalesceFunctor::Exec); - AddCoalesceKernel(func, Type::DENSE_UNION, CoalesceFunctor::Exec); - AddCoalesceKernel(func, Type::SPARSE_UNION, CoalesceFunctor::Exec); - AddCoalesceKernel(func, Type::DICTIONARY, CoalesceFunctor::Exec); + AddNestedCoalesceKernels(func); DCHECK_OK(registry->AddFunction(std::move(func))); } { diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc b/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc index 58bc560f52842..5988908853d50 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else_benchmark.cc @@ -284,8 +284,11 @@ static void CaseWhenBench(benchmark::State& state) { state.SetItemsProcessed(state.iterations() * (len - offset)); } -static void CaseWhenBenchList(benchmark::State& state) { - auto type = list(int64()); +template +static void CaseWhenBenchList(benchmark::State& state, + const std::shared_ptr& type) { + using ArrayType = typename TypeTraits::ArrayType; + auto fld = field("", type); int64_t len = state.range(0); @@ -295,17 +298,17 @@ static void CaseWhenBenchList(benchmark::State& state) { auto cond_field = field("cond", boolean(), key_value_metadata({{"null_probability", "0.01"}})); - auto cond = rand.ArrayOf(*field("", struct_({cond_field, cond_field, cond_field}), - key_value_metadata({{"null_probability", "0.0"}})), - len); - auto val1 = rand.ArrayOf(*fld, len); - auto val2 = rand.ArrayOf(*fld, len); - auto val3 = rand.ArrayOf(*fld, len); - auto val4 = rand.ArrayOf(*fld, len); + auto cond = std::static_pointer_cast( + rand.ArrayOf(*field("", struct_({cond_field, cond_field, cond_field}), + key_value_metadata({{"null_probability", "0.0"}})), + len)) + ->Slice(offset); + auto val1 = std::static_pointer_cast(rand.ArrayOf(*fld, len))->Slice(offset); + auto val2 = std::static_pointer_cast(rand.ArrayOf(*fld, len))->Slice(offset); + auto val3 = std::static_pointer_cast(rand.ArrayOf(*fld, len))->Slice(offset); + auto val4 = std::static_pointer_cast(rand.ArrayOf(*fld, len))->Slice(offset); for (auto _ : state) { - ABORT_NOT_OK( - CaseWhen(cond->Slice(offset), {val1->Slice(offset), val2->Slice(offset), - val3->Slice(offset), val4->Slice(offset)})); + ABORT_NOT_OK(CaseWhen(cond, {val1, val2, val3, val4})); } // Set bytes processed to ~length of output @@ -372,6 +375,21 @@ static void CaseWhenBenchStringContiguous(benchmark::State& state) { return CaseWhenBenchContiguous(state); } +template +static void CaseWhenBenchVarLengthListLike(benchmark::State& state) { + auto value_type = 
TypeTraits::type_singleton(); + auto list_type = std::make_shared(value_type); + return CaseWhenBenchList(state, list_type); +} + +static void CaseWhenBenchListInt64(benchmark::State& state) { + return CaseWhenBenchVarLengthListLike(state); +} + +static void CaseWhenBenchListViewInt64(benchmark::State& state) { + CaseWhenBenchVarLengthListLike(state); +} + struct CoalesceParams { int64_t length; int64_t num_arguments; @@ -533,9 +551,11 @@ BENCHMARK(CaseWhenBench64)->Args({kNumItems, 99}); BENCHMARK(CaseWhenBench64Contiguous)->Args({kNumItems, 0}); BENCHMARK(CaseWhenBench64Contiguous)->Args({kNumItems, 99}); -// CaseWhen: Lists -BENCHMARK(CaseWhenBenchList)->Args({kFewItems, 0}); -BENCHMARK(CaseWhenBenchList)->Args({kFewItems, 99}); +// CaseWhen: List-like types +BENCHMARK(CaseWhenBenchListInt64)->Args({kFewItems, 0}); +BENCHMARK(CaseWhenBenchListInt64)->Args({kFewItems, 99}); +BENCHMARK(CaseWhenBenchListViewInt64)->Args({kFewItems, 0}); +BENCHMARK(CaseWhenBenchListViewInt64)->Args({kFewItems, 99}); // CaseWhen: Strings BENCHMARK(CaseWhenBenchString)->Args({kFewItems, 0}); diff --git a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc index c4c46b5efe84d..9a0ca325277dc 100644 --- a/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_if_else_test.cc @@ -896,6 +896,21 @@ TEST_F(TestIfElseKernel, ParameterizedTypes) { {cond, ArrayFromJSON(type0, "[0]"), ArrayFromJSON(type1, "[1]")})); } +TEST_F(TestIfElseKernel, MapNested) { + auto type = map(int64(), utf8()); + CheckWithDifferentShapes( + ArrayFromJSON(boolean(), "[true, true, false, false]"), + ArrayFromJSON(type, R"([null, [[2, "foo"], [4, null]], [[3, "test"]], []])"), + ArrayFromJSON(type, R"([[[1, "b"]], [[2, "c"]], [[7, "abc"]], null])"), + ArrayFromJSON(type, R"([null, [[2, "foo"], [4, null]], [[7, "abc"]], null])")); + + CheckWithDifferentShapes( + ArrayFromJSON(boolean(), "[null, null, null, null]"), + ArrayFromJSON(type, R"([null, [[1, "c"]], [[4, null]], [[6, "ok"]]])"), + ArrayFromJSON(type, R"([[[-1, null]], [[3, "c"]], null, [[6, "ok"]]])"), + ArrayFromJSON(type, R"([null, null, null, null])")); +} + template class TestIfElseUnion : public ::testing::Test {}; @@ -1920,7 +1935,7 @@ TYPED_TEST(TestCaseWhenBinary, Random) { template class TestCaseWhenList : public ::testing::Test {}; -TYPED_TEST_SUITE(TestCaseWhenList, ListArrowTypes); +TYPED_TEST_SUITE(TestCaseWhenList, ListAndListViewArrowTypes); TYPED_TEST(TestCaseWhenList, ListOfString) { auto type = std::make_shared(utf8()); @@ -2555,7 +2570,7 @@ class TestCoalesceList : public ::testing::Test {}; TYPED_TEST_SUITE(TestCoalesceNumeric, IfElseNumericBasedTypes); TYPED_TEST_SUITE(TestCoalesceBinary, BaseBinaryArrowTypes); -TYPED_TEST_SUITE(TestCoalesceList, ListArrowTypes); +TYPED_TEST_SUITE(TestCoalesceList, ListAndListViewArrowTypes); TYPED_TEST(TestCoalesceNumeric, Basics) { auto type = default_type_instance(); diff --git a/cpp/src/arrow/compute/kernels/scalar_nested.cc b/cpp/src/arrow/compute/kernels/scalar_nested.cc index 733ab9c0dc287..b99f065a0b158 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested.cc @@ -23,6 +23,7 @@ #include "arrow/compute/api_scalar.h" #include "arrow/compute/kernels/common_internal.h" #include "arrow/result.h" +#include "arrow/type_fwd.h" #include "arrow/util/bit_block_counter.h" #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_generate.h" @@ -41,10 +42,17 @@ Status 
ListValueLength(KernelContext* ctx, const ExecSpan& batch, ExecResult* ou const ArraySpan& arr = batch[0].array; ArraySpan* out_arr = out->array_span_mutable(); auto out_values = out_arr->GetValues(1); - const offset_type* offsets = arr.GetValues(1); - // Offsets are always well-defined and monotonic, even for null values - for (int64_t i = 0; i < arr.length; ++i) { - *out_values++ = offsets[i + 1] - offsets[i]; + if (is_list_view(*arr.type)) { + const auto* sizes = arr.GetValues(2); + if (arr.length > 0) { + memcpy(out_values, sizes, arr.length * sizeof(offset_type)); + } + } else { + const offset_type* offsets = arr.GetValues(1); + // Offsets are always well-defined and monotonic, even for null values + for (int64_t i = 0; i < arr.length; ++i) { + *out_values++ = offsets[i + 1] - offsets[i]; + } } return Status::OK(); } @@ -59,6 +67,30 @@ Status FixedSizeListValueLength(KernelContext* ctx, const ExecSpan& batch, return Status::OK(); } +template +void AddListValueLengthKernel(ScalarFunction* func, + const std::shared_ptr& out_type) { + auto in_type = {InputType(InListType::type_id)}; + ScalarKernel kernel(in_type, out_type, ListValueLength); + DCHECK_OK(func->AddKernel(std::move(kernel))); +} + +template <> +void AddListValueLengthKernel( + ScalarFunction* func, const std::shared_ptr& out_type) { + auto in_type = {InputType(Type::FIXED_SIZE_LIST)}; + ScalarKernel kernel(in_type, out_type, FixedSizeListValueLength); + DCHECK_OK(func->AddKernel(std::move(kernel))); +} + +void AddListValueLengthKernels(ScalarFunction* func) { + AddListValueLengthKernel(func, int32()); + AddListValueLengthKernel(func, int64()); + AddListValueLengthKernel(func, int32()); + AddListValueLengthKernel(func, int64()); + AddListValueLengthKernel(func, int32()); +} + const FunctionDoc list_value_length_doc{ "Compute list lengths", ("`lists` must have a list-like type.\n" @@ -399,6 +431,8 @@ void AddListElementKernels(ScalarFunction* func) { void AddListElementKernels(ScalarFunction* func) { AddListElementKernels(func); AddListElementKernels(func); + AddListElementKernels(func); + AddListElementKernels(func); AddListElementKernels(func); } @@ -824,12 +858,7 @@ const FunctionDoc map_lookup_doc{ void RegisterScalarNested(FunctionRegistry* registry) { auto list_value_length = std::make_shared( "list_value_length", Arity::Unary(), list_value_length_doc); - DCHECK_OK(list_value_length->AddKernel({InputType(Type::LIST)}, int32(), - ListValueLength)); - DCHECK_OK(list_value_length->AddKernel({InputType(Type::FIXED_SIZE_LIST)}, int32(), - FixedSizeListValueLength)); - DCHECK_OK(list_value_length->AddKernel({InputType(Type::LARGE_LIST)}, int64(), - ListValueLength)); + AddListValueLengthKernels(list_value_length.get()); DCHECK_OK(registry->AddFunction(std::move(list_value_length))); auto list_element = diff --git a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc index a72ec99620b82..32bea8246954d 100644 --- a/cpp/src/arrow/compute/kernels/scalar_nested_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_nested_test.cc @@ -30,11 +30,21 @@ namespace arrow { namespace compute { static std::shared_ptr GetOffsetType(const DataType& type) { - return type.id() == Type::LIST ? 
int32() : int64(); + switch (type.id()) { + case Type::LIST: + case Type::LIST_VIEW: + return int32(); + case Type::LARGE_LIST: + case Type::LARGE_LIST_VIEW: + return int64(); + default: + Unreachable("Unexpected type"); + } } TEST(TestScalarNested, ListValueLength) { - for (auto ty : {list(int32()), large_list(int32())}) { + for (auto ty : {list(int32()), large_list(int32()), list_view(int32()), + large_list_view(int32())}) { CheckScalarUnary("list_value_length", ty, "[[0, null, 1], null, [2, 3], []]", GetOffsetType(*ty), "[3, null, 2, 0]"); } @@ -47,7 +57,8 @@ TEST(TestScalarNested, ListValueLength) { TEST(TestScalarNested, ListElementNonFixedListWithNulls) { auto sample = "[[7, 5, 81], [6, null, 4, 7, 8], [3, 12, 2, 0], [1, 9], null]"; for (auto ty : NumericTypes()) { - for (auto list_type : {list(ty), large_list(ty)}) { + for (auto list_type : + {list(ty), large_list(ty), list_view(ty), large_list_view(ty)}) { auto input = ArrayFromJSON(list_type, sample); auto null_input = ArrayFromJSON(list_type, "[null]"); for (auto index_type : IntTypes()) { diff --git a/cpp/src/arrow/compute/kernels/vector_nested.cc b/cpp/src/arrow/compute/kernels/vector_nested.cc index 08930e589f7b4..8c77c261c6a98 100644 --- a/cpp/src/arrow/compute/kernels/vector_nested.cc +++ b/cpp/src/arrow/compute/kernels/vector_nested.cc @@ -18,6 +18,7 @@ // Vector kernels involving nested types #include "arrow/array/array_base.h" +#include "arrow/compute/api_vector.h" #include "arrow/compute/kernels/common_internal.h" #include "arrow/result.h" #include "arrow/visit_type_inline.h" @@ -29,8 +30,13 @@ namespace { template Status ListFlatten(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + auto recursive = OptionsWrapper::Get(ctx).recursive; typename TypeTraits::ArrayType list_array(batch[0].array.ToArrayData()); - ARROW_ASSIGN_OR_RAISE(auto result, list_array.Flatten(ctx->memory_pool())); + + auto pool = ctx->memory_pool(); + ARROW_ASSIGN_OR_RAISE(auto result, (recursive ? list_array.FlattenRecursively(pool) + : list_array.Flatten(pool))); + out->value = std::move(result->data()); return Status::OK(); } @@ -107,10 +113,15 @@ struct ListParentIndicesArray { const FunctionDoc list_flatten_doc( "Flatten list values", - ("`lists` must have a list-like type.\n" - "Return an array with the top list level flattened.\n" - "Top-level null values in `lists` do not emit anything in the input."), - {"lists"}); + ("`lists` must have a list-like type (lists, list-views, and\n" + "fixed-size lists).\n" + "Return an array with the top list level flattened unless\n" + "`recursive` is set to true in ListFlattenOptions. 
When that\n" + "is that case, flattening happens recursively until a non-list\n" + "array is formed.\n" + "\n" + "Null list values do not emit anything to the output."), + {"lists"}, "ListFlattenOptions"); const FunctionDoc list_parent_indices_doc( "Compute parent indices of nested list values", @@ -153,17 +164,34 @@ class ListParentIndicesFunction : public MetaFunction { } }; +const ListFlattenOptions* GetDefaultListFlattenOptions() { + static const auto kDefaultListFlattenOptions = ListFlattenOptions::Defaults(); + return &kDefaultListFlattenOptions; +} + +template +void AddBaseListFlattenKernels(VectorFunction* func) { + auto in_type = {InputType(InListType::type_id)}; + auto out_type = OutputType(ListValuesType); + VectorKernel kernel(in_type, out_type, ListFlatten, + OptionsWrapper::Init); + DCHECK_OK(func->AddKernel(std::move(kernel))); +} + +void AddBaseListFlattenKernels(VectorFunction* func) { + AddBaseListFlattenKernels(func); + AddBaseListFlattenKernels(func); + AddBaseListFlattenKernels(func); + AddBaseListFlattenKernels(func); + AddBaseListFlattenKernels(func); +} + } // namespace void RegisterVectorNested(FunctionRegistry* registry) { - auto flatten = - std::make_shared("list_flatten", Arity::Unary(), list_flatten_doc); - DCHECK_OK(flatten->AddKernel({Type::LIST}, OutputType(ListValuesType), - ListFlatten)); - DCHECK_OK(flatten->AddKernel({Type::FIXED_SIZE_LIST}, OutputType(ListValuesType), - ListFlatten)); - DCHECK_OK(flatten->AddKernel({Type::LARGE_LIST}, OutputType(ListValuesType), - ListFlatten)); + auto flatten = std::make_shared( + "list_flatten", Arity::Unary(), list_flatten_doc, GetDefaultListFlattenOptions()); + AddBaseListFlattenKernels(flatten.get()); DCHECK_OK(registry->AddFunction(std::move(flatten))); DCHECK_OK(registry->AddFunction(std::make_shared())); diff --git a/cpp/src/arrow/compute/kernels/vector_nested_test.cc b/cpp/src/arrow/compute/kernels/vector_nested_test.cc index eef1b6835ffb5..56604ebd16cc0 100644 --- a/cpp/src/arrow/compute/kernels/vector_nested_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_nested_test.cc @@ -19,6 +19,7 @@ #include "arrow/chunked_array.h" #include "arrow/compute/api.h" +#include "arrow/compute/api_vector.h" #include "arrow/compute/kernels/test_util.h" #include "arrow/result.h" #include "arrow/testing/gtest_util.h" @@ -29,38 +30,113 @@ namespace compute { using arrow::internal::checked_cast; -TEST(TestVectorNested, ListFlatten) { - for (auto ty : {list(int16()), large_list(int16())}) { - auto input = ArrayFromJSON(ty, "[[0, null, 1], null, [2, 3], []]"); - auto expected = ArrayFromJSON(int16(), "[0, null, 1, 2, 3]"); +using ListAndListViewTypes = + ::testing::Types; + +// ---------------------------------------------------------------------- +// [Large]List and [Large]ListView tests +template +class TestVectorNestedSpecialized : public ::testing::Test { + public: + using TypeClass = T; + + void SetUp() override { + value_type_ = int16(); + type_ = std::make_shared(value_type_); + } + + public: + void TestListFlatten() { + auto input = ArrayFromJSON(type_, "[[0, null, 1], null, [2, 3], []]"); + auto expected = ArrayFromJSON(value_type_, "[0, null, 1, 2, 3]"); CheckVectorUnary("list_flatten", input, expected); // Construct a list with a non-empty null slot auto tweaked = TweakValidityBit(input, 0, false); - expected = ArrayFromJSON(int16(), "[2, 3]"); + expected = ArrayFromJSON(value_type_, "[2, 3]"); CheckVectorUnary("list_flatten", tweaked, expected); } -} -TEST(TestVectorNested, ListFlattenNulls) { - const auto ty = 
list(int32()); - auto input = ArrayFromJSON(ty, "[null, null]"); - auto expected = ArrayFromJSON(int32(), "[]"); - CheckVectorUnary("list_flatten", input, expected); -} + void TestListFlattenNulls() { + value_type_ = int32(); + type_ = std::make_shared(value_type_); + auto input = ArrayFromJSON(type_, "[null, null]"); + auto expected = ArrayFromJSON(value_type_, "[]"); + CheckVectorUnary("list_flatten", input, expected); + } -TEST(TestVectorNested, ListFlattenChunkedArray) { - for (auto ty : {list(int16()), large_list(int16())}) { - ARROW_SCOPED_TRACE(ty->ToString()); - auto input = ChunkedArrayFromJSON(ty, {"[[0, null, 1], null]", "[[2, 3], []]"}); - auto expected = ChunkedArrayFromJSON(int16(), {"[0, null, 1]", "[2, 3]"}); + void TestListFlattenChunkedArray() { + ARROW_SCOPED_TRACE(type_->ToString()); + auto input = ChunkedArrayFromJSON(type_, {"[[0, null, 1], null]", "[[2, 3], []]"}); + auto expected = ChunkedArrayFromJSON(value_type_, {"[0, null, 1]", "[2, 3]"}); CheckVectorUnary("list_flatten", input, expected); ARROW_SCOPED_TRACE("empty"); - input = ChunkedArrayFromJSON(ty, {}); - expected = ChunkedArrayFromJSON(int16(), {}); + input = ChunkedArrayFromJSON(type_, {}); + expected = ChunkedArrayFromJSON(value_type_, {}); CheckVectorUnary("list_flatten", input, expected); } + + void TestListFlattenRecursively() { + auto inner_type = std::make_shared(value_type_); + type_ = std::make_shared(inner_type); + + ListFlattenOptions opts; + opts.recursive = true; + + // List types with two nesting levels: list> + auto input = ArrayFromJSON(type_, R"([ + [[0, 1, 2], null, [3, null]], + [null], + [[2, 9], [4], [], [6, 5]] + ])"); + auto expected = ArrayFromJSON(value_type_, "[0, 1, 2, 3, null, 2, 9, 4, 6, 5]"); + CheckVectorUnary("list_flatten", input, expected, &opts); + + // Empty nested list should flatten until non-list type is reached + input = ArrayFromJSON(type_, R"([null])"); + expected = ArrayFromJSON(value_type_, "[]"); + CheckVectorUnary("list_flatten", input, expected, &opts); + + // List types with three nesting levels: list>> + type_ = std::make_shared(std::make_shared(fixed_size_list(value_type_, 2))); + input = ArrayFromJSON(type_, R"([ + [ + [[null, 0]], + [[3, 7], null] + ], + [ + [[4, null], [5, 8]], + [[8, null]], + null + ], + [ + null + ] + ])"); + expected = ArrayFromJSON(value_type_, "[null, 0, 3, 7, 4, null, 5, 8, 8, null]"); + CheckVectorUnary("list_flatten", input, expected, &opts); + } + + protected: + std::shared_ptr type_; + std::shared_ptr value_type_; +}; + +TYPED_TEST_SUITE(TestVectorNestedSpecialized, ListAndListViewTypes); + +TYPED_TEST(TestVectorNestedSpecialized, ListFlatten) { this->TestListFlatten(); } + +TYPED_TEST(TestVectorNestedSpecialized, ListFlattenNulls) { + this->TestListFlattenNulls(); +} + +TYPED_TEST(TestVectorNestedSpecialized, ListFlattenChunkedArray) { + this->TestListFlattenChunkedArray(); +} + +TYPED_TEST(TestVectorNestedSpecialized, ListFlattenRecursively) { + this->TestListFlattenRecursively(); } TEST(TestVectorNested, ListFlattenFixedSizeList) { @@ -92,6 +168,21 @@ TEST(TestVectorNested, ListFlattenFixedSizeListNulls) { CheckVectorUnary("list_flatten", input, expected); } +TEST(TestVectorNested, ListFlattenFixedSizeListRecursively) { + ListFlattenOptions opts; + opts.recursive = true; + + auto inner_type = fixed_size_list(int32(), 2); + auto type = fixed_size_list(inner_type, 2); + auto input = ArrayFromJSON(type, R"([ + [[0, 1], [null, 3]], + [[7, null], [2, 5]], + [null, null] + ])"); + auto expected = ArrayFromJSON(int32(), "[0, 1, 
null, 3, 7, null, 2, 5]"); + CheckVectorUnary("list_flatten", input, expected, &opts); +} + TEST(TestVectorNested, ListParentIndices) { for (auto ty : {list(int16()), large_list(int16())}) { auto input = ArrayFromJSON(ty, "[[0, null, 1], null, [2, 3], [], [4, 5]]"); diff --git a/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc b/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc index 8825d697fdf77..5e24331fe96f2 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc @@ -22,6 +22,7 @@ #include #include +#include "arrow/array/concatenate.h" #include "arrow/array/data.h" #include "arrow/buffer_builder.h" #include "arrow/chunked_array.h" @@ -40,6 +41,7 @@ #include "arrow/util/bit_run_reader.h" #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_ops.h" +#include "arrow/util/fixed_width_internal.h" namespace arrow { @@ -158,9 +160,11 @@ class PrimitiveFilterImpl { PrimitiveFilterImpl(const ArraySpan& values, const ArraySpan& filter, FilterOptions::NullSelectionBehavior null_selection, ArrayData* out_arr) - : byte_width_(values.type->byte_width()), + : byte_width_(util::FixedWidthInBytes(*values.type)), values_is_valid_(values.buffers[0].data), - values_data_(values.buffers[1].data), + // No offset applied for boolean because it's a bitmap + values_data_(kIsBoolean ? values.buffers[1].data + : util::OffsetPointerOfFixedByteWidthValues(values)), values_null_count_(values.null_count), values_offset_(values.offset), values_length_(values.length), @@ -169,17 +173,13 @@ class PrimitiveFilterImpl { if constexpr (kByteWidth >= 0 && !kIsBoolean) { DCHECK_EQ(kByteWidth, byte_width_); } - if constexpr (!kIsBoolean) { - // No offset applied for boolean because it's a bitmap - values_data_ += values.offset * byte_width(); - } + DCHECK_EQ(out_arr->offset, 0); if (out_arr->buffers[0] != nullptr) { // May be unallocated if neither filter nor values contain nulls out_is_valid_ = out_arr->buffers[0]->mutable_data(); } - out_data_ = out_arr->buffers[1]->mutable_data(); - DCHECK_EQ(out_arr->offset, 0); + out_data_ = util::MutableFixedWidthValuesPointer(out_arr); out_length_ = out_arr->length; out_position_ = 0; } @@ -416,7 +416,7 @@ class PrimitiveFilterImpl { out_position_ += length; } - constexpr int32_t byte_width() const { + constexpr int64_t byte_width() const { if constexpr (kByteWidth >= 0) { return kByteWidth; } else { @@ -425,7 +425,7 @@ class PrimitiveFilterImpl { } private: - int32_t byte_width_; + int64_t byte_width_; const uint8_t* values_is_valid_; const uint8_t* values_data_; int64_t values_null_count_; @@ -439,6 +439,8 @@ class PrimitiveFilterImpl { int64_t out_position_; }; +} // namespace + Status PrimitiveFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { const ArraySpan& values = batch[0].array; const ArraySpan& filter = batch[1].array; @@ -468,9 +470,10 @@ Status PrimitiveFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult // validity bitmap. 
const bool allocate_validity = values.null_count != 0 || !filter_null_count_is_zero; - const int bit_width = values.type->bit_width(); - RETURN_NOT_OK(PreallocatePrimitiveArrayData(ctx, output_length, bit_width, - allocate_validity, out_arr)); + DCHECK(util::IsFixedWidthLike(values)); + const int64_t bit_width = util::FixedWidthInBits(*values.type); + RETURN_NOT_OK(util::internal::PreallocateFixedWidthArrayData( + ctx, output_length, /*source=*/values, allocate_validity, out_arr)); switch (bit_width) { case 1: @@ -505,6 +508,8 @@ Status PrimitiveFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult return Status::OK(); } +namespace { + // ---------------------------------------------------------------------- // Optimized filter for base binary types (32-bit and 64-bit) @@ -924,12 +929,26 @@ Result> FilterRecordBatch(const RecordBatch& batch, return Status::Invalid("Filter inputs must all be the same length"); } - // Convert filter to selection vector/indices and use Take + // Fetch filter const auto& filter_opts = *static_cast(options); - ARROW_ASSIGN_OR_RAISE( - std::shared_ptr indices, - GetTakeIndices(*filter.array(), filter_opts.null_selection_behavior, - ctx->memory_pool())); + ArrayData filter_array; + switch (filter.kind()) { + case Datum::ARRAY: + filter_array = *filter.array(); + break; + case Datum::CHUNKED_ARRAY: { + ARROW_ASSIGN_OR_RAISE(auto combined, Concatenate(filter.chunked_array()->chunks())); + filter_array = *combined->data(); + break; + } + default: + return Status::TypeError("Filter should be array-like"); + } + + // Convert filter to selection vector/indices and use Take + ARROW_ASSIGN_OR_RAISE(std::shared_ptr indices, + GetTakeIndices(filter_array, filter_opts.null_selection_behavior, + ctx->memory_pool())); std::vector> columns(batch.num_columns()); for (int i = 0; i < batch.num_columns(); ++i) { ARROW_ASSIGN_OR_RAISE(Datum out, Take(batch.column(i)->data(), Datum(indices), @@ -1038,7 +1057,6 @@ class FilterMetaFunction : public MetaFunction { } if (args[0].kind() == Datum::RECORD_BATCH) { - auto values_batch = args[0].record_batch(); ARROW_ASSIGN_OR_RAISE( std::shared_ptr out_batch, FilterRecordBatch(*args[0].record_batch(), args[1], options, ctx)); diff --git a/cpp/src/arrow/compute/kernels/vector_selection_internal.cc b/cpp/src/arrow/compute/kernels/vector_selection_internal.cc index a0fe2808e3e4e..2ba660e49ac38 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_internal.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection_internal.cc @@ -37,6 +37,7 @@ #include "arrow/util/bit_block_counter.h" #include "arrow/util/bit_run_reader.h" #include "arrow/util/bit_util.h" +#include "arrow/util/fixed_width_internal.h" #include "arrow/util/int_util.h" #include "arrow/util/logging.h" #include "arrow/util/ree_util.h" @@ -65,24 +66,6 @@ void RegisterSelectionFunction(const std::string& name, FunctionDoc doc, DCHECK_OK(registry->AddFunction(std::move(func))); } -Status PreallocatePrimitiveArrayData(KernelContext* ctx, int64_t length, int bit_width, - bool allocate_validity, ArrayData* out) { - // Preallocate memory - out->length = length; - out->buffers.resize(2); - - if (allocate_validity) { - ARROW_ASSIGN_OR_RAISE(out->buffers[0], ctx->AllocateBitmap(length)); - } - if (bit_width == 1) { - ARROW_ASSIGN_OR_RAISE(out->buffers[1], ctx->AllocateBitmap(length)); - } else { - ARROW_ASSIGN_OR_RAISE(out->buffers[1], - ctx->Allocate(bit_util::BytesForBits(length * bit_width))); - } - return Status::OK(); -} - namespace { /// \brief Iterate over a REE filter, 
emitting ranges of a plain values array that @@ -909,6 +892,20 @@ Status LargeListFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult } Status FSLFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + const ArraySpan& values = batch[0].array; + + // If a FixedSizeList wraps a fixed-width type we can, in some cases, use + // PrimitiveFilterExec for a fixed-size list array. + if (util::IsFixedWidthLike(values, + /*force_null_count=*/true, + /*exclude_bool_and_dictionary=*/true)) { + const auto byte_width = util::FixedWidthInBytes(*values.type); + // 0 is a valid byte width for FixedSizeList, but PrimitiveFilterExec + // might not handle it correctly. + if (byte_width > 0) { + return PrimitiveFilterExec(ctx, batch, out); + } + } return FilterExec(ctx, batch, out); } @@ -968,6 +965,29 @@ Status LargeListTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* } Status FSLTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + const ArraySpan& values = batch[0].array; + + // If a FixedSizeList wraps a fixed-width type we can, in some cases, use + // PrimitiveTakeExec for a fixed-size list array. + if (util::IsFixedWidthLike(values, + /*force_null_count=*/true, + /*exclude_bool_and_dictionary=*/true)) { + const auto byte_width = util::FixedWidthInBytes(*values.type); + // Additionally, PrimitiveTakeExec is only implemented for specific byte widths. + // TODO(GH-41301): Extend PrimitiveTakeExec for any fixed-width type. + switch (byte_width) { + case 1: + case 2: + case 4: + case 8: + case 16: + case 32: + return PrimitiveTakeExec(ctx, batch, out); + default: + break; // fallback to TakeExec + } + } + return TakeExec(ctx, batch, out); } diff --git a/cpp/src/arrow/compute/kernels/vector_selection_internal.h b/cpp/src/arrow/compute/kernels/vector_selection_internal.h index 95f3e51cd67e3..a169f4b38a2b8 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_internal.h +++ b/cpp/src/arrow/compute/kernels/vector_selection_internal.h @@ -45,12 +45,6 @@ void RegisterSelectionFunction(const std::string& name, FunctionDoc doc, const FunctionOptions* default_options, FunctionRegistry* registry); -/// \brief Allocate an ArrayData for a primitive array with a given length and bit width -/// -/// \param[in] bit_width 1 or a multiple of 8 -Status PreallocatePrimitiveArrayData(KernelContext* ctx, int64_t length, int bit_width, - bool allocate_validity, ArrayData* out); - /// \brief Callback type for VisitPlainxREEFilterOutputSegments. /// /// position is the logical position in the values array relative to its offset. 
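The FSLFilterExec/FSLTakeExec changes above rest on the observation that a fixed-size list of a fixed-width type is itself a fixed-width value, so it can be routed through the primitive filter/take kernels. A minimal standalone sketch of that byte-width computation follows; it is not Arrow's actual util::FixedWidthInBytes helper, just an illustration of the idea, and the function name is hypothetical.

#include <cstdint>
#include <vector>

// Hypothetical helper: flat byte width of a value nested as
// fixed_size_list(...fixed_size_list(innermost, s1)..., sN).
// Returns -1 if the innermost width is unknown, signalling that the
// generic (non-primitive) filter/take path must be used instead.
int64_t FlatFixedWidthBytes(int64_t innermost_value_bytes,
                            const std::vector<int32_t>& list_sizes) {
  if (innermost_value_bytes < 0) return -1;
  int64_t width = innermost_value_bytes;
  for (int32_t size : list_sizes) {
    width *= size;  // every fixed-size nesting level multiplies the width
  }
  return width;  // e.g. fixed_size_list(fixed_size_list(int32, 2), 3) -> 24
}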
@@ -70,6 +64,7 @@ void VisitPlainxREEFilterOutputSegments( FilterOptions::NullSelectionBehavior null_selection, const EmitREEFilterSegment& emit_segment); +Status PrimitiveFilterExec(KernelContext*, const ExecSpan&, ExecResult*); Status ListFilterExec(KernelContext*, const ExecSpan&, ExecResult*); Status LargeListFilterExec(KernelContext*, const ExecSpan&, ExecResult*); Status FSLFilterExec(KernelContext*, const ExecSpan&, ExecResult*); diff --git a/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc b/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc index 5cd3710828485..1a9af0efcd700 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc @@ -37,6 +37,7 @@ #include "arrow/util/bit_block_counter.h" #include "arrow/util/bit_run_reader.h" #include "arrow/util/bit_util.h" +#include "arrow/util/fixed_width_internal.h" #include "arrow/util/int_util.h" #include "arrow/util/ree_util.h" @@ -323,7 +324,7 @@ namespace { using TakeState = OptionsWrapper; // ---------------------------------------------------------------------- -// Implement optimized take for primitive types from boolean to 1/2/4/8-byte +// Implement optimized take for primitive types from boolean to 1/2/4/8/16/32-byte // C-type based types. Use common implementation for every byte width and only // generate code for unsigned integer indices, since after boundschecking to // check for negative numbers in the indices we can safely reinterpret_cast @@ -333,16 +334,20 @@ using TakeState = OptionsWrapper; /// use the logical Arrow type but rather the physical C type. This way we /// only generate one take function for each byte width. /// -/// This function assumes that the indices have been boundschecked. +/// Also note that this function can also handle fixed-size-list arrays if +/// they fit the criteria described in fixed_width_internal.h, so use the +/// function defined in that file to access values and destination pointers +/// and DO NOT ASSUME `values.type()` is a primitive type. +/// +/// \pre the indices have been boundschecked template struct PrimitiveTakeImpl { static constexpr int kValueWidth = ValueWidthConstant::value; static void Exec(const ArraySpan& values, const ArraySpan& indices, ArrayData* out_arr) { - DCHECK_EQ(values.type->byte_width(), kValueWidth); - const auto* values_data = - values.GetValues(1, 0) + kValueWidth * values.offset; + DCHECK_EQ(util::FixedWidthInBytes(*values.type), kValueWidth); + const auto* values_data = util::OffsetPointerOfFixedByteWidthValues(values); const uint8_t* values_is_valid = values.buffers[0].data; auto values_offset = values.offset; @@ -350,16 +355,15 @@ struct PrimitiveTakeImpl { const uint8_t* indices_is_valid = indices.buffers[0].data; auto indices_offset = indices.offset; - auto out = out_arr->GetMutableValues(1, 0) + kValueWidth * out_arr->offset; + DCHECK_EQ(out_arr->offset, 0); + auto* out = util::MutableFixedWidthValuesPointer(out_arr); auto out_is_valid = out_arr->buffers[0]->mutable_data(); - auto out_offset = out_arr->offset; - DCHECK_EQ(out_offset, 0); // If either the values or indices have nulls, we preemptively zero out the // out validity bitmap so that we don't have to use ClearBit in each // iteration for nulls. 
if (values.null_count != 0 || indices.null_count != 0) { - bit_util::SetBitsTo(out_is_valid, out_offset, indices.length, false); + bit_util::SetBitsTo(out_is_valid, 0, indices.length, false); } auto WriteValue = [&](int64_t position) { @@ -386,7 +390,7 @@ struct PrimitiveTakeImpl { valid_count += block.popcount; if (block.popcount == block.length) { // Fastest path: neither values nor index nulls - bit_util::SetBitsTo(out_is_valid, out_offset + position, block.length, true); + bit_util::SetBitsTo(out_is_valid, position, block.length, true); for (int64_t i = 0; i < block.length; ++i) { WriteValue(position); ++position; @@ -396,7 +400,7 @@ struct PrimitiveTakeImpl { for (int64_t i = 0; i < block.length; ++i) { if (bit_util::GetBit(indices_is_valid, indices_offset + position)) { // index is not null - bit_util::SetBit(out_is_valid, out_offset + position); + bit_util::SetBit(out_is_valid, position); WriteValue(position); } else { WriteZero(position); @@ -416,7 +420,7 @@ struct PrimitiveTakeImpl { values_offset + indices_data[position])) { // value is not null WriteValue(position); - bit_util::SetBit(out_is_valid, out_offset + position); + bit_util::SetBit(out_is_valid, position); ++valid_count; } else { WriteZero(position); @@ -433,7 +437,7 @@ struct PrimitiveTakeImpl { values_offset + indices_data[position])) { // index is not null && value is not null WriteValue(position); - bit_util::SetBit(out_is_valid, out_offset + position); + bit_util::SetBit(out_is_valid, position); ++valid_count; } else { WriteZero(position); @@ -584,14 +588,16 @@ Status PrimitiveTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* ArrayData* out_arr = out->array_data().get(); - const int bit_width = values.type->bit_width(); + DCHECK(util::IsFixedWidthLike(values)); + const int64_t bit_width = util::FixedWidthInBits(*values.type); // TODO: When neither values nor indices contain nulls, we can skip // allocating the validity bitmap altogether and save time and space. A // streamlined PrimitiveTakeImpl would need to be written that skips all // interactions with the output validity bitmap, though. 
- RETURN_NOT_OK(PreallocatePrimitiveArrayData(ctx, indices.length, bit_width, - /*allocate_validity=*/true, out_arr)); + RETURN_NOT_OK(util::internal::PreallocateFixedWidthArrayData( + ctx, indices.length, /*source=*/values, + /*allocate_validity=*/true, out_arr)); switch (bit_width) { case 1: TakeIndexDispatch(values, indices, out_arr); diff --git a/cpp/src/arrow/compute/kernels/vector_selection_test.cc b/cpp/src/arrow/compute/kernels/vector_selection_test.cc index ec94b328ea361..4c7d85b103f36 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection_test.cc @@ -23,6 +23,7 @@ #include #include +#include "arrow/array/builder_nested.h" #include "arrow/array/concatenate.h" #include "arrow/chunked_array.h" #include "arrow/compute/api.h" @@ -32,6 +33,7 @@ #include "arrow/testing/gtest_util.h" #include "arrow/testing/random.h" #include "arrow/testing/util.h" +#include "arrow/util/fixed_width_test_util.h" #include "arrow/util/logging.h" namespace arrow { @@ -726,7 +728,37 @@ TEST_F(TestFilterKernelWithLargeList, FilterListInt32) { "[[1,2], null, null]"); } -class TestFilterKernelWithFixedSizeList : public TestFilterKernel {}; +class TestFilterKernelWithFixedSizeList : public TestFilterKernel { + protected: + std::vector> five_length_filters_ = { + ArrayFromJSON(boolean(), "[false, false, false, false, false]"), + ArrayFromJSON(boolean(), "[true, true, true, true, true]"), + ArrayFromJSON(boolean(), "[false, true, true, false, true]"), + ArrayFromJSON(boolean(), "[null, true, null, false, true]"), + }; + + void AssertFilterOnNestedLists(const std::shared_ptr& inner_type, + const std::vector& list_sizes) { + using NLG = ::arrow::util::internal::NestedListGenerator; + constexpr int64_t kLength = 5; + // Create two equivalent lists: one as a FixedSizeList and another as a List. + ASSERT_OK_AND_ASSIGN(auto fsl_list, + NLG::NestedFSLArray(inner_type, list_sizes, kLength)); + ASSERT_OK_AND_ASSIGN(auto list, + NLG::NestedListArray(inner_type, list_sizes, kLength)); + + ARROW_SCOPED_TRACE("CheckTakeOnNestedLists of type `", *fsl_list->type(), "`"); + + for (auto& filter : five_length_filters_) { + // Use the Filter on ListType as the reference implementation. 
+ ASSERT_OK_AND_ASSIGN(auto expected_list, + Filter(*list, *filter, /*options=*/emit_null_)); + ASSERT_OK_AND_ASSIGN(auto expected_fsl, Cast(expected_list, fsl_list->type())); + auto expected_fsl_array = expected_fsl.make_array(); + this->AssertFilter(fsl_list, filter, expected_fsl_array); + } + } +}; TEST_F(TestFilterKernelWithFixedSizeList, FilterFixedSizeListInt32) { std::string list_json = "[null, [1, null, 3], [4, 5, 6], [7, 8, null]]"; @@ -740,6 +772,33 @@ TEST_F(TestFilterKernelWithFixedSizeList, FilterFixedSizeListInt32) { "[[1, null, 3], [7, 8, null]]"); } +TEST_F(TestFilterKernelWithFixedSizeList, FilterFixedSizeListVarWidth) { + std::string list_json = + R"([["zero", "one", ""], ["two", "", "three"], ["four", "five", "six"], ["seven", "eight", ""]])"; + this->AssertFilter(fixed_size_list(utf8(), 3), list_json, "[0, 0, 0, 0]", "[]"); + this->AssertFilter(fixed_size_list(utf8(), 3), list_json, "[0, 1, 1, null]", + R"([["two", "", "three"], ["four", "five", "six"], null])"); + this->AssertFilter(fixed_size_list(utf8(), 3), list_json, "[0, 0, 1, null]", + R"([["four", "five", "six"], null])"); + this->AssertFilter(fixed_size_list(utf8(), 3), list_json, "[1, 1, 1, 1]", list_json); + this->AssertFilter(fixed_size_list(utf8(), 3), list_json, "[0, 1, 0, 1]", + R"([["two", "", "three"], ["seven", "eight", ""]])"); +} + +TEST_F(TestFilterKernelWithFixedSizeList, FilterFixedSizeListModuloNesting) { + using NLG = ::arrow::util::internal::NestedListGenerator; + const std::vector> value_types = { + int16(), + int32(), + int64(), + }; + NLG::VisitAllNestedListConfigurations( + value_types, [this](const std::shared_ptr& inner_type, + const std::vector& list_sizes) { + this->AssertFilterOnNestedLists(inner_type, list_sizes); + }); +} + class TestFilterKernelWithMap : public TestFilterKernel {}; TEST_F(TestFilterKernelWithMap, FilterMapStringToInt32) { @@ -1034,29 +1093,34 @@ Status TakeJSON(const std::shared_ptr& type, const std::string& values .Value(out); } +void DoCheckTake(const std::shared_ptr& values, + const std::shared_ptr& indices, + const std::shared_ptr& expected) { + AssertTakeArrays(values, indices, expected); + + // Check sliced values + ASSERT_OK_AND_ASSIGN(auto values_filler, MakeArrayOfNull(values->type(), 2)); + ASSERT_OK_AND_ASSIGN(auto values_sliced, + Concatenate({values_filler, values, values_filler})); + values_sliced = values_sliced->Slice(2, values->length()); + AssertTakeArrays(values_sliced, indices, expected); + + // Check sliced indices + ASSERT_OK_AND_ASSIGN(auto zero, MakeScalar(indices->type(), int8_t{0})); + ASSERT_OK_AND_ASSIGN(auto indices_filler, MakeArrayFromScalar(*zero, 3)); + ASSERT_OK_AND_ASSIGN(auto indices_sliced, + Concatenate({indices_filler, indices, indices_filler})); + indices_sliced = indices_sliced->Slice(3, indices->length()); + AssertTakeArrays(values, indices_sliced, expected); +} + void CheckTake(const std::shared_ptr& type, const std::string& values_json, const std::string& indices_json, const std::string& expected_json) { auto values = ArrayFromJSON(type, values_json); auto expected = ArrayFromJSON(type, expected_json); - for (auto index_type : {int8(), uint32()}) { auto indices = ArrayFromJSON(index_type, indices_json); - AssertTakeArrays(values, indices, expected); - - // Check sliced values - ASSERT_OK_AND_ASSIGN(auto values_filler, MakeArrayOfNull(type, 2)); - ASSERT_OK_AND_ASSIGN(auto values_sliced, - Concatenate({values_filler, values, values_filler})); - values_sliced = values_sliced->Slice(2, values->length()); - 
AssertTakeArrays(values_sliced, indices, expected); - - // Check sliced indices - ASSERT_OK_AND_ASSIGN(auto zero, MakeScalar(index_type, int8_t{0})); - ASSERT_OK_AND_ASSIGN(auto indices_filler, MakeArrayFromScalar(*zero, 3)); - ASSERT_OK_AND_ASSIGN(auto indices_sliced, - Concatenate({indices_filler, indices, indices_filler})); - indices_sliced = indices_sliced->Slice(3, indices->length()); - AssertTakeArrays(values, indices_sliced, expected); + DoCheckTake(values, indices, expected); } } @@ -1427,7 +1491,25 @@ TEST_F(TestTakeKernelWithLargeList, TakeLargeListInt32) { CheckTake(large_list(int32()), list_json, "[null, 1, 2, 0]", "[null, [1,2], null, []]"); } -class TestTakeKernelWithFixedSizeList : public TestTakeKernelTyped {}; +class TestTakeKernelWithFixedSizeList : public TestTakeKernelTyped { + protected: + void CheckTakeOnNestedLists(const std::shared_ptr& inner_type, + const std::vector& list_sizes, int64_t length) { + using NLG = ::arrow::util::internal::NestedListGenerator; + // Create two equivalent lists: one as a FixedSizeList and another as a List. + ASSERT_OK_AND_ASSIGN(auto fsl_list, + NLG::NestedFSLArray(inner_type, list_sizes, length)); + ASSERT_OK_AND_ASSIGN(auto list, NLG::NestedListArray(inner_type, list_sizes, length)); + + ARROW_SCOPED_TRACE("CheckTakeOnNestedLists of type `", *fsl_list->type(), "`"); + + auto indices = ArrayFromJSON(int64(), "[1, 2, 4]"); + // Use the Take on ListType as the reference implementation. + ASSERT_OK_AND_ASSIGN(auto expected_list, Take(*list, *indices)); + ASSERT_OK_AND_ASSIGN(auto expected_fsl, Cast(*expected_list, fsl_list->type())); + DoCheckTake(fsl_list, indices, expected_fsl); + } +}; TEST_F(TestTakeKernelWithFixedSizeList, TakeFixedSizeListInt32) { std::string list_json = "[null, [1, null, 3], [4, 5, 6], [7, 8, null]]"; @@ -1449,6 +1531,42 @@ TEST_F(TestTakeKernelWithFixedSizeList, TakeFixedSizeListInt32) { "[0, 1, 0]"); } +TEST_F(TestTakeKernelWithFixedSizeList, TakeFixedSizeListVarWidth) { + std::string list_json = + R"([["zero", "one", ""], ["two", "", "three"], ["four", "five", "six"], ["seven", "eight", ""]])"; + CheckTake(fixed_size_list(utf8(), 3), list_json, "[]", "[]"); + CheckTake(fixed_size_list(utf8(), 3), list_json, "[3, 2, 1]", + R"([["seven", "eight", ""], ["four", "five", "six"], ["two", "", "three"]])"); + CheckTake(fixed_size_list(utf8(), 3), list_json, "[null, 2, 0]", + R"([null, ["four", "five", "six"], ["zero", "one", ""]])"); + CheckTake(fixed_size_list(utf8(), 3), list_json, R"([null, null])", "[null, null]"); + CheckTake( + fixed_size_list(utf8(), 3), list_json, "[3, 0, 0,3]", + R"([["seven", "eight", ""], ["zero", "one", ""], ["zero", "one", ""], ["seven", "eight", ""]])"); + CheckTake(fixed_size_list(utf8(), 3), list_json, "[0, 1, 2, 3]", list_json); + CheckTake(fixed_size_list(utf8(), 3), list_json, "[2, 2, 2, 2, 2, 2, 1]", + R"([ + ["four", "five", "six"], ["four", "five", "six"], + ["four", "five", "six"], ["four", "five", "six"], + ["four", "five", "six"], ["four", "five", "six"], + ["two", "", "three"] + ])"); +} + +TEST_F(TestTakeKernelWithFixedSizeList, TakeFixedSizeListModuloNesting) { + using NLG = ::arrow::util::internal::NestedListGenerator; + const std::vector> value_types = { + int16(), + int32(), + int64(), + }; + NLG::VisitAllNestedListConfigurations( + value_types, [this](const std::shared_ptr& inner_type, + const std::vector& list_sizes) { + this->CheckTakeOnNestedLists(inner_type, list_sizes, /*length=*/5); + }); +} + class TestTakeKernelWithMap : public TestTakeKernelTyped {}; 
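The sliced-values and sliced-indices checks in DoCheckTake above exist because a sliced array shares its parent's buffers, so a kernel must apply the array offset before reading values. Below is a simplified, self-contained sketch of the offset-aware copy that a fixed-width take path performs (validity bitmaps ignored); the names are illustrative, not Arrow's internal helpers.

#include <cstdint>
#include <cstring>

// Copy `num_indices` fixed-width slots selected by `indices` out of a possibly
// sliced values buffer. The slice offset is applied once, up front, which is
// exactly what an offset-unaware implementation would get wrong.
void TakeFixedWidth(const uint8_t* values_buffer, int64_t values_offset,
                    int64_t byte_width, const uint32_t* indices,
                    int64_t num_indices, uint8_t* out) {
  const uint8_t* values = values_buffer + values_offset * byte_width;
  for (int64_t i = 0; i < num_indices; ++i) {
    std::memcpy(out + i * byte_width, values + indices[i] * byte_width, byte_width);
  }
}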
TEST_F(TestTakeKernelWithMap, TakeMapStringToInt32) { diff --git a/cpp/src/arrow/compute/kernels/vector_sort.cc b/cpp/src/arrow/compute/kernels/vector_sort.cc index db2023ef04cad..ad22fa8d365c4 100644 --- a/cpp/src/arrow/compute/kernels/vector_sort.cc +++ b/cpp/src/arrow/compute/kernels/vector_sort.cc @@ -747,15 +747,13 @@ class TableSorter { auto& comparator = comparator_; const auto& first_sort_key = sort_keys_[0]; - ChunkLocation left_loc{0, 0}; - ChunkLocation right_loc{0, 0}; + ChunkLocation left_loc; + ChunkLocation right_loc; std::merge(nulls_begin, nulls_middle, nulls_middle, nulls_end, temp_indices, [&](uint64_t left, uint64_t right) { // First column is either null or nan - left_loc = - left_resolver_.ResolveWithChunkIndexHint(left, /*hint=*/left_loc); - right_loc = - right_resolver_.ResolveWithChunkIndexHint(right, /*hint=*/right_loc); + left_loc = left_resolver_.ResolveWithHint(left, /*hint=*/left_loc); + right_loc = right_resolver_.ResolveWithHint(right, /*hint=*/right_loc); auto chunk_left = first_sort_key.GetChunk(left_loc); auto chunk_right = first_sort_key.GetChunk(right_loc); const auto left_is_null = chunk_left.IsNull(); @@ -786,15 +784,13 @@ class TableSorter { // Untyped implementation auto& comparator = comparator_; - ChunkLocation left_loc{0, 0}; - ChunkLocation right_loc{0, 0}; + ChunkLocation left_loc; + ChunkLocation right_loc; std::merge(nulls_begin, nulls_middle, nulls_middle, nulls_end, temp_indices, [&](uint64_t left, uint64_t right) { // First column is always null - left_loc = - left_resolver_.ResolveWithChunkIndexHint(left, /*hint=*/left_loc); - right_loc = - right_resolver_.ResolveWithChunkIndexHint(right, /*hint=*/right_loc); + left_loc = left_resolver_.ResolveWithHint(left, /*hint=*/left_loc); + right_loc = right_resolver_.ResolveWithHint(right, /*hint=*/right_loc); return comparator.Compare(left_loc, right_loc, 1); }); // Copy back temp area into main buffer @@ -812,15 +808,13 @@ class TableSorter { auto& comparator = comparator_; const auto& first_sort_key = sort_keys_[0]; - ChunkLocation left_loc{0, 0}; - ChunkLocation right_loc{0, 0}; + ChunkLocation left_loc; + ChunkLocation right_loc; std::merge(range_begin, range_middle, range_middle, range_end, temp_indices, [&](uint64_t left, uint64_t right) { // Both values are never null nor NaN. - left_loc = - left_resolver_.ResolveWithChunkIndexHint(left, /*hint=*/left_loc); - right_loc = - right_resolver_.ResolveWithChunkIndexHint(right, /*hint=*/right_loc); + left_loc = left_resolver_.ResolveWithHint(left, /*hint=*/left_loc); + right_loc = right_resolver_.ResolveWithHint(right, /*hint=*/right_loc); auto chunk_left = first_sort_key.GetChunk(left_loc); auto chunk_right = first_sort_key.GetChunk(right_loc); DCHECK(!chunk_left.IsNull()); diff --git a/cpp/src/arrow/compute/key_hash_internal.h b/cpp/src/arrow/compute/key_hash_internal.h index 7d226f52086b1..1f25beb0e1622 100644 --- a/cpp/src/arrow/compute/key_hash_internal.h +++ b/cpp/src/arrow/compute/key_hash_internal.h @@ -48,6 +48,16 @@ class ARROW_EXPORT Hashing32 { static void HashMultiColumn(const std::vector& cols, LightContext* ctx, uint32_t* out_hash); + // Clarify the max temp stack usage for HashBatch, which might be necessary for the + // caller to be aware of at compile time to reserve enough stack size in advance. The + // HashBatch implementation uses one uint32 temp vector as a buffer for hash, one uint16 + // temp vector as a buffer for null indices and one uint32 temp vector as a buffer for + // null hash, all are of size kMiniBatchLength. 
Plus extra kMiniBatchLength to cope with + // stack padding and aligning. + static constexpr auto kHashBatchTempStackUsage = + (sizeof(uint32_t) + sizeof(uint16_t) + sizeof(uint32_t) + /*extra=*/1) * + util::MiniBatch::kMiniBatchLength; + static Status HashBatch(const ExecBatch& key_batch, uint32_t* hashes, std::vector& column_arrays, int64_t hardware_flags, util::TempVectorStack* temp_stack, @@ -161,6 +171,15 @@ class ARROW_EXPORT Hashing64 { static void HashMultiColumn(const std::vector& cols, LightContext* ctx, uint64_t* hashes); + // Clarify the max temp stack usage for HashBatch, which might be necessary for the + // caller to be aware of at compile time to reserve enough stack size in advance. The + // HashBatch implementation uses one uint16 temp vector as a buffer for null indices and + // one uint64 temp vector as a buffer for null hash, all are of size kMiniBatchLength. + // Plus extra kMiniBatchLength to cope with stack padding and aligning. + static constexpr auto kHashBatchTempStackUsage = + (sizeof(uint16_t) + sizeof(uint64_t) + /*extra=*/1) * + util::MiniBatch::kMiniBatchLength; + static Status HashBatch(const ExecBatch& key_batch, uint64_t* hashes, std::vector& column_arrays, int64_t hardware_flags, util::TempVectorStack* temp_stack, diff --git a/cpp/src/arrow/compute/key_hash_test.cc b/cpp/src/arrow/compute/key_hash_test.cc index 4e5d869cb7db6..fdf6d2125850a 100644 --- a/cpp/src/arrow/compute/key_hash_test.cc +++ b/cpp/src/arrow/compute/key_hash_test.cc @@ -25,12 +25,16 @@ #include "arrow/array/builder_binary.h" #include "arrow/compute/key_hash_internal.h" #include "arrow/testing/gtest_util.h" +#include "arrow/testing/random.h" #include "arrow/testing/util.h" #include "arrow/util/cpu_info.h" #include "arrow/util/pcg_random.h" namespace arrow { +using arrow::random::RandomArrayGenerator; +using arrow::util::MiniBatch; +using arrow::util::TempVectorStack; using internal::checked_pointer_cast; using internal::CpuInfo; @@ -156,7 +160,7 @@ class TestVectorHash { std::vector temp_buffer; temp_buffer.resize(mini_batch_size * 4); - for (int i = 0; i < static_cast(hardware_flags_for_testing.size()); ++i) { + for (size_t i = 0; i < hardware_flags_for_testing.size(); ++i) { const auto hardware_flags = hardware_flags_for_testing[i]; if (use_32bit_hash) { if (!use_varlen_input) { @@ -192,7 +196,7 @@ class TestVectorHash { // Verify that all implementations (scalar, SIMD) give the same hashes // const auto& hashes_scalar64 = hashes64[0]; - for (int i = 0; i < static_cast(hardware_flags_for_testing.size()); ++i) { + for (size_t i = 0; i < hardware_flags_for_testing.size(); ++i) { for (int j = 0; j < num_rows; ++j) { ASSERT_EQ(hashes64[i][j], hashes_scalar64[j]) << "scalar and simd approaches yielded different hashes"; @@ -280,7 +284,7 @@ void HashFixedLengthFrom(int key_length, int num_rows, int start_row) { std::vector temp_buffer; temp_buffer.resize(mini_batch_size * 4); - for (int i = 0; i < static_cast(hardware_flags_for_testing.size()); ++i) { + for (size_t i = 0; i < hardware_flags_for_testing.size(); ++i) { const auto hardware_flags = hardware_flags_for_testing[i]; Hashing32::HashFixed(hardware_flags, /*combine_hashes=*/false, num_rows_to_hash, key_length, @@ -292,7 +296,7 @@ void HashFixedLengthFrom(int key_length, int num_rows, int start_row) { } // Verify that all implementations (scalar, SIMD) give the same hashes. 
- for (int i = 1; i < static_cast(hardware_flags_for_testing.size()); ++i) { + for (size_t i = 1; i < hardware_flags_for_testing.size(); ++i) { for (int j = 0; j < num_rows_to_hash; ++j) { ASSERT_EQ(hashes32[i][j], hashes32[0][j]) << "scalar and simd approaches yielded different 32-bit hashes"; @@ -311,5 +315,52 @@ TEST(VectorHash, FixedLengthTailByteSafety) { HashFixedLengthFrom(/*key_length=*/19, /*num_rows=*/64, /*start_row=*/63); } +// Make sure that Hashing32/64::HashBatch uses no more stack space than declared in +// Hashing32/64::kHashBatchTempStackUsage. +TEST(VectorHash, HashBatchTempStackUsage) { + for (auto num_rows : + {0, 1, MiniBatch::kMiniBatchLength, MiniBatch::kMiniBatchLength * 64}) { + SCOPED_TRACE("num_rows = " + std::to_string(num_rows)); + + MemoryPool* pool = default_memory_pool(); + RandomArrayGenerator gen(42); + + auto column = gen.Int8(num_rows, 0, 127); + ExecBatch batch({column}, num_rows); + + std::vector column_arrays; + ASSERT_OK(ColumnArraysFromExecBatch(batch, &column_arrays)); + + const auto hardware_flags_for_testing = HardwareFlagsForTesting(); + ASSERT_GT(hardware_flags_for_testing.size(), 0); + + { + std::vector hashes(num_rows); + TempVectorStack stack; + ASSERT_OK(stack.Init(pool, Hashing32::kHashBatchTempStackUsage)); + for (size_t i = 0; i < hardware_flags_for_testing.size(); ++i) { + SCOPED_TRACE("hashing32 for hardware flags = " + + std::to_string(hardware_flags_for_testing[i])); + ASSERT_OK(Hashing32::HashBatch(batch, hashes.data(), column_arrays, + hardware_flags_for_testing[i], &stack, + /*start_rows=*/0, num_rows)); + } + } + + { + std::vector hashes(num_rows); + TempVectorStack stack; + ASSERT_OK(stack.Init(pool, Hashing64::kHashBatchTempStackUsage)); + for (size_t i = 0; i < hardware_flags_for_testing.size(); ++i) { + SCOPED_TRACE("hashing64 for hardware flags = " + + std::to_string(hardware_flags_for_testing[i])); + ASSERT_OK(Hashing64::HashBatch(batch, hashes.data(), column_arrays, + hardware_flags_for_testing[i], &stack, + /*start_rows=*/0, num_rows)); + } + } + } +} + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/key_map_internal.h b/cpp/src/arrow/compute/key_map_internal.h index 8e06dc83483aa..a5e784a9e4463 100644 --- a/cpp/src/arrow/compute/key_map_internal.h +++ b/cpp/src/arrow/compute/key_map_internal.h @@ -21,6 +21,7 @@ #include #include "arrow/compute/util.h" +#include "arrow/compute/util_internal.h" #include "arrow/result.h" #include "arrow/status.h" #include "arrow/type_fwd.h" diff --git a/cpp/src/arrow/compute/light_array_internal.h b/cpp/src/arrow/compute/light_array_internal.h index 67de71bf56c92..995c4211998e0 100644 --- a/cpp/src/arrow/compute/light_array_internal.h +++ b/cpp/src/arrow/compute/light_array_internal.h @@ -22,6 +22,7 @@ #include "arrow/array.h" #include "arrow/compute/exec.h" #include "arrow/compute/util.h" +#include "arrow/compute/util_internal.h" #include "arrow/type.h" #include "arrow/util/cpu_info.h" #include "arrow/util/logging.h" diff --git a/cpp/src/arrow/compute/light_array_test.cc b/cpp/src/arrow/compute/light_array_test.cc index 08f36ee606025..cc02d489d138f 100644 --- a/cpp/src/arrow/compute/light_array_test.cc +++ b/cpp/src/arrow/compute/light_array_test.cc @@ -20,6 +20,7 @@ #include #include +#include "arrow/memory_pool.h" #include "arrow/testing/generator.h" #include "arrow/testing/gtest_util.h" #include "arrow/type.h" diff --git a/cpp/src/arrow/compute/row/compare_internal.cc b/cpp/src/arrow/compute/row/compare_internal.cc index 078a8287c71c0..98aea9011266c 
100644 --- a/cpp/src/arrow/compute/row/compare_internal.cc +++ b/cpp/src/arrow/compute/row/compare_internal.cc @@ -36,22 +36,22 @@ void KeyCompare::NullUpdateColumnToRow(uint32_t id_col, uint32_t num_rows_to_com const uint32_t* left_to_right_map, LightContext* ctx, const KeyColumnArray& col, const RowTableImpl& rows, - uint8_t* match_bytevector, - bool are_cols_in_encoding_order) { + bool are_cols_in_encoding_order, + uint8_t* match_bytevector) { if (!rows.has_any_nulls(ctx) && !col.data(0)) { return; } uint32_t num_processed = 0; #if defined(ARROW_HAVE_RUNTIME_AVX2) if (ctx->has_avx2()) { - num_processed = NullUpdateColumnToRow_avx2(use_selection, id_col, num_rows_to_compare, - sel_left_maybe_null, left_to_right_map, - ctx, col, rows, match_bytevector); + num_processed = NullUpdateColumnToRow_avx2( + use_selection, id_col, num_rows_to_compare, sel_left_maybe_null, + left_to_right_map, ctx, col, rows, are_cols_in_encoding_order, match_bytevector); } #endif - uint32_t null_bit_id = - are_cols_in_encoding_order ? id_col : rows.metadata().pos_after_encoding(id_col); + const uint32_t null_bit_id = + ColIdInEncodingOrder(rows, id_col, are_cols_in_encoding_order); if (!col.data(0)) { // Remove rows from the result for which the column value is a null @@ -363,10 +363,9 @@ void KeyCompare::CompareColumnsToRows( continue; } - uint32_t offset_within_row = rows.metadata().encoded_field_offset( - are_cols_in_encoding_order - ? static_cast(icol) - : rows.metadata().pos_after_encoding(static_cast(icol))); + uint32_t offset_within_row = + rows.metadata().encoded_field_offset(ColIdInEncodingOrder( + rows, static_cast(icol), are_cols_in_encoding_order)); if (col.metadata().is_fixed_length) { if (sel_left_maybe_null) { CompareBinaryColumnToRow( @@ -375,9 +374,8 @@ void KeyCompare::CompareColumnsToRows( is_first_column ? match_bytevector_A : match_bytevector_B); NullUpdateColumnToRow( static_cast(icol), num_rows_to_compare, sel_left_maybe_null, - left_to_right_map, ctx, col, rows, - is_first_column ? match_bytevector_A : match_bytevector_B, - are_cols_in_encoding_order); + left_to_right_map, ctx, col, rows, are_cols_in_encoding_order, + is_first_column ? match_bytevector_A : match_bytevector_B); } else { // Version without using selection vector CompareBinaryColumnToRow( @@ -386,9 +384,8 @@ void KeyCompare::CompareColumnsToRows( is_first_column ? match_bytevector_A : match_bytevector_B); NullUpdateColumnToRow( static_cast(icol), num_rows_to_compare, sel_left_maybe_null, - left_to_right_map, ctx, col, rows, - is_first_column ? match_bytevector_A : match_bytevector_B, - are_cols_in_encoding_order); + left_to_right_map, ctx, col, rows, are_cols_in_encoding_order, + is_first_column ? match_bytevector_A : match_bytevector_B); } if (!is_first_column) { AndByteVectors(ctx, num_rows_to_compare, match_bytevector_A, match_bytevector_B); @@ -414,9 +411,8 @@ void KeyCompare::CompareColumnsToRows( } NullUpdateColumnToRow( static_cast(icol), num_rows_to_compare, sel_left_maybe_null, - left_to_right_map, ctx, col, rows, - is_first_column ? match_bytevector_A : match_bytevector_B, - are_cols_in_encoding_order); + left_to_right_map, ctx, col, rows, are_cols_in_encoding_order, + is_first_column ? match_bytevector_A : match_bytevector_B); } else { if (ivarbinary == 0) { CompareVarBinaryColumnToRow( @@ -429,9 +425,8 @@ void KeyCompare::CompareColumnsToRows( } NullUpdateColumnToRow( static_cast(icol), num_rows_to_compare, sel_left_maybe_null, - left_to_right_map, ctx, col, rows, - is_first_column ? 
match_bytevector_A : match_bytevector_B, - are_cols_in_encoding_order); + left_to_right_map, ctx, col, rows, are_cols_in_encoding_order, + is_first_column ? match_bytevector_A : match_bytevector_B); } if (!is_first_column) { AndByteVectors(ctx, num_rows_to_compare, match_bytevector_A, match_bytevector_B); diff --git a/cpp/src/arrow/compute/row/compare_internal.h b/cpp/src/arrow/compute/row/compare_internal.h index b039ca97ff978..a5a109b0b516a 100644 --- a/cpp/src/arrow/compute/row/compare_internal.h +++ b/cpp/src/arrow/compute/row/compare_internal.h @@ -32,6 +32,16 @@ namespace compute { class ARROW_EXPORT KeyCompare { public: + // Clarify the max temp stack usage for CompareColumnsToRows, which might be necessary + // for the caller to be aware of (possibly at compile time) to reserve enough stack size + // in advance. The CompareColumnsToRows implementation uses three uint8 temp vectors as + // buffers for match vectors, all are of size num_rows. Plus extra kMiniBatchLength to + // cope with stack padding and aligning. + constexpr static int64_t CompareColumnsToRowsTempStackUsage(int64_t num_rows) { + return (sizeof(uint8_t) + sizeof(uint8_t) + sizeof(uint8_t)) * num_rows + + /*extra=*/util::MiniBatch::kMiniBatchLength; + } + // Returns a single 16-bit selection vector of rows that failed comparison. // If there is input selection on the left, the resulting selection is a filtered image // of input selection. @@ -43,13 +53,19 @@ class ARROW_EXPORT KeyCompare { uint8_t* out_match_bitvector_maybe_null = NULLPTR); private: + static uint32_t ColIdInEncodingOrder(const RowTableImpl& rows, uint32_t id_col, + bool are_cols_in_encoding_order) { + return are_cols_in_encoding_order ? id_col + : rows.metadata().pos_after_encoding(id_col); + } + template static void NullUpdateColumnToRow(uint32_t id_col, uint32_t num_rows_to_compare, const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, LightContext* ctx, const KeyColumnArray& col, const RowTableImpl& rows, - uint8_t* match_bytevector, - bool are_cols_in_encoding_order); + bool are_cols_in_encoding_order, + uint8_t* match_bytevector); template static void CompareBinaryColumnToRowHelper( @@ -92,7 +108,8 @@ class ARROW_EXPORT KeyCompare { static uint32_t NullUpdateColumnToRowImp_avx2( uint32_t id_col, uint32_t num_rows_to_compare, const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, LightContext* ctx, const KeyColumnArray& col, - const RowTableImpl& rows, uint8_t* match_bytevector); + const RowTableImpl& rows, bool are_cols_in_encoding_order, + uint8_t* match_bytevector); template static uint32_t CompareBinaryColumnToRowHelper_avx2( @@ -118,13 +135,11 @@ class ARROW_EXPORT KeyCompare { static uint32_t AndByteVectors_avx2(uint32_t num_elements, uint8_t* bytevector_A, const uint8_t* bytevector_B); - static uint32_t NullUpdateColumnToRow_avx2(bool use_selection, uint32_t id_col, - uint32_t num_rows_to_compare, - const uint16_t* sel_left_maybe_null, - const uint32_t* left_to_right_map, - LightContext* ctx, const KeyColumnArray& col, - const RowTableImpl& rows, - uint8_t* match_bytevector); + static uint32_t NullUpdateColumnToRow_avx2( + bool use_selection, uint32_t id_col, uint32_t num_rows_to_compare, + const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, + LightContext* ctx, const KeyColumnArray& col, const RowTableImpl& rows, + bool are_cols_in_encoding_order, uint8_t* match_bytevector); static uint32_t CompareBinaryColumnToRow_avx2( bool use_selection, uint32_t offset_within_row, uint32_t 
num_rows_to_compare, diff --git a/cpp/src/arrow/compute/row/compare_internal_avx2.cc b/cpp/src/arrow/compute/row/compare_internal_avx2.cc index ff407c51b83cb..18f656a2e458d 100644 --- a/cpp/src/arrow/compute/row/compare_internal_avx2.cc +++ b/cpp/src/arrow/compute/row/compare_internal_avx2.cc @@ -39,12 +39,14 @@ template uint32_t KeyCompare::NullUpdateColumnToRowImp_avx2( uint32_t id_col, uint32_t num_rows_to_compare, const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, LightContext* ctx, const KeyColumnArray& col, - const RowTableImpl& rows, uint8_t* match_bytevector) { + const RowTableImpl& rows, bool are_cols_in_encoding_order, + uint8_t* match_bytevector) { if (!rows.has_any_nulls(ctx) && !col.data(0)) { return num_rows_to_compare; } - uint32_t null_bit_id = rows.metadata().pos_after_encoding(id_col); + const uint32_t null_bit_id = + ColIdInEncodingOrder(rows, id_col, are_cols_in_encoding_order); if (!col.data(0)) { // Remove rows from the result for which the column value is a null @@ -569,7 +571,7 @@ uint32_t KeyCompare::NullUpdateColumnToRow_avx2( bool use_selection, uint32_t id_col, uint32_t num_rows_to_compare, const uint16_t* sel_left_maybe_null, const uint32_t* left_to_right_map, LightContext* ctx, const KeyColumnArray& col, const RowTableImpl& rows, - uint8_t* match_bytevector) { + bool are_cols_in_encoding_order, uint8_t* match_bytevector) { int64_t num_rows_safe = TailSkipForSIMD::FixBitAccess(sizeof(uint32_t), col.length(), col.bit_offset(0)); if (sel_left_maybe_null) { @@ -580,13 +582,13 @@ uint32_t KeyCompare::NullUpdateColumnToRow_avx2( } if (use_selection) { - return NullUpdateColumnToRowImp_avx2(id_col, num_rows_to_compare, - sel_left_maybe_null, left_to_right_map, - ctx, col, rows, match_bytevector); + return NullUpdateColumnToRowImp_avx2( + id_col, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, ctx, col, + rows, are_cols_in_encoding_order, match_bytevector); } else { - return NullUpdateColumnToRowImp_avx2(id_col, num_rows_to_compare, - sel_left_maybe_null, left_to_right_map, - ctx, col, rows, match_bytevector); + return NullUpdateColumnToRowImp_avx2( + id_col, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, ctx, col, + rows, are_cols_in_encoding_order, match_bytevector); } } diff --git a/cpp/src/arrow/compute/row/compare_test.cc b/cpp/src/arrow/compute/row/compare_test.cc index 1d8562cd56d3c..4044049b10863 100644 --- a/cpp/src/arrow/compute/row/compare_test.cc +++ b/cpp/src/arrow/compute/row/compare_test.cc @@ -19,23 +19,26 @@ #include "arrow/compute/row/compare_internal.h" #include "arrow/testing/gtest_util.h" +#include "arrow/testing/random.h" namespace arrow { namespace compute { using arrow::bit_util::BytesForBits; using arrow::internal::CpuInfo; +using arrow::random::RandomArrayGenerator; using arrow::util::MiniBatch; using arrow::util::TempVectorStack; // Specialized case for GH-39577. 
TEST(KeyCompare, CompareColumnsToRowsCuriousFSB) { int fsb_length = 9; + int num_rows = 7; + MemoryPool* pool = default_memory_pool(); TempVectorStack stack; - ASSERT_OK(stack.Init(pool, 8 * MiniBatch::kMiniBatchLength * sizeof(uint64_t))); + ASSERT_OK(stack.Init(pool, KeyCompare::CompareColumnsToRowsTempStackUsage(num_rows))); - int num_rows = 7; auto column_right = ArrayFromJSON(fixed_size_binary(fsb_length), R"([ "000000000", "111111111", @@ -106,5 +109,60 @@ TEST(KeyCompare, CompareColumnsToRowsCuriousFSB) { } } +// Make sure that KeyCompare::CompareColumnsToRows uses no more stack space than declared +// in KeyCompare::CompareColumnsToRowsTempStackUsage(). +TEST(KeyCompare, CompareColumnsToRowsTempStackUsage) { + for (auto num_rows : + {0, 1, MiniBatch::kMiniBatchLength, MiniBatch::kMiniBatchLength * 64}) { + SCOPED_TRACE("num_rows = " + std::to_string(num_rows)); + + MemoryPool* pool = default_memory_pool(); + TempVectorStack stack; + ASSERT_OK(stack.Init(pool, KeyCompare::CompareColumnsToRowsTempStackUsage(num_rows))); + + RandomArrayGenerator gen(42); + + auto column_right = gen.Int8(num_rows, 0, 127); + ExecBatch batch_right({column_right}, num_rows); + + std::vector column_metadatas_right; + ASSERT_OK(ColumnMetadatasFromExecBatch(batch_right, &column_metadatas_right)); + + RowTableMetadata table_metadata_right; + table_metadata_right.FromColumnMetadataVector(column_metadatas_right, + sizeof(uint64_t), sizeof(uint64_t)); + + std::vector column_arrays_right; + ASSERT_OK(ColumnArraysFromExecBatch(batch_right, &column_arrays_right)); + + RowTableImpl row_table; + ASSERT_OK(row_table.Init(pool, table_metadata_right)); + + RowTableEncoder row_encoder; + row_encoder.Init(column_metadatas_right, sizeof(uint64_t), sizeof(uint64_t)); + row_encoder.PrepareEncodeSelected(0, num_rows, column_arrays_right); + + std::vector row_ids_right(num_rows); + std::iota(row_ids_right.begin(), row_ids_right.end(), 0); + ASSERT_OK(row_encoder.EncodeSelected(&row_table, num_rows, row_ids_right.data())); + + auto column_left = gen.Int8(num_rows, 0, 127); + ExecBatch batch_left({column_left}, num_rows); + std::vector column_arrays_left; + ASSERT_OK(ColumnArraysFromExecBatch(batch_left, &column_arrays_left)); + + std::vector row_ids_left(num_rows); + std::iota(row_ids_left.begin(), row_ids_left.end(), 0); + + LightContext ctx{CpuInfo::GetInstance()->hardware_flags(), &stack}; + + uint32_t num_rows_no_match; + std::vector row_ids_out(num_rows); + KeyCompare::CompareColumnsToRows(num_rows, NULLPTR, row_ids_left.data(), &ctx, + &num_rows_no_match, row_ids_out.data(), + column_arrays_left, row_table, true, NULLPTR); + } +} + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/row/grouper.cc b/cpp/src/arrow/compute/row/grouper.cc index 756c70967ac6f..3ed5411d0ba02 100644 --- a/cpp/src/arrow/compute/row/grouper.cc +++ b/cpp/src/arrow/compute/row/grouper.cc @@ -217,18 +217,18 @@ struct SimpleKeySegmenter : public BaseRowSegmenter { struct AnyKeysSegmenter : public BaseRowSegmenter { static Result> Make( const std::vector& key_types, ExecContext* ctx) { - ARROW_RETURN_NOT_OK(Grouper::Make(key_types, ctx)); // check types - return std::make_unique(key_types, ctx); + ARROW_ASSIGN_OR_RAISE(auto grouper, Grouper::Make(key_types, ctx)); // check types + return std::make_unique(key_types, ctx, std::move(grouper)); } - AnyKeysSegmenter(const std::vector& key_types, ExecContext* ctx) + AnyKeysSegmenter(const std::vector& key_types, ExecContext* ctx, + std::unique_ptr grouper) : 
BaseRowSegmenter(key_types), - ctx_(ctx), - grouper_(nullptr), + grouper_(std::move(grouper)), save_group_id_(kNoGroupId) {} Status Reset() override { - grouper_ = nullptr; + ARROW_RETURN_NOT_OK(grouper_->Reset()); save_group_id_ = kNoGroupId; return Status::OK(); } @@ -245,7 +245,6 @@ struct AnyKeysSegmenter : public BaseRowSegmenter { // first row of a new segment to see if it extends the previous segment. template Result MapGroupIdAt(const Batch& batch, int64_t offset) { - if (!grouper_) return kNoGroupId; ARROW_ASSIGN_OR_RAISE(auto datum, grouper_->Consume(batch, offset, /*length=*/1)); if (!datum.is_array()) { @@ -264,9 +263,6 @@ struct AnyKeysSegmenter : public BaseRowSegmenter { if (offset == batch.length) { return MakeSegment(batch.length, offset, 0, kEmptyExtends); } - // ARROW-18311: make Grouper support Reset() - // so it can be reset instead of recreated below - // // the group id must be computed prior to resetting the grouper, since it is compared // to save_group_id_, and after resetting the grouper produces incomparable group ids ARROW_ASSIGN_OR_RAISE(auto group_id, MapGroupIdAt(batch, offset)); @@ -276,7 +272,7 @@ struct AnyKeysSegmenter : public BaseRowSegmenter { return extends; }; // resetting drops grouper's group-ids, freeing-up memory for the next segment - ARROW_ASSIGN_OR_RAISE(grouper_, Grouper::Make(key_types_, ctx_)); // TODO: reset it + ARROW_RETURN_NOT_OK(grouper_->Reset()); // GH-34475: cache the grouper-consume result across invocations of GetNextSegment ARROW_ASSIGN_OR_RAISE(auto datum, grouper_->Consume(batch, offset)); if (datum.is_array()) { @@ -299,7 +295,6 @@ struct AnyKeysSegmenter : public BaseRowSegmenter { } private: - ExecContext* const ctx_; std::unique_ptr grouper_; group_id_t save_group_id_; }; @@ -354,6 +349,7 @@ struct GrouperNoKeysImpl : Grouper { RETURN_NOT_OK(builder->Finish(&array)); return std::move(array); } + Status Reset() override { return Status::OK(); } Result Consume(const ExecSpan& batch, int64_t offset, int64_t length) override { ARROW_ASSIGN_OR_RAISE(auto array, MakeConstantGroupIdArray(length, 0)); return Datum(array); @@ -419,6 +415,14 @@ struct GrouperImpl : public Grouper { return std::move(impl); } + Status Reset() override { + map_.clear(); + offsets_.clear(); + key_bytes_.clear(); + num_groups_ = 0; + return Status::OK(); + } + Result Consume(const ExecSpan& batch, int64_t offset, int64_t length) override { ARROW_RETURN_NOT_OK(CheckAndCapLengthForConsume(batch.length, offset, &length)); if (offset != 0 || length != batch.length) { @@ -595,7 +599,18 @@ struct GrouperFastImpl : public Grouper { return std::move(impl); } - ~GrouperFastImpl() { map_.cleanup(); } + Status Reset() override { + ARROW_DCHECK_EQ(temp_stack_.AllocatedSize(), 0); + rows_.Clean(); + rows_minibatch_.Clean(); + map_.cleanup(); + RETURN_NOT_OK(map_.init(encode_ctx_.hardware_flags, ctx_->memory_pool())); + // TODO: It is now assumed that the dictionaries_ are identical to the first batch + // throughout the grouper's lifespan so no resetting is needed. But if we want to + // support different dictionaries for different batches, we need to reset the + // dictionaries_ here. 
+ return Status::OK(); + } Result Consume(const ExecSpan& batch, int64_t offset, int64_t length) override { ARROW_RETURN_NOT_OK(CheckAndCapLengthForConsume(batch.length, offset, &length)); @@ -838,8 +853,7 @@ struct GrouperFastImpl : public Grouper { return out; } - static constexpr int log_minibatch_max_ = 10; - static constexpr int minibatch_size_max_ = 1 << log_minibatch_max_; + static constexpr int minibatch_size_max_ = arrow::util::MiniBatch::kMiniBatchLength; static constexpr int minibatch_size_min_ = 128; int minibatch_size_; diff --git a/cpp/src/arrow/compute/row/grouper.h b/cpp/src/arrow/compute/row/grouper.h index 628a9c14f3e44..a883fb938ddaf 100644 --- a/cpp/src/arrow/compute/row/grouper.h +++ b/cpp/src/arrow/compute/row/grouper.h @@ -109,6 +109,10 @@ class ARROW_EXPORT Grouper { static Result> Make(const std::vector& key_types, ExecContext* ctx = default_exec_context()); + /// Reset all intermediate state, make the grouper logically as just `Make`ed. + /// The underlying buffers, if any, may or may not be released though. + virtual Status Reset() = 0; + /// Consume a batch of keys, producing the corresponding group ids as an integer array, /// over a slice defined by an offset and length, which defaults to the batch length. /// Currently only uint32 indices will be produced, eventually the bit width will only diff --git a/cpp/src/arrow/compute/row/grouper_test.cc b/cpp/src/arrow/compute/row/grouper_test.cc new file mode 100644 index 0000000000000..1e853be5e4af7 --- /dev/null +++ b/cpp/src/arrow/compute/row/grouper_test.cc @@ -0,0 +1,68 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
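// ---------------------------------------------------------------------------
// [Editor's note] Illustrative sketch, not part of the patch. The grouper.cc
// change above replaces "recreate the Grouper for every segment" with a single
// Grouper that is Reset() between segments (GrouperImpl::Reset clears its map,
// offsets and key bytes). The standalone example below mirrors that
// reset-instead-of-recreate pattern; TinyGrouper and its members are
// hypothetical names, not Arrow APIs.
#include <cstdint>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

class TinyGrouper {
 public:
  // Map a key to a dense group id, assigning a new id on first sight.
  uint32_t Consume(const std::string& key) {
    auto [it, inserted] = map_.try_emplace(key, num_groups_);
    if (inserted) ++num_groups_;
    return it->second;
  }
  uint32_t num_groups() const { return num_groups_; }
  // Drop all group ids so the next segment starts from zero, reusing the same
  // object (and, with unordered_map, much of its allocated memory).
  void Reset() {
    map_.clear();
    num_groups_ = 0;
  }

 private:
  std::unordered_map<std::string, uint32_t> map_;
  uint32_t num_groups_ = 0;
};

int main() {
  TinyGrouper grouper;
  const std::vector<std::vector<std::string>> segments = {{"a", "b", "a"}, {"c", "c"}};
  for (const auto& segment : segments) {
    for (const auto& key : segment) grouper.Consume(key);
    std::cout << "groups in segment: " << grouper.num_groups() << "\n";  // 2 then 1
    grouper.Reset();  // reuse instead of recreating, as AnyKeysSegmenter now does
  }
  return 0;
}
// ---------------------------------------------------------------------------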
+ +#include + +#include "arrow/compute/exec.h" +#include "arrow/compute/row/grouper.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/testing/random.h" + +namespace arrow { +namespace compute { + +// Specialized case for GH-40997 +TEST(Grouper, ResortedColumnsWithLargeNullRows) { + const uint64_t num_rows = 1024; + + // construct random array with plenty of null values + const int32_t kSeed = 42; + const int32_t min = 0; + const int32_t max = 100; + const double null_probability = 0.3; + const double true_probability = 0.5; + auto rng = random::RandomArrayGenerator(kSeed); + auto b_arr = rng.Boolean(num_rows, true_probability, null_probability); + auto i32_arr = rng.Int32(num_rows, min, max, null_probability); + auto i64_arr = rng.Int64(num_rows, min, max * 10, null_probability); + + // construct batches with columns which will be resorted in the grouper make + std::vector exec_batches = {ExecBatch({i64_arr, i32_arr, b_arr}, num_rows), + ExecBatch({i32_arr, i64_arr, b_arr}, num_rows), + ExecBatch({i64_arr, b_arr, i32_arr}, num_rows), + ExecBatch({i32_arr, b_arr, i64_arr}, num_rows), + ExecBatch({b_arr, i32_arr, i64_arr}, num_rows), + ExecBatch({b_arr, i64_arr, i32_arr}, num_rows)}; + + const int num_batches = static_cast(exec_batches.size()); + std::vector group_num_vec; + group_num_vec.reserve(num_batches); + + for (const auto& exec_batch : exec_batches) { + ExecSpan span(exec_batch); + ASSERT_OK_AND_ASSIGN(auto grouper, Grouper::Make(span.GetTypes())); + ASSERT_OK_AND_ASSIGN(Datum group_ids, grouper->Consume(span)); + group_num_vec.emplace_back(grouper->num_groups()); + } + + for (int i = 1; i < num_batches; i++) { + ASSERT_EQ(group_num_vec[i - 1], group_num_vec[i]); + } +} + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/row/row_internal.cc b/cpp/src/arrow/compute/row/row_internal.cc index f6a62c09fcf24..469205e9b008d 100644 --- a/cpp/src/arrow/compute/row/row_internal.cc +++ b/cpp/src/arrow/compute/row/row_internal.cc @@ -66,7 +66,8 @@ void RowTableMetadata::FromColumnMetadataVector( // // Columns are sorted based on the size in bytes of their fixed-length part. // For the varying-length column, the fixed-length part is the 32-bit field storing - // cumulative length of varying-length fields. + // cumulative length of varying-length fields. This is to make the memory access of + // each individual column within the encoded row alignment-friendly. // // The rules are: // diff --git a/cpp/src/arrow/compute/util.cc b/cpp/src/arrow/compute/util.cc index b0c863b26a062..b90b3a64056bd 100644 --- a/cpp/src/arrow/compute/util.cc +++ b/cpp/src/arrow/compute/util.cc @@ -17,11 +17,7 @@ #include "arrow/compute/util.h" -#include "arrow/table.h" -#include "arrow/util/bit_util.h" -#include "arrow/util/bitmap_ops.h" #include "arrow/util/logging.h" -#include "arrow/util/tracing_internal.h" #include "arrow/util/ubsan.h" namespace arrow { @@ -31,33 +27,6 @@ using internal::CpuInfo; namespace util { -void TempVectorStack::alloc(uint32_t num_bytes, uint8_t** data, int* id) { - int64_t new_top = top_ + EstimatedAllocationSize(num_bytes); - // Stack overflow check (see GH-39582). - // XXX cannot return a regular Status because most consumers do not either. 
- ARROW_CHECK_LE(new_top, buffer_size_) << "TempVectorStack::alloc overflow"; - *data = buffer_->mutable_data() + top_ + sizeof(uint64_t); - // We set 8 bytes before the beginning of the allocated range and - // 8 bytes after the end to check for stack overflow (which would - // result in those known bytes being corrupted). - reinterpret_cast(buffer_->mutable_data() + top_)[0] = kGuard1; - reinterpret_cast(buffer_->mutable_data() + new_top)[-1] = kGuard2; - *id = num_vectors_++; - top_ = new_top; -} - -void TempVectorStack::release(int id, uint32_t num_bytes) { - ARROW_DCHECK(num_vectors_ == id + 1); - int64_t size = EstimatedAllocationSize(num_bytes); - ARROW_DCHECK(reinterpret_cast(buffer_->mutable_data() + top_)[-1] == - kGuard2); - ARROW_DCHECK(top_ >= size); - top_ -= size; - ARROW_DCHECK(reinterpret_cast(buffer_->mutable_data() + top_)[0] == - kGuard1); - --num_vectors_; -} - namespace bit_util { inline uint64_t SafeLoadUpTo8Bytes(const uint8_t* bytes, int num_bytes) { diff --git a/cpp/src/arrow/compute/util.h b/cpp/src/arrow/compute/util.h index 88dce160ce936..d56e398667f66 100644 --- a/cpp/src/arrow/compute/util.h +++ b/cpp/src/arrow/compute/util.h @@ -24,17 +24,10 @@ #include #include -#include "arrow/buffer.h" #include "arrow/compute/expression.h" #include "arrow/compute/type_fwd.h" -#include "arrow/memory_pool.h" #include "arrow/result.h" -#include "arrow/status.h" -#include "arrow/util/bit_util.h" #include "arrow/util/cpu_info.h" -#include "arrow/util/mutex.h" -#include "arrow/util/thread_pool.h" -#include "arrow/util/type_fwd.h" #if defined(__clang__) || defined(__GNUC__) #define BYTESWAP(x) __builtin_bswap64(x) @@ -77,72 +70,6 @@ class MiniBatch { static constexpr int kMiniBatchLength = 1 << kLogMiniBatchLength; }; -/// Storage used to allocate temporary vectors of a batch size. -/// Temporary vectors should resemble allocating temporary variables on the stack -/// but in the context of vectorized processing where we need to store a vector of -/// temporaries instead of a single value. -class ARROW_EXPORT TempVectorStack { - template - friend class TempVectorHolder; - - public: - Status Init(MemoryPool* pool, int64_t size) { - num_vectors_ = 0; - top_ = 0; - buffer_size_ = EstimatedAllocationSize(size); - ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateResizableBuffer(size, pool)); - // Ensure later operations don't accidentally read uninitialized memory. - std::memset(buffer->mutable_data(), 0xFF, size); - buffer_ = std::move(buffer); - return Status::OK(); - } - - private: - static int64_t EstimatedAllocationSize(int64_t size) { - return PaddedAllocationSize(size) + 2 * sizeof(uint64_t); - } - - static int64_t PaddedAllocationSize(int64_t num_bytes) { - // Round up allocation size to multiple of 8 bytes - // to avoid returning temp vectors with unaligned address. - // - // Also add padding at the end to facilitate loads and stores - // using SIMD when number of vector elements is not divisible - // by the number of SIMD lanes. 
- // - return ::arrow::bit_util::RoundUp(num_bytes, sizeof(int64_t)) + kPadding; - } - void alloc(uint32_t num_bytes, uint8_t** data, int* id); - void release(int id, uint32_t num_bytes); - static constexpr uint64_t kGuard1 = 0x3141592653589793ULL; - static constexpr uint64_t kGuard2 = 0x0577215664901532ULL; - static constexpr int64_t kPadding = 64; - int num_vectors_; - int64_t top_; - std::unique_ptr buffer_; - int64_t buffer_size_; -}; - -template -class TempVectorHolder { - friend class TempVectorStack; - - public: - ~TempVectorHolder() { stack_->release(id_, num_elements_ * sizeof(T)); } - T* mutable_data() { return reinterpret_cast(data_); } - TempVectorHolder(TempVectorStack* stack, uint32_t num_elements) { - stack_ = stack; - num_elements_ = num_elements; - stack_->alloc(num_elements * sizeof(T), &data_, &id_); - } - - private: - TempVectorStack* stack_; - uint8_t* data_; - int id_; - uint32_t num_elements_; -}; - namespace bit_util { ARROW_EXPORT void bits_to_indexes(int bit_to_search, int64_t hardware_flags, diff --git a/cpp/src/arrow/compute/util_internal.cc b/cpp/src/arrow/compute/util_internal.cc new file mode 100644 index 0000000000000..cc26982fef110 --- /dev/null +++ b/cpp/src/arrow/compute/util_internal.cc @@ -0,0 +1,79 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/compute/util_internal.h" + +#include "arrow/compute/util.h" +#include "arrow/memory_pool.h" + +namespace arrow { +namespace util { + +Status TempVectorStack::Init(MemoryPool* pool, int64_t size) { + num_vectors_ = 0; + top_ = 0; + buffer_size_ = EstimatedAllocationSize(size); + ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateResizableBuffer(size, pool)); + // Ensure later operations don't accidentally read uninitialized memory. + std::memset(buffer->mutable_data(), 0xFF, size); + buffer_ = std::move(buffer); + return Status::OK(); +} + +int64_t TempVectorStack::PaddedAllocationSize(int64_t num_bytes) { + // Round up allocation size to multiple of 8 bytes + // to avoid returning temp vectors with unaligned address. + // + // Also add padding at the end to facilitate loads and stores + // using SIMD when number of vector elements is not divisible + // by the number of SIMD lanes. + // + return ::arrow::bit_util::RoundUp(num_bytes, sizeof(int64_t)) + kPadding; +} + +void TempVectorStack::alloc(uint32_t num_bytes, uint8_t** data, int* id) { + int64_t estimated_alloc_size = EstimatedAllocationSize(num_bytes); + int64_t new_top = top_ + estimated_alloc_size; + // Stack overflow check (see GH-39582). + // XXX cannot return a regular Status because most consumers do not either. 
+ ARROW_CHECK_LE(new_top, buffer_size_) + << "TempVectorStack::alloc overflow: allocating " << estimated_alloc_size + << " on top of " << top_ << " in stack of size " << buffer_size_; + *data = buffer_->mutable_data() + top_ + sizeof(uint64_t); + // We set 8 bytes before the beginning of the allocated range and + // 8 bytes after the end to check for stack overflow (which would + // result in those known bytes being corrupted). + reinterpret_cast(buffer_->mutable_data() + top_)[0] = kGuard1; + reinterpret_cast(buffer_->mutable_data() + new_top)[-1] = kGuard2; + *id = num_vectors_++; + top_ = new_top; +} + +void TempVectorStack::release(int id, uint32_t num_bytes) { + ARROW_DCHECK(num_vectors_ == id + 1); + int64_t size = EstimatedAllocationSize(num_bytes); + ARROW_DCHECK(reinterpret_cast(buffer_->mutable_data() + top_)[-1] == + kGuard2); + ARROW_DCHECK(top_ >= size); + top_ -= size; + ARROW_DCHECK(reinterpret_cast(buffer_->mutable_data() + top_)[0] == + kGuard1); + --num_vectors_; +} + +} // namespace util +} // namespace arrow diff --git a/cpp/src/arrow/compute/util_internal.h b/cpp/src/arrow/compute/util_internal.h index 87e89a3350721..043ff118062e4 100644 --- a/cpp/src/arrow/compute/util_internal.h +++ b/cpp/src/arrow/compute/util_internal.h @@ -17,6 +17,8 @@ #pragma once +#include "arrow/status.h" +#include "arrow/type_fwd.h" #include "arrow/util/logging.h" namespace arrow { @@ -27,5 +29,56 @@ void CheckAlignment(const void* ptr) { ARROW_DCHECK(reinterpret_cast(ptr) % sizeof(T) == 0); } +/// Storage used to allocate temporary vectors of a batch size. +/// Temporary vectors should resemble allocating temporary variables on the stack +/// but in the context of vectorized processing where we need to store a vector of +/// temporaries instead of a single value. 
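// ---------------------------------------------------------------------------
// [Editor's note] Illustrative sketch, not part of the patch. It restates the
// size accounting used by TempVectorStack (declared just below): every
// allocation is rounded up to 8 bytes, padded by 64 bytes so SIMD loads/stores
// may overrun, and bracketed by two 8-byte guard words that alloc()/release()
// check for overflow. The constants mirror kPadding and the guard sizes in this
// header; everything else is a hypothetical standalone driver.
#include <cstdint>
#include <iostream>

constexpr int64_t kPadding = 64;

int64_t PaddedAllocationSize(int64_t num_bytes) {
  // Round up to a multiple of 8 bytes, then add the SIMD padding.
  return ((num_bytes + 7) / 8) * 8 + kPadding;
}

int64_t EstimatedAllocationSize(int64_t num_bytes) {
  // Two 8-byte guard words surround every allocation.
  return PaddedAllocationSize(num_bytes) + 2 * static_cast<int64_t>(sizeof(uint64_t));
}

int main() {
  // A temp vector of 1024 one-byte elements consumes 1024 + 64 + 16 = 1104
  // bytes of the stack, which is what callers must budget for in Init().
  std::cout << EstimatedAllocationSize(1024) << "\n";  // prints 1104
  return 0;
}
// ---------------------------------------------------------------------------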
+class ARROW_EXPORT TempVectorStack { + template + friend class TempVectorHolder; + + public: + Status Init(MemoryPool* pool, int64_t size); + + int64_t AllocatedSize() const { return top_; } + + private: + static int64_t EstimatedAllocationSize(int64_t size) { + return PaddedAllocationSize(size) + 2 * sizeof(uint64_t); + } + + static int64_t PaddedAllocationSize(int64_t num_bytes); + + void alloc(uint32_t num_bytes, uint8_t** data, int* id); + void release(int id, uint32_t num_bytes); + static constexpr uint64_t kGuard1 = 0x3141592653589793ULL; + static constexpr uint64_t kGuard2 = 0x0577215664901532ULL; + static constexpr int64_t kPadding = 64; + int num_vectors_; + int64_t top_; + std::unique_ptr buffer_; + int64_t buffer_size_; +}; + +template +class TempVectorHolder { + friend class TempVectorStack; + + public: + ~TempVectorHolder() { stack_->release(id_, num_elements_ * sizeof(T)); } + T* mutable_data() { return reinterpret_cast(data_); } + TempVectorHolder(TempVectorStack* stack, uint32_t num_elements) { + stack_ = stack; + num_elements_ = num_elements; + stack_->alloc(num_elements * sizeof(T), &data_, &id_); + } + + private: + TempVectorStack* stack_; + uint8_t* data_; + int id_; + uint32_t num_elements_; +}; + } // namespace util } // namespace arrow diff --git a/cpp/src/arrow/dataset/discovery_test.cc b/cpp/src/arrow/dataset/discovery_test.cc index 92cec7f324963..981146b7999ef 100644 --- a/cpp/src/arrow/dataset/discovery_test.cc +++ b/cpp/src/arrow/dataset/discovery_test.cc @@ -144,7 +144,8 @@ class FileSystemDatasetFactoryTest : public DatasetFactoryTest { } options_ = std::make_shared(); options_->dataset_schema = schema; - ASSERT_OK_AND_ASSIGN(auto projection, ProjectionDescr::Default(*schema)); + ASSERT_OK_AND_ASSIGN(auto projection, ProjectionDescr::Default( + *schema, options_->add_augmented_fields)); SetProjection(options_.get(), std::move(projection)); ASSERT_OK_AND_ASSIGN(dataset_, factory_->Finish(schema)); ASSERT_OK_AND_ASSIGN(auto fragment_it, dataset_->GetFragments()); diff --git a/cpp/src/arrow/dataset/file_parquet_encryption_test.cc b/cpp/src/arrow/dataset/file_parquet_encryption_test.cc index 307017fd67e06..0287d593d12d3 100644 --- a/cpp/src/arrow/dataset/file_parquet_encryption_test.cc +++ b/cpp/src/arrow/dataset/file_parquet_encryption_test.cc @@ -148,17 +148,22 @@ class DatasetEncryptionTestBase : public ::testing::Test { FileSystemDatasetFactory::Make(file_system_, selector, file_format, factory_options)); - // Read dataset into table + // Create the dataset ASSERT_OK_AND_ASSIGN(auto dataset, dataset_factory->Finish()); - ASSERT_OK_AND_ASSIGN(auto scanner_builder, dataset->NewScan()); - ASSERT_OK_AND_ASSIGN(auto scanner, scanner_builder->Finish()); - ASSERT_OK_AND_ASSIGN(auto read_table, scanner->ToTable()); - - // Verify the data was read correctly - ASSERT_OK_AND_ASSIGN(auto combined_table, read_table->CombineChunks()); - // Validate the table - ASSERT_OK(combined_table->ValidateFull()); - AssertTablesEqual(*combined_table, *table_); + + // Reuse the dataset above to scan it twice to make sure decryption works correctly. 
+ for (size_t i = 0; i < 2; ++i) { + // Read dataset into table + ASSERT_OK_AND_ASSIGN(auto scanner_builder, dataset->NewScan()); + ASSERT_OK_AND_ASSIGN(auto scanner, scanner_builder->Finish()); + ASSERT_OK_AND_ASSIGN(auto read_table, scanner->ToTable()); + + // Verify the data was read correctly + ASSERT_OK_AND_ASSIGN(auto combined_table, read_table->CombineChunks()); + // Validate the table + ASSERT_OK(combined_table->ValidateFull()); + AssertTablesEqual(*combined_table, *table_); + } } protected: diff --git a/cpp/src/arrow/dataset/file_parquet_test.cc b/cpp/src/arrow/dataset/file_parquet_test.cc index 76cd0af3b835f..bf626826d4d1b 100644 --- a/cpp/src/arrow/dataset/file_parquet_test.cc +++ b/cpp/src/arrow/dataset/file_parquet_test.cc @@ -330,8 +330,9 @@ TEST_F(TestParquetFileFormat, CachedMetadata) { // Read the file the first time, will read metadata auto options = std::make_shared(); options->filter = literal(true); - ASSERT_OK_AND_ASSIGN(auto projection_descr, - ProjectionDescr::FromNames({"x"}, *test_schema)); + ASSERT_OK_AND_ASSIGN( + auto projection_descr, + ProjectionDescr::FromNames({"x"}, *test_schema, options->add_augmented_fields)); options->projected_schema = projection_descr.schema; options->projection = projection_descr.expression; ASSERT_OK_AND_ASSIGN(auto generator, fragment->ScanBatchesAsync(options)); diff --git a/cpp/src/arrow/dataset/scanner.cc b/cpp/src/arrow/dataset/scanner.cc index 18981d1451980..a856a792a264f 100644 --- a/cpp/src/arrow/dataset/scanner.cc +++ b/cpp/src/arrow/dataset/scanner.cc @@ -211,7 +211,8 @@ Status NormalizeScanOptions(const std::shared_ptr& scan_options, // create the projected schema only if the provided expressions // produces valid set of fields. ARROW_ASSIGN_OR_RAISE(auto projection_descr, - ProjectionDescr::Default(*projected_schema)); + ProjectionDescr::Default( + *projected_schema, scan_options->add_augmented_fields)); scan_options->projected_schema = std::move(projection_descr.schema); scan_options->projection = projection_descr.expression; ARROW_ASSIGN_OR_RAISE(scan_options->projection, @@ -220,7 +221,8 @@ Status NormalizeScanOptions(const std::shared_ptr& scan_options, // if projected_fields are not found, we default to creating the projected_schema // and projection from the dataset_schema. 
ARROW_ASSIGN_OR_RAISE(auto projection_descr, - ProjectionDescr::Default(*dataset_schema)); + ProjectionDescr::Default( + *dataset_schema, scan_options->add_augmented_fields)); scan_options->projected_schema = std::move(projection_descr.schema); scan_options->projection = projection_descr.expression; } @@ -231,7 +233,7 @@ Status NormalizeScanOptions(const std::shared_ptr& scan_options, ARROW_ASSIGN_OR_RAISE( auto projection_descr, ProjectionDescr::FromNames(scan_options->projected_schema->field_names(), - *dataset_schema)); + *dataset_schema, scan_options->add_augmented_fields)); scan_options->projection = projection_descr.expression; } @@ -730,7 +732,8 @@ Future AsyncScanner::CountRowsAsync(Executor* executor) { const auto options = std::make_shared(*scan_options_); ARROW_ASSIGN_OR_RAISE(auto empty_projection, ProjectionDescr::FromNames(std::vector(), - *scan_options_->dataset_schema)); + *scan_options_->dataset_schema, + scan_options_->add_augmented_fields)); SetProjection(options.get(), empty_projection); auto total = std::make_shared>(0); @@ -828,7 +831,8 @@ Result ProjectionDescr::FromExpressions( } Result ProjectionDescr::FromNames(std::vector names, - const Schema& dataset_schema) { + const Schema& dataset_schema, + bool add_augmented_fields) { std::vector exprs(names.size()); for (size_t i = 0; i < exprs.size(); ++i) { // If name isn't in schema, try finding it by dotted path. @@ -846,15 +850,19 @@ Result ProjectionDescr::FromNames(std::vector name } } auto fields = dataset_schema.fields(); - for (const auto& aug_field : kAugmentedFields) { - fields.push_back(aug_field); + if (add_augmented_fields) { + for (const auto& aug_field : kAugmentedFields) { + fields.push_back(aug_field); + } } return ProjectionDescr::FromExpressions(std::move(exprs), std::move(names), Schema(fields, dataset_schema.metadata())); } -Result ProjectionDescr::Default(const Schema& dataset_schema) { - return ProjectionDescr::FromNames(dataset_schema.field_names(), dataset_schema); +Result ProjectionDescr::Default(const Schema& dataset_schema, + bool add_augmented_fields) { + return ProjectionDescr::FromNames(dataset_schema.field_names(), dataset_schema, + add_augmented_fields); } void SetProjection(ScanOptions* options, ProjectionDescr projection) { @@ -899,7 +907,8 @@ const std::shared_ptr& ScannerBuilder::projected_schema() const { Status ScannerBuilder::Project(std::vector columns) { ARROW_ASSIGN_OR_RAISE( auto projection, - ProjectionDescr::FromNames(std::move(columns), *scan_options_->dataset_schema)); + ProjectionDescr::FromNames(std::move(columns), *scan_options_->dataset_schema, + scan_options_->add_augmented_fields)); SetProjection(scan_options_.get(), std::move(projection)); return Status::OK(); } @@ -1052,8 +1061,10 @@ Result MakeScanNode(acero::ExecPlan* plan, }); auto fields = scan_options->dataset_schema->fields(); - for (const auto& aug_field : kAugmentedFields) { - fields.push_back(aug_field); + if (scan_options->add_augmented_fields) { + for (const auto& aug_field : kAugmentedFields) { + fields.push_back(aug_field); + } } return acero::MakeExecNode( diff --git a/cpp/src/arrow/dataset/scanner.h b/cpp/src/arrow/dataset/scanner.h index 4479158ff20cc..d2de267897180 100644 --- a/cpp/src/arrow/dataset/scanner.h +++ b/cpp/src/arrow/dataset/scanner.h @@ -114,6 +114,9 @@ struct ARROW_DS_EXPORT ScanOptions { /// Note: This must be true in order for any readahead to happen bool use_threads = false; + /// If true the scanner will add augmented fields to the output schema. 
+ bool add_augmented_fields = true; + /// Fragment-specific scan options. std::shared_ptr fragment_scan_options; @@ -287,10 +290,12 @@ struct ARROW_DS_EXPORT ProjectionDescr { /// \brief Create a default projection referencing fields in the dataset schema static Result FromNames(std::vector names, - const Schema& dataset_schema); + const Schema& dataset_schema, + bool add_augmented_fields = true); /// \brief Make a projection that projects every field in the dataset schema - static Result Default(const Schema& dataset_schema); + static Result Default(const Schema& dataset_schema, + bool add_augmented_fields = true); }; /// \brief Utility method to set the projection expression and schema diff --git a/cpp/src/arrow/dataset/scanner_test.cc b/cpp/src/arrow/dataset/scanner_test.cc index fccfc80032d31..58bc9c8c0ea6b 100644 --- a/cpp/src/arrow/dataset/scanner_test.cc +++ b/cpp/src/arrow/dataset/scanner_test.cc @@ -1103,7 +1103,8 @@ TEST_P(TestScanner, ProjectionDefaults) { } // If we only specify a projection expression then infer the projected schema // from the projection expression - auto projection_desc = ProjectionDescr::FromNames({"i32"}, *schema_); + auto projection_desc = + ProjectionDescr::FromNames({"i32"}, *schema_, /*add_augmented_fields=*/true); { ARROW_SCOPED_TRACE("User only specifies projection"); options_->projection = projection_desc->expression; @@ -1148,7 +1149,8 @@ TEST_P(TestScanner, ProjectedScanNestedFromNames) { }); ASSERT_OK_AND_ASSIGN(auto descr, ProjectionDescr::FromNames({".struct.i32", "nested.right.f64"}, - *options_->dataset_schema)) + *options_->dataset_schema, + options_->add_augmented_fields)) SetProjection(options_.get(), std::move(descr)); auto batch_in = ConstantArrayGenerator::Zeroes(GetParam().items_per_batch, schema_); auto batch_out = ConstantArrayGenerator::Zeroes( @@ -2106,7 +2108,8 @@ TEST(ScanOptions, TestMaterializedFields) { auto set_projection_from_names = [&opts](std::vector names) { ASSERT_OK_AND_ASSIGN(auto projection, ProjectionDescr::FromNames( - std::move(names), *opts->dataset_schema)); + std::move(names), *opts->dataset_schema, + opts->add_augmented_fields)); SetProjection(opts.get(), std::move(projection)); }; @@ -2160,7 +2163,8 @@ TEST(ScanOptions, TestMaterializedFields) { // project top-level field, filter nothing opts->filter = literal(true); ASSERT_OK_AND_ASSIGN(projection, - ProjectionDescr::FromNames({"nested"}, *opts->dataset_schema)); + ProjectionDescr::FromNames({"nested"}, *opts->dataset_schema, + opts->add_augmented_fields)); SetProjection(opts.get(), std::move(projection)); EXPECT_THAT(opts->MaterializedFields(), ElementsAre(FieldRef("nested"))); diff --git a/cpp/src/arrow/dataset/test_util_internal.h b/cpp/src/arrow/dataset/test_util_internal.h index de0519afac9e1..8195218b0cfe8 100644 --- a/cpp/src/arrow/dataset/test_util_internal.h +++ b/cpp/src/arrow/dataset/test_util_internal.h @@ -386,7 +386,8 @@ class DatasetFixtureMixin : public ::testing::Test { options_ = std::make_shared(); options_->dataset_schema = schema_; ASSERT_OK_AND_ASSIGN(auto projection, - ProjectionDescr::FromNames(schema_->field_names(), *schema_)); + ProjectionDescr::FromNames(schema_->field_names(), *schema_, + options_->add_augmented_fields)); SetProjection(options_.get(), std::move(projection)); SetFilter(literal(true)); } @@ -398,7 +399,8 @@ class DatasetFixtureMixin : public ::testing::Test { void SetProjectedColumns(std::vector column_names) { ASSERT_OK_AND_ASSIGN( auto projection, - ProjectionDescr::FromNames(std::move(column_names), 
*options_->dataset_schema)); + ProjectionDescr::FromNames(std::move(column_names), *options_->dataset_schema, + /*add_augmented_fields=*/true)); SetProjection(options_.get(), std::move(projection)); } @@ -502,7 +504,8 @@ class FileFormatFixtureMixin : public ::testing::Test { void SetSchema(std::vector> fields) { opts_->dataset_schema = schema(std::move(fields)); ASSERT_OK_AND_ASSIGN(auto projection, - ProjectionDescr::Default(*opts_->dataset_schema)); + ProjectionDescr::Default(*opts_->dataset_schema, + /*add_augmented_fields=*/true)); SetProjection(opts_.get(), std::move(projection)); } @@ -512,7 +515,8 @@ class FileFormatFixtureMixin : public ::testing::Test { void Project(std::vector names) { ASSERT_OK_AND_ASSIGN(auto projection, ProjectionDescr::FromNames( - std::move(names), *opts_->dataset_schema)); + std::move(names), *opts_->dataset_schema, + /*add_augmented_fields=*/true)); SetProjection(opts_.get(), std::move(projection)); } @@ -993,7 +997,8 @@ class FileFormatScanMixin : public FileFormatFixtureMixin, auto i64 = field("i64", int64()); this->opts_->dataset_schema = schema({i32, i32, i64}); ASSERT_RAISES(Invalid, - ProjectionDescr::FromNames({"i32"}, *this->opts_->dataset_schema)); + ProjectionDescr::FromNames({"i32"}, *this->opts_->dataset_schema, + /*add_augmented_fields=*/true)); } void TestScanWithPushdownNulls() { // Regression test for ARROW-15312 @@ -1933,7 +1938,8 @@ class WriteFileSystemDatasetMixin : public MakeFileSystemDatasetMixin { scan_options_->dataset_schema = dataset_->schema(); ASSERT_OK_AND_ASSIGN( auto projection, - ProjectionDescr::FromNames(source_schema_->field_names(), *dataset_->schema())); + ProjectionDescr::FromNames(source_schema_->field_names(), *dataset_->schema(), + scan_options_->add_augmented_fields)); SetProjection(scan_options_.get(), std::move(projection)); } diff --git a/cpp/src/arrow/engine/substrait/relation_internal.cc b/cpp/src/arrow/engine/substrait/relation_internal.cc index f15f1a5527b7b..7c462c418f81b 100644 --- a/cpp/src/arrow/engine/substrait/relation_internal.cc +++ b/cpp/src/arrow/engine/substrait/relation_internal.cc @@ -393,6 +393,7 @@ Result FromProto(const substrait::Rel& rel, const ExtensionSet& auto scan_options = std::make_shared(); scan_options->use_threads = true; + scan_options->add_augmented_fields = false; if (read.has_filter()) { ARROW_ASSIGN_OR_RAISE(scan_options->filter, diff --git a/cpp/src/arrow/engine/substrait/serde_test.cc b/cpp/src/arrow/engine/substrait/serde_test.cc index 3e80192377937..6762d1e045450 100644 --- a/cpp/src/arrow/engine/substrait/serde_test.cc +++ b/cpp/src/arrow/engine/substrait/serde_test.cc @@ -1064,6 +1064,86 @@ NamedTableProvider AlwaysProvideSameTable(std::shared_ptr table) { }; } +TEST(Substrait, ExecReadRelWithLocalFiles) { + ASSERT_OK_AND_ASSIGN(std::string dir_string, + arrow::internal::GetEnvVar("PARQUET_TEST_DATA")); + + std::string substrait_json = R"({ + "relations": [ + { + "root": { + "input": { + "read": { + "common": { + "direct": {} + }, + "baseSchema": { + "names": [ + "f32", + "f64" + ], + "struct": { + "types": [ + { + "fp32": { + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "fp64": { + "nullability": "NULLABILITY_REQUIRED" + } + } + ], + "nullability": "NULLABILITY_REQUIRED" + } + }, + "localFiles": { + "items": [ + { + "uriFile": "file://[DIRECTORY_PLACEHOLDER]/byte_stream_split.zstd.parquet", + "parquet": {} + } + ] + } + } + }, + "names": [ + "f32", + "f64" + ] + } + } + ], + "version": { + "minorNumber": 42, + "producer": "my-producer" + } + })"; + const char* 
placeholder = "[DIRECTORY_PLACEHOLDER]"; + substrait_json.replace(substrait_json.find(placeholder), strlen(placeholder), + dir_string); + + ASSERT_OK_AND_ASSIGN(auto buf, + internal::SubstraitFromJSON("Plan", substrait_json, + /*ignore_unknown_fields=*/false)); + + ASSERT_OK_AND_ASSIGN(auto declarations, + DeserializePlans(*buf, acero::NullSinkNodeConsumer::Make)); + ASSERT_EQ(declarations.size(), 1); + acero::Declaration* decl = &declarations[0]; + ASSERT_EQ(decl->factory_name, "consuming_sink"); + ASSERT_OK_AND_ASSIGN(auto plan, acero::ExecPlan::Make()); + ASSERT_OK_AND_ASSIGN(auto sink_node, declarations[0].AddToPlan(plan.get())); + ASSERT_STREQ(sink_node->kind_name(), "ConsumingSinkNode"); + ASSERT_EQ(sink_node->num_inputs(), 1); + auto& prev_node = sink_node->inputs()[0]; + ASSERT_STREQ(prev_node->kind_name(), "SourceNode"); + + plan->StartProducing(); + ASSERT_FINISHES_OK(plan->finished()); +} + TEST(Substrait, RelWithHint) { ASSERT_OK_AND_ASSIGN(auto buf, internal::SubstraitFromJSON("Rel", R"({ @@ -2443,6 +2523,7 @@ TEST(SubstraitRoundTrip, BasicPlanEndToEnd) { auto scan_options = std::make_shared(); scan_options->projection = compute::project({}, {}); + scan_options->add_augmented_fields = false; const std::string filter_col_left = "shared"; const std::string filter_col_right = "distinct"; auto comp_left_value = compute::field_ref(filter_col_left); diff --git a/cpp/src/arrow/filesystem/azurefs.h b/cpp/src/arrow/filesystem/azurefs.h index 667b4e372ae59..b71a5ae73b2e9 100644 --- a/cpp/src/arrow/filesystem/azurefs.h +++ b/cpp/src/arrow/filesystem/azurefs.h @@ -141,18 +141,14 @@ struct ARROW_EXPORT AzureOptions { /// /// 1. abfs[s]://[:\@]\.blob.core.windows.net /// [/\[/\]] - /// 2. abfs[s]://\[:\]@\.dfs.core.windows.net - /// [/path] + /// 2. abfs[s]://\[:\]\@\.dfs.core.windows.net[/path] /// 3. abfs[s]://[\]@]\[\<:port\>] /// [/\[/path]] /// 4. abfs[s]://[\]@]\[/path] /// - /// 1. and 2. are compatible with the Azure Data Lake Storage Gen2 URIs: - /// https://learn.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-introduction-abfs-uri - /// - /// 3. is for Azure Blob Storage compatible service including Azurite. - /// - /// 4. is a shorter version of 1. and 2. + /// (1) and (2) are compatible with the Azure Data Lake Storage Gen2 URIs + /// [1], (3) is for Azure Blob Storage compatible service including Azurite, + /// and (4) is a shorter version of (1) and (2). /// /// Note that there is no difference between abfs and abfss. HTTPS is /// used with abfs by default. You can force to use HTTP by specifying @@ -178,6 +174,9 @@ struct ARROW_EXPORT AzureOptions { /// AzureOptions::ConfigureClientSecretCredential() is called. /// * client_secret: You must specify "tenant_id" and "client_id" /// too. AzureOptions::ConfigureClientSecretCredential() is called. + /// + /// [1]: + /// https://learn.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-introduction-abfs-uri static Result FromUri(const Uri& uri, std::string* out_path); static Result FromUri(const std::string& uri, std::string* out_path); @@ -226,7 +225,7 @@ struct ARROW_EXPORT AzureOptions { /// overwriting. /// - When you use the ListBlobs operation without specifying a delimiter, the results /// include both directories and blobs. If you choose to use a delimiter, use only a -/// forward slash (/) -- the only supported delimiter. +/// forward slash (/) \--- the only supported delimiter. /// - If you use the DeleteBlob API to delete a directory, that directory is deleted only /// if it's empty. 
This means that you can't use the Blob API delete directories /// recursively. diff --git a/cpp/src/arrow/filesystem/filesystem.cc b/cpp/src/arrow/filesystem/filesystem.cc index b79af08385c0c..284be685fa800 100644 --- a/cpp/src/arrow/filesystem/filesystem.cc +++ b/cpp/src/arrow/filesystem/filesystem.cc @@ -761,8 +761,8 @@ class FileSystemFactoryRegistry { RETURN_NOT_OK(CheckValid()); auto [it, success] = scheme_to_factory_.emplace( - std::move(scheme), Registered{std::move(factory), std::move(finalizer)}); - if (success) { + std::move(scheme), Registered{factory, std::move(finalizer)}); + if (success || (it->second.ok() && it->second->factory == factory)) { return Status::OK(); } diff --git a/cpp/src/arrow/filesystem/localfs_test.cc b/cpp/src/arrow/filesystem/localfs_test.cc index 1a20e44bc36e2..d68c992dff863 100644 --- a/cpp/src/arrow/filesystem/localfs_test.cc +++ b/cpp/src/arrow/filesystem/localfs_test.cc @@ -154,15 +154,16 @@ TEST(FileSystemFromUri, RuntimeRegisteredFactory) { EXPECT_THAT(FileSystemFromUri("slowfile2:///hey/yo", &path), Raises(StatusCode::Invalid)); - EXPECT_THAT(RegisterFileSystemFactory("slowfile2", {SlowFileSystemFactory, "", 0}), - Ok()); + EXPECT_THAT( + RegisterFileSystemFactory("slowfile2", {SlowFileSystemFactory, __FILE__, __LINE__}), + Ok()); ASSERT_OK_AND_ASSIGN(auto fs, FileSystemFromUri("slowfile2:///hey/yo", &path)); EXPECT_EQ(path, "/hey/yo"); EXPECT_EQ(fs->type_name(), "slow"); EXPECT_THAT( - RegisterFileSystemFactory("slowfile2", {SlowFileSystemFactory, "", 0}), + RegisterFileSystemFactory("slowfile2", {SlowFileSystemFactory, __FILE__, __LINE__}), Raises(StatusCode::KeyError, testing::HasSubstr("Attempted to register factory for scheme 'slowfile2' " "but that scheme is already registered"))); diff --git a/cpp/src/arrow/flight/client.h b/cpp/src/arrow/flight/client.h index 330fa8bad730d..613903108949e 100644 --- a/cpp/src/arrow/flight/client.h +++ b/cpp/src/arrow/flight/client.h @@ -15,8 +15,7 @@ // specific language governing permissions and limitations // under the License. -/// \brief Implementation of Flight RPC client. API should be -/// considered experimental for now +/// \brief Implementation of Flight RPC client. #pragma once @@ -177,7 +176,6 @@ class ARROW_FLIGHT_EXPORT FlightMetadataReader { }; /// \brief Client class for Arrow Flight RPC services. -/// API experimental for now class ARROW_FLIGHT_EXPORT FlightClient { public: ~FlightClient(); @@ -275,8 +273,6 @@ class ARROW_FLIGHT_EXPORT FlightClient { /// \param[in] options Per-RPC options /// \param[in] descriptor the dataset request /// \param[in] listener Callbacks for response and RPC completion - /// - /// This API is EXPERIMENTAL. void GetFlightInfoAsync(const FlightCallOptions& options, const FlightDescriptor& descriptor, std::shared_ptr> listener); @@ -288,8 +284,6 @@ class ARROW_FLIGHT_EXPORT FlightClient { /// \brief Asynchronous GetFlightInfo returning a Future. /// \param[in] options Per-RPC options /// \param[in] descriptor the dataset request - /// - /// This API is EXPERIMENTAL. arrow::Future GetFlightInfoAsync(const FlightCallOptions& options, const FlightDescriptor& descriptor); arrow::Future GetFlightInfoAsync(const FlightDescriptor& descriptor) { diff --git a/cpp/src/arrow/flight/cookie_internal.cc b/cpp/src/arrow/flight/cookie_internal.cc index 8f41106ebce5c..75a10d148bf47 100644 --- a/cpp/src/arrow/flight/cookie_internal.cc +++ b/cpp/src/arrow/flight/cookie_internal.cc @@ -15,8 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-// Interfaces for defining middleware for Flight clients. Currently -// experimental. +// Interfaces for defining middleware for Flight clients. #include "arrow/flight/cookie_internal.h" #include "arrow/flight/client.h" diff --git a/cpp/src/arrow/flight/middleware.h b/cpp/src/arrow/flight/middleware.h index 84448097ff019..d717e396a8b68 100644 --- a/cpp/src/arrow/flight/middleware.h +++ b/cpp/src/arrow/flight/middleware.h @@ -16,7 +16,7 @@ // under the License. // Interfaces for defining middleware for Flight clients and -// servers. Currently experimental. +// servers. #pragma once diff --git a/cpp/src/arrow/flight/server.h b/cpp/src/arrow/flight/server.h index ffcffe12e3c78..8d73353ab16c1 100644 --- a/cpp/src/arrow/flight/server.h +++ b/cpp/src/arrow/flight/server.h @@ -15,8 +15,7 @@ // specific language governing permissions and limitations // under the License. -// Interfaces to use for defining Flight RPC servers. API should be considered -// experimental for now +// Interfaces to use for defining Flight RPC servers. #pragma once diff --git a/cpp/src/arrow/flight/server_middleware.h b/cpp/src/arrow/flight/server_middleware.h index 030f1a17c2100..3a3e6f8616ed6 100644 --- a/cpp/src/arrow/flight/server_middleware.h +++ b/cpp/src/arrow/flight/server_middleware.h @@ -15,8 +15,7 @@ // specific language governing permissions and limitations // under the License. -// Interfaces for defining middleware for Flight servers. Currently -// experimental. +// Interfaces for defining middleware for Flight servers. #pragma once diff --git a/cpp/src/arrow/flight/sql/server.cc b/cpp/src/arrow/flight/sql/server.cc index cae3542b4faf8..63d1f5c5225fa 100644 --- a/cpp/src/arrow/flight/sql/server.cc +++ b/cpp/src/arrow/flight/sql/server.cc @@ -15,8 +15,7 @@ // specific language governing permissions and limitations // under the License. -// Interfaces to use for defining Flight RPC servers. API should be considered -// experimental for now +// Interfaces to use for defining Flight RPC servers. // Platform-specific defines #include "arrow/flight/platform.h" diff --git a/cpp/src/arrow/flight/sql/server.h b/cpp/src/arrow/flight/sql/server.h index 7b5d71678f3de..7130e96987b89 100644 --- a/cpp/src/arrow/flight/sql/server.h +++ b/cpp/src/arrow/flight/sql/server.h @@ -15,8 +15,7 @@ // specific language governing permissions and limitations // under the License. -// Interfaces to use for defining Flight RPC servers. API should be considered -// experimental for now +// Interfaces to use for defining Flight RPC servers. #pragma once diff --git a/cpp/src/arrow/flight/sql/server_session_middleware.h b/cpp/src/arrow/flight/sql/server_session_middleware.h index 021793de3de32..6eb11041a08bd 100644 --- a/cpp/src/arrow/flight/sql/server_session_middleware.h +++ b/cpp/src/arrow/flight/sql/server_session_middleware.h @@ -16,7 +16,6 @@ // under the License. // Middleware for handling Flight SQL Sessions including session cookie handling. -// Currently experimental. #pragma once diff --git a/cpp/src/arrow/flight/transport.h b/cpp/src/arrow/flight/transport.h index 4029aa5223deb..4ce50534023fc 100644 --- a/cpp/src/arrow/flight/transport.h +++ b/cpp/src/arrow/flight/transport.h @@ -19,8 +19,6 @@ /// Internal (but not private) interface for implementing /// alternate network transports in Flight. /// -/// \warning EXPERIMENTAL. Subject to change. -/// /// To implement a transport, implement ServerTransport and /// ClientTransport, and register the desired URI schemes with /// TransportRegistry. 
Flight takes care of most of the per-RPC @@ -248,8 +246,6 @@ TransportRegistry* GetDefaultTransportRegistry(); /// Transport implementations may subclass this to store their own /// state, and stash an instance in a user-supplied AsyncListener via /// ClientTransport::GetAsyncRpc and ClientTransport::SetAsyncRpc. -/// -/// This API is EXPERIMENTAL. class ARROW_FLIGHT_EXPORT AsyncRpc { public: virtual ~AsyncRpc() = default; diff --git a/cpp/src/arrow/flight/types.h b/cpp/src/arrow/flight/types.h index b3df8377b8ffd..cdf03f21041ee 100644 --- a/cpp/src/arrow/flight/types.h +++ b/cpp/src/arrow/flight/types.h @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -// Data structure for Flight RPC. API should be considered experimental for now +// Data structure for Flight RPC. #pragma once @@ -1115,8 +1115,6 @@ std::string ToString(TransportStatusCode code); /// instead of trying to translate to Arrow Status. /// /// Currently, only attached to the Status passed to AsyncListener::OnFinish. -/// -/// This API is EXPERIMENTAL. class ARROW_FLIGHT_EXPORT TransportStatusDetail : public StatusDetail { public: constexpr static const char* kTypeId = "flight::TransportStatusDetail"; diff --git a/cpp/src/arrow/flight/types_async.h b/cpp/src/arrow/flight/types_async.h index a241e64fb4e49..d5ed48d8a6438 100644 --- a/cpp/src/arrow/flight/types_async.h +++ b/cpp/src/arrow/flight/types_async.h @@ -31,8 +31,6 @@ namespace arrow::flight { /// @{ /// \brief Non-templated state for an async RPC. -/// -/// This API is EXPERIMENTAL. class ARROW_FLIGHT_EXPORT AsyncListenerBase { public: AsyncListenerBase(); @@ -57,8 +55,6 @@ class ARROW_FLIGHT_EXPORT AsyncListenerBase { /// A single listener may not be used for multiple concurrent RPC /// calls. The application MUST hold the listener alive until /// OnFinish() is called and has finished. -/// -/// This API is EXPERIMENTAL. template class ARROW_FLIGHT_EXPORT AsyncListener : public AsyncListenerBase { public: diff --git a/cpp/src/arrow/scalar.cc b/cpp/src/arrow/scalar.cc index 8e8d3903663e4..7d8084e17c279 100644 --- a/cpp/src/arrow/scalar.cc +++ b/cpp/src/arrow/scalar.cc @@ -563,15 +563,17 @@ Status Scalar::ValidateFull() const { BaseBinaryScalar::BaseBinaryScalar(std::string s, std::shared_ptr type) : BaseBinaryScalar(Buffer::FromString(std::move(s)), std::move(type)) {} -void BinaryScalar::FillScratchSpace() { +void BinaryScalar::FillScratchSpace(uint8_t* scratch_space, + const std::shared_ptr& value) { FillScalarScratchSpace( - scratch_space_, + scratch_space, {int32_t(0), value ? static_cast(value->size()) : int32_t(0)}); } -void BinaryViewScalar::FillScratchSpace() { +void BinaryViewScalar::FillScratchSpace(uint8_t* scratch_space, + const std::shared_ptr& value) { static_assert(sizeof(BinaryViewType::c_type) <= internal::kScalarScratchSpaceSize); - auto* view = new (&scratch_space_) BinaryViewType::c_type; + auto* view = new (scratch_space) BinaryViewType::c_type; if (value) { *view = util::ToBinaryView(std::string_view{*value}, 0, 0); } else { @@ -579,9 +581,10 @@ void BinaryViewScalar::FillScratchSpace() { } } -void LargeBinaryScalar::FillScratchSpace() { +void LargeBinaryScalar::FillScratchSpace(uint8_t* scratch_space, + const std::shared_ptr& value) { FillScalarScratchSpace( - scratch_space_, + scratch_space, {int64_t(0), value ? 
static_cast(value->size()) : int64_t(0)}); } @@ -612,36 +615,40 @@ BaseListScalar::BaseListScalar(std::shared_ptr value, } ListScalar::ListScalar(std::shared_ptr value, bool is_valid) - : BaseListScalar(value, list(value->type()), is_valid) {} + : ListScalar(value, list(value->type()), is_valid) {} -void ListScalar::FillScratchSpace() { +void ListScalar::FillScratchSpace(uint8_t* scratch_space, + const std::shared_ptr& value) { FillScalarScratchSpace( - scratch_space_, + scratch_space, {int32_t(0), value ? static_cast(value->length()) : int32_t(0)}); } LargeListScalar::LargeListScalar(std::shared_ptr value, bool is_valid) - : BaseListScalar(value, large_list(value->type()), is_valid) {} + : LargeListScalar(value, large_list(value->type()), is_valid) {} -void LargeListScalar::FillScratchSpace() { - FillScalarScratchSpace(scratch_space_, +void LargeListScalar::FillScratchSpace(uint8_t* scratch_space, + const std::shared_ptr& value) { + FillScalarScratchSpace(scratch_space, {int64_t(0), value ? value->length() : int64_t(0)}); } ListViewScalar::ListViewScalar(std::shared_ptr value, bool is_valid) - : BaseListScalar(value, list_view(value->type()), is_valid) {} + : ListViewScalar(value, list_view(value->type()), is_valid) {} -void ListViewScalar::FillScratchSpace() { +void ListViewScalar::FillScratchSpace(uint8_t* scratch_space, + const std::shared_ptr& value) { FillScalarScratchSpace( - scratch_space_, + scratch_space, {int32_t(0), value ? static_cast(value->length()) : int32_t(0)}); } LargeListViewScalar::LargeListViewScalar(std::shared_ptr value, bool is_valid) - : BaseListScalar(value, large_list_view(value->type()), is_valid) {} + : LargeListViewScalar(value, large_list_view(value->type()), is_valid) {} -void LargeListViewScalar::FillScratchSpace() { - FillScalarScratchSpace(scratch_space_, +void LargeListViewScalar::FillScratchSpace(uint8_t* scratch_space, + const std::shared_ptr& value) { + FillScalarScratchSpace(scratch_space, {int64_t(0), value ? value->length() : int64_t(0)}); } @@ -652,11 +659,12 @@ inline std::shared_ptr MakeMapType(const std::shared_ptr& pa } MapScalar::MapScalar(std::shared_ptr value, bool is_valid) - : BaseListScalar(value, MakeMapType(value->type()), is_valid) {} + : MapScalar(value, MakeMapType(value->type()), is_valid) {} -void MapScalar::FillScratchSpace() { +void MapScalar::FillScratchSpace(uint8_t* scratch_space, + const std::shared_ptr& value) { FillScalarScratchSpace( - scratch_space_, + scratch_space, {int32_t(0), value ? 
static_cast(value->length()) : int32_t(0)}); } @@ -705,7 +713,9 @@ Result> StructScalar::field(FieldRef ref) const { RunEndEncodedScalar::RunEndEncodedScalar(std::shared_ptr value, std::shared_ptr type) - : Scalar{std::move(type), value->is_valid}, value{std::move(value)} { + : Scalar{std::move(type), value->is_valid}, + ArraySpanFillFromScalarScratchSpace(*this->type), + value{std::move(value)} { ARROW_CHECK_EQ(this->type->id(), Type::RUN_END_ENCODED); } @@ -716,18 +726,18 @@ RunEndEncodedScalar::RunEndEncodedScalar(const std::shared_ptr& type) RunEndEncodedScalar::~RunEndEncodedScalar() = default; -void RunEndEncodedScalar::FillScratchSpace() { - auto run_end = run_end_type()->id(); +void RunEndEncodedScalar::FillScratchSpace(uint8_t* scratch_space, const DataType& type) { + Type::type run_end = checked_cast(type).run_end_type()->id(); switch (run_end) { case Type::INT16: - FillScalarScratchSpace(scratch_space_, {int16_t(1)}); + FillScalarScratchSpace(scratch_space, {int16_t(1)}); break; case Type::INT32: - FillScalarScratchSpace(scratch_space_, {int32_t(1)}); + FillScalarScratchSpace(scratch_space, {int32_t(1)}); break; default: DCHECK_EQ(run_end, Type::INT64); - FillScalarScratchSpace(scratch_space_, {int64_t(1)}); + FillScalarScratchSpace(scratch_space, {int64_t(1)}); } } @@ -806,6 +816,7 @@ Result TimestampScalar::FromISO8601(std::string_view iso8601, SparseUnionScalar::SparseUnionScalar(ValueType value, int8_t type_code, std::shared_ptr type) : UnionScalar(std::move(type), type_code, /*is_valid=*/true), + ArraySpanFillFromScalarScratchSpace(type_code), value(std::move(value)) { const auto child_ids = checked_cast(*this->type).child_ids(); if (type_code >= 0 && static_cast(type_code) < child_ids.size() && @@ -833,13 +844,13 @@ std::shared_ptr SparseUnionScalar::FromValue(std::shared_ptr val return std::make_shared(field_values, type_code, std::move(type)); } -void SparseUnionScalar::FillScratchSpace() { - auto* union_scratch_space = reinterpret_cast(&scratch_space_); +void SparseUnionScalar::FillScratchSpace(uint8_t* scratch_space, int8_t type_code) { + auto* union_scratch_space = reinterpret_cast(scratch_space); union_scratch_space->type_code = type_code; } -void DenseUnionScalar::FillScratchSpace() { - auto* union_scratch_space = reinterpret_cast(&scratch_space_); +void DenseUnionScalar::FillScratchSpace(uint8_t* scratch_space, int8_t type_code) { + auto* union_scratch_space = reinterpret_cast(scratch_space); union_scratch_space->type_code = type_code; FillScalarScratchSpace(union_scratch_space->offsets, {int32_t(0), int32_t(1)}); } diff --git a/cpp/src/arrow/scalar.h b/cpp/src/arrow/scalar.h index a7ee6a417d9a1..982a4c5113c92 100644 --- a/cpp/src/arrow/scalar.h +++ b/cpp/src/arrow/scalar.h @@ -141,7 +141,12 @@ struct ARROW_EXPORT ArraySpanFillFromScalarScratchSpace { alignas(int64_t) mutable uint8_t scratch_space_[kScalarScratchSpaceSize]; private: - ArraySpanFillFromScalarScratchSpace() { static_cast(this)->FillScratchSpace(); } + template + explicit ArraySpanFillFromScalarScratchSpace(Args&&... 
args) { + Impl::FillScratchSpace(scratch_space_, std::forward(args)...); + } + + ArraySpanFillFromScalarScratchSpace() = delete; friend Impl; }; @@ -278,20 +283,32 @@ struct ARROW_EXPORT BaseBinaryScalar : public internal::PrimitiveScalarBase { struct ARROW_EXPORT BinaryScalar : public BaseBinaryScalar, private internal::ArraySpanFillFromScalarScratchSpace { - using BaseBinaryScalar::BaseBinaryScalar; using TypeClass = BinaryType; using ArraySpanFillFromScalarScratchSpace = internal::ArraySpanFillFromScalarScratchSpace; + explicit BinaryScalar(std::shared_ptr type) + : BaseBinaryScalar(std::move(type)), + ArraySpanFillFromScalarScratchSpace(this->value) {} + + BinaryScalar(std::shared_ptr value, std::shared_ptr type) + : BaseBinaryScalar(std::move(value), std::move(type)), + ArraySpanFillFromScalarScratchSpace(this->value) {} + + BinaryScalar(std::string s, std::shared_ptr type) + : BaseBinaryScalar(std::move(s), std::move(type)), + ArraySpanFillFromScalarScratchSpace(this->value) {} + explicit BinaryScalar(std::shared_ptr value) : BinaryScalar(std::move(value), binary()) {} - explicit BinaryScalar(std::string s) : BaseBinaryScalar(std::move(s), binary()) {} + explicit BinaryScalar(std::string s) : BinaryScalar(std::move(s), binary()) {} BinaryScalar() : BinaryScalar(binary()) {} private: - void FillScratchSpace(); + static void FillScratchSpace(uint8_t* scratch_space, + const std::shared_ptr& value); friend ArraySpan; friend ArraySpanFillFromScalarScratchSpace; @@ -312,23 +329,35 @@ struct ARROW_EXPORT StringScalar : public BinaryScalar { struct ARROW_EXPORT BinaryViewScalar : public BaseBinaryScalar, private internal::ArraySpanFillFromScalarScratchSpace { - using BaseBinaryScalar::BaseBinaryScalar; using TypeClass = BinaryViewType; using ArraySpanFillFromScalarScratchSpace = internal::ArraySpanFillFromScalarScratchSpace; + explicit BinaryViewScalar(std::shared_ptr type) + : BaseBinaryScalar(std::move(type)), + ArraySpanFillFromScalarScratchSpace(this->value) {} + + BinaryViewScalar(std::shared_ptr value, std::shared_ptr type) + : BaseBinaryScalar(std::move(value), std::move(type)), + ArraySpanFillFromScalarScratchSpace(this->value) {} + + BinaryViewScalar(std::string s, std::shared_ptr type) + : BaseBinaryScalar(std::move(s), std::move(type)), + ArraySpanFillFromScalarScratchSpace(this->value) {} + explicit BinaryViewScalar(std::shared_ptr value) : BinaryViewScalar(std::move(value), binary_view()) {} explicit BinaryViewScalar(std::string s) - : BaseBinaryScalar(std::move(s), binary_view()) {} + : BinaryViewScalar(std::move(s), binary_view()) {} BinaryViewScalar() : BinaryViewScalar(binary_view()) {} std::string_view view() const override { return std::string_view(*this->value); } private: - void FillScratchSpace(); + static void FillScratchSpace(uint8_t* scratch_space, + const std::shared_ptr& value); friend ArraySpan; friend ArraySpanFillFromScalarScratchSpace; @@ -350,24 +379,33 @@ struct ARROW_EXPORT StringViewScalar : public BinaryViewScalar { struct ARROW_EXPORT LargeBinaryScalar : public BaseBinaryScalar, private internal::ArraySpanFillFromScalarScratchSpace { - using BaseBinaryScalar::BaseBinaryScalar; using TypeClass = LargeBinaryType; using ArraySpanFillFromScalarScratchSpace = internal::ArraySpanFillFromScalarScratchSpace; + explicit LargeBinaryScalar(std::shared_ptr type) + : BaseBinaryScalar(std::move(type)), + ArraySpanFillFromScalarScratchSpace(this->value) {} + LargeBinaryScalar(std::shared_ptr value, std::shared_ptr type) - : BaseBinaryScalar(std::move(value), 
std::move(type)) {} + : BaseBinaryScalar(std::move(value), std::move(type)), + ArraySpanFillFromScalarScratchSpace(this->value) {} + + LargeBinaryScalar(std::string s, std::shared_ptr type) + : BaseBinaryScalar(std::move(s), std::move(type)), + ArraySpanFillFromScalarScratchSpace(this->value) {} explicit LargeBinaryScalar(std::shared_ptr value) : LargeBinaryScalar(std::move(value), large_binary()) {} explicit LargeBinaryScalar(std::string s) - : BaseBinaryScalar(std::move(s), large_binary()) {} + : LargeBinaryScalar(std::move(s), large_binary()) {} LargeBinaryScalar() : LargeBinaryScalar(large_binary()) {} private: - void FillScratchSpace(); + static void FillScratchSpace(uint8_t* scratch_space, + const std::shared_ptr& value); friend ArraySpan; friend ArraySpanFillFromScalarScratchSpace; @@ -550,14 +588,19 @@ struct ARROW_EXPORT ListScalar : public BaseListScalar, private internal::ArraySpanFillFromScalarScratchSpace { using TypeClass = ListType; - using BaseListScalar::BaseListScalar; using ArraySpanFillFromScalarScratchSpace = internal::ArraySpanFillFromScalarScratchSpace; + ListScalar(std::shared_ptr value, std::shared_ptr type, + bool is_valid = true) + : BaseListScalar(std::move(value), std::move(type), is_valid), + ArraySpanFillFromScalarScratchSpace(this->value) {} + explicit ListScalar(std::shared_ptr value, bool is_valid = true); private: - void FillScratchSpace(); + static void FillScratchSpace(uint8_t* scratch_space, + const std::shared_ptr& value); friend ArraySpan; friend ArraySpanFillFromScalarScratchSpace; @@ -567,14 +610,19 @@ struct ARROW_EXPORT LargeListScalar : public BaseListScalar, private internal::ArraySpanFillFromScalarScratchSpace { using TypeClass = LargeListType; - using BaseListScalar::BaseListScalar; using ArraySpanFillFromScalarScratchSpace = internal::ArraySpanFillFromScalarScratchSpace; + LargeListScalar(std::shared_ptr value, std::shared_ptr type, + bool is_valid = true) + : BaseListScalar(std::move(value), std::move(type), is_valid), + ArraySpanFillFromScalarScratchSpace(this->value) {} + explicit LargeListScalar(std::shared_ptr value, bool is_valid = true); private: - void FillScratchSpace(); + static void FillScratchSpace(uint8_t* scratch_space, + const std::shared_ptr& value); friend ArraySpan; friend ArraySpanFillFromScalarScratchSpace; @@ -584,14 +632,19 @@ struct ARROW_EXPORT ListViewScalar : public BaseListScalar, private internal::ArraySpanFillFromScalarScratchSpace { using TypeClass = ListViewType; - using BaseListScalar::BaseListScalar; using ArraySpanFillFromScalarScratchSpace = internal::ArraySpanFillFromScalarScratchSpace; + ListViewScalar(std::shared_ptr value, std::shared_ptr type, + bool is_valid = true) + : BaseListScalar(std::move(value), std::move(type), is_valid), + ArraySpanFillFromScalarScratchSpace(this->value) {} + explicit ListViewScalar(std::shared_ptr value, bool is_valid = true); private: - void FillScratchSpace(); + static void FillScratchSpace(uint8_t* scratch_space, + const std::shared_ptr& value); friend ArraySpan; friend ArraySpanFillFromScalarScratchSpace; @@ -601,14 +654,19 @@ struct ARROW_EXPORT LargeListViewScalar : public BaseListScalar, private internal::ArraySpanFillFromScalarScratchSpace { using TypeClass = LargeListViewType; - using BaseListScalar::BaseListScalar; using ArraySpanFillFromScalarScratchSpace = internal::ArraySpanFillFromScalarScratchSpace; + LargeListViewScalar(std::shared_ptr value, std::shared_ptr type, + bool is_valid = true) + : BaseListScalar(std::move(value), std::move(type), is_valid), + 
ArraySpanFillFromScalarScratchSpace(this->value) {} + explicit LargeListViewScalar(std::shared_ptr value, bool is_valid = true); private: - void FillScratchSpace(); + static void FillScratchSpace(uint8_t* scratch_space, + const std::shared_ptr& value); friend ArraySpan; friend ArraySpanFillFromScalarScratchSpace; @@ -618,14 +676,19 @@ struct ARROW_EXPORT MapScalar : public BaseListScalar, private internal::ArraySpanFillFromScalarScratchSpace { using TypeClass = MapType; - using BaseListScalar::BaseListScalar; using ArraySpanFillFromScalarScratchSpace = internal::ArraySpanFillFromScalarScratchSpace; + MapScalar(std::shared_ptr value, std::shared_ptr type, + bool is_valid = true) + : BaseListScalar(std::move(value), std::move(type), is_valid), + ArraySpanFillFromScalarScratchSpace(this->value) {} + explicit MapScalar(std::shared_ptr value, bool is_valid = true); private: - void FillScratchSpace(); + static void FillScratchSpace(uint8_t* scratch_space, + const std::shared_ptr& value); friend ArraySpan; friend ArraySpanFillFromScalarScratchSpace; @@ -707,7 +770,7 @@ struct ARROW_EXPORT SparseUnionScalar std::shared_ptr type); private: - void FillScratchSpace(); + static void FillScratchSpace(uint8_t* scratch_space, int8_t type_code); friend ArraySpan; friend ArraySpanFillFromScalarScratchSpace; @@ -733,10 +796,11 @@ struct ARROW_EXPORT DenseUnionScalar DenseUnionScalar(ValueType value, int8_t type_code, std::shared_ptr type) : UnionScalar(std::move(type), type_code, value->is_valid), + ArraySpanFillFromScalarScratchSpace(type_code), value(std::move(value)) {} private: - void FillScratchSpace(); + static void FillScratchSpace(uint8_t* scratch_space, int8_t type_code); friend ArraySpan; friend ArraySpanFillFromScalarScratchSpace; @@ -772,7 +836,7 @@ struct ARROW_EXPORT RunEndEncodedScalar private: const TypeClass& ree_type() const { return internal::checked_cast(*type); } - void FillScratchSpace(); + static void FillScratchSpace(uint8_t* scratch_space, const DataType& type); friend ArraySpan; friend ArraySpanFillFromScalarScratchSpace; diff --git a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc index 967e78f6b4db1..5dc5e4c1a9a8c 100644 --- a/cpp/src/arrow/table.cc +++ b/cpp/src/arrow/table.cc @@ -619,6 +619,7 @@ TableBatchReader::TableBatchReader(const Table& table) for (int i = 0; i < table.num_columns(); ++i) { column_data_[i] = table.column(i).get(); } + DCHECK(table_.Validate().ok()); } TableBatchReader::TableBatchReader(std::shared_ptr
<Table> table) @@ -632,6 +633,7 @@ TableBatchReader::TableBatchReader(std::shared_ptr<Table>
table) for (int i = 0; i < owned_table_->num_columns(); ++i) { column_data_[i] = owned_table_->column(i).get(); } + DCHECK(table_.Validate().ok()); } std::shared_ptr TableBatchReader::schema() const { return table_.schema(); } diff --git a/cpp/src/arrow/table.h b/cpp/src/arrow/table.h index a7508430c132b..79675fa92b1f3 100644 --- a/cpp/src/arrow/table.h +++ b/cpp/src/arrow/table.h @@ -241,6 +241,8 @@ class ARROW_EXPORT Table { /// /// The conversion is zero-copy: each record batch is a view over a slice /// of the table's columns. +/// +/// The table is expected to be valid prior to using it with the batch reader. class ARROW_EXPORT TableBatchReader : public RecordBatchReader { public: /// \brief Construct a TableBatchReader for the given table diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index e26efba28594b..087e4e3879e56 100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -56,6 +56,7 @@ add_arrow_test(utility-test compression_test.cc decimal_test.cc float16_test.cc + fixed_width_test.cc formatting_util_test.cc key_value_metadata_test.cc hashing_test.cc diff --git a/cpp/src/arrow/util/fixed_width_internal.cc b/cpp/src/arrow/util/fixed_width_internal.cc new file mode 100644 index 0000000000000..3f12fafb54f0f --- /dev/null +++ b/cpp/src/arrow/util/fixed_width_internal.cc @@ -0,0 +1,232 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
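Before the implementation below, a brief orientation may help: the new helpers IsFixedWidthLike, FixedWidthInBytes and FixedWidthInBits treat nested fixed-size lists of fixed-width values as a single flat block of bytes. The following minimal sketch summarizes behavior exercised by the unit tests added later in this patch (fixed_width_test.cc); the wrapper function name is hypothetical, and only the calls into arrow::util come from the patch.

#include <cassert>

#include "arrow/type.h"
#include "arrow/util/fixed_width_internal.h"

// Hypothetical illustration of the expected results of the width helpers.
void FixedWidthExpectations() {
  using arrow::util::FixedWidthInBits;
  using arrow::util::FixedWidthInBytes;
  assert(FixedWidthInBytes(*arrow::int32()) == 4);     // plain fixed-width type
  assert(FixedWidthInBytes(*arrow::boolean()) == -1);  // BOOL is bit-sized, not byte-sized
  assert(FixedWidthInBits(*arrow::boolean()) == 1);
  // Nested fixed-size lists multiply the list size into the width: 3 * 4 bytes.
  assert(FixedWidthInBytes(*arrow::fixed_size_list(arrow::int32(), 3)) == 12);
  assert(FixedWidthInBytes(*arrow::utf8()) == -1);     // var-length types are excluded
}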
+ +#include +#include +#include + +#include "arrow/array/data.h" +#include "arrow/compute/kernel.h" +#include "arrow/result.h" +#include "arrow/type.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/fixed_width_internal.h" +#include "arrow/util/logging.h" +#include "arrow/util/small_vector.h" + +namespace arrow::util { + +using ::arrow::internal::checked_cast; + +bool IsFixedWidthLike(const ArraySpan& source, bool force_null_count, + bool exclude_bool_and_dictionary) { + return IsFixedWidthLike( + source, force_null_count, [exclude_bool_and_dictionary](const DataType& type) { + return !exclude_bool_and_dictionary || + (type.id() != Type::DICTIONARY && type.id() != Type::BOOL); + }); +} + +static int64_t FixedWidthInBytesFallback(const FixedSizeListType& fixed_size_list_type) { + auto* fsl = &fixed_size_list_type; + int64_t list_size = fsl->list_size(); + for (auto type = fsl->value_type().get();;) { + if (type->id() == Type::FIXED_SIZE_LIST) { + fsl = checked_cast(type); + list_size *= fsl->list_size(); + type = fsl->value_type().get(); + continue; + } + if (type->id() != Type::BOOL && is_fixed_width(type->id())) { + const int64_t flat_byte_width = list_size * type->byte_width(); + DCHECK_GE(flat_byte_width, 0); + return flat_byte_width; + } + break; + } + return -1; +} + +int64_t FixedWidthInBytes(const DataType& type) { + auto type_id = type.id(); + if (is_fixed_width(type_id)) { + const int32_t num_bits = type.bit_width(); + return (type_id == Type::BOOL) ? -1 : num_bits / 8; + } + if (type_id == Type::FIXED_SIZE_LIST) { + auto& fsl = ::arrow::internal::checked_cast(type); + return FixedWidthInBytesFallback(fsl); + } + return -1; +} + +static int64_t FixedWidthInBitsFallback(const FixedSizeListType& fixed_size_list_type) { + auto* fsl = &fixed_size_list_type; + int64_t list_size = fsl->list_size(); + for (auto type = fsl->value_type().get();;) { + auto type_id = type->id(); + if (type_id == Type::FIXED_SIZE_LIST) { + fsl = checked_cast(type); + list_size *= fsl->list_size(); + type = fsl->value_type().get(); + continue; + } + if (is_fixed_width(type_id)) { + const int64_t flat_bit_width = list_size * type->bit_width(); + DCHECK_GE(flat_bit_width, 0); + return flat_bit_width; + } + break; + } + return -1; +} + +int64_t FixedWidthInBits(const DataType& type) { + auto type_id = type.id(); + if (is_fixed_width(type_id)) { + return type.bit_width(); + } + if (type_id == Type::FIXED_SIZE_LIST) { + auto& fsl = ::arrow::internal::checked_cast(type); + return FixedWidthInBitsFallback(fsl); + } + return -1; +} + +namespace internal { + +Status PreallocateFixedWidthArrayData(::arrow::compute::KernelContext* ctx, + int64_t length, const ArraySpan& source, + bool allocate_validity, ArrayData* out) { + DCHECK(!source.MayHaveNulls() || allocate_validity) + << "allocate_validity cannot be false if source may have nulls"; + DCHECK_EQ(source.type->id(), out->type->id()); + auto* type = source.type; + out->length = length; + if (type->id() == Type::FIXED_SIZE_LIST) { + out->buffers.resize(1); + out->child_data = {std::make_shared()}; + } else { + out->buffers.resize(2); + } + if (allocate_validity) { + ARROW_ASSIGN_OR_RAISE(out->buffers[0], ctx->AllocateBitmap(length)); + } + + if (type->id() == Type::BOOL) { + ARROW_ASSIGN_OR_RAISE(out->buffers[1], ctx->AllocateBitmap(length)); + return Status::OK(); + } + if (is_fixed_width(type->id())) { + if (type->id() == Type::DICTIONARY) { + return Status::NotImplemented( + "PreallocateFixedWidthArrayData: DICTIONARY type allocation: ", *type); + } + 
ARROW_ASSIGN_OR_RAISE(out->buffers[1], + ctx->Allocate(length * source.type->byte_width())); + return Status::OK(); + } + if (type->id() == Type::FIXED_SIZE_LIST) { + auto& fsl_type = checked_cast(*type); + auto& value_type = fsl_type.value_type(); + if (ARROW_PREDICT_FALSE(value_type->id() == Type::DICTIONARY)) { + return Status::NotImplemented( + "PreallocateFixedWidthArrayData: DICTIONARY type allocation: ", *type); + } + if (source.child_data[0].MayHaveNulls()) { + return Status::Invalid( + "PreallocateFixedWidthArrayData: " + "FixedSizeList may have null values in child array: ", + fsl_type); + } + auto* child_values = out->child_data[0].get(); + child_values->type = value_type; + return PreallocateFixedWidthArrayData(ctx, length * fsl_type.list_size(), + /*source=*/source.child_data[0], + /*allocate_validity=*/false, + /*out=*/child_values); + } + return Status::Invalid("PreallocateFixedWidthArrayData: Invalid type: ", *type); +} + +} // namespace internal + +std::pair OffsetPointerOfFixedBitWidthValues( + const ArraySpan& source) { + using OffsetAndListSize = std::pair; + auto get_offset = [](auto pair) { return pair.first; }; + auto get_list_size = [](auto pair) { return pair.second; }; + ::arrow::internal::SmallVector stack; + + int64_t list_size = 1; + auto* array = &source; + while (array->type->id() == Type::FIXED_SIZE_LIST) { + list_size *= checked_cast(array->type)->list_size(); + stack.emplace_back(array->offset, list_size); + array = &array->child_data[0]; + } + // Now that innermost values were reached, pop the stack and calculate the offset + // in bytes of the innermost values buffer by considering the offset at each + // level of nesting. + DCHECK(is_fixed_width(*array->type)); + DCHECK(array == &source || !array->MayHaveNulls()) + << "OffsetPointerOfFixedWidthValues: array is expected to be flat or have no " + "nulls in the arrays nested by FIXED_SIZE_LIST."; + int64_t value_width_in_bits = array->type->bit_width(); + int64_t offset_in_bits = array->offset * value_width_in_bits; + for (auto it = stack.rbegin(); it != stack.rend(); ++it) { + value_width_in_bits *= get_list_size(*it); + offset_in_bits += get_offset(*it) * value_width_in_bits; + } + DCHECK_GE(value_width_in_bits, 0); + const auto* values_ptr = array->GetValues(1, 0); + return {static_cast(offset_in_bits % 8), values_ptr + (offset_in_bits / 8)}; +} + +const uint8_t* OffsetPointerOfFixedByteWidthValues(const ArraySpan& source) { + DCHECK(IsFixedWidthLike(source, /*force_null_count=*/false, + [](const DataType& type) { return type.id() != Type::BOOL; })); + return OffsetPointerOfFixedBitWidthValues(source).second; +} + +/// \brief Get the mutable pointer to the fixed-width values of an array +/// allocated by PreallocateFixedWidthArrayData. +/// +/// \pre mutable_array->offset and the offset of child array (if it's a +/// FixedSizeList) MUST be 0 (recursively). +/// \pre IsFixedWidthLike(ArraySpan(mutable_array)) or the more restrictive +/// is_fixed_width(*mutable_array->type) MUST be true +/// \return The mutable pointer to the fixed-width byte blocks of the array. If +/// pre-conditions are not satisfied, the return values is undefined. 
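To make the intended access pattern concrete, here is a small sketch of how a caller might read values through OffsetPointerOfFixedByteWidthValues. The helper names come from this patch; the copy routine itself is illustrative only and assumes the caller has already verified IsFixedWidthLike(source) for a non-boolean type, so each logical element is one byte-aligned block of FixedWidthInBytes(*source.type) bytes.

#include <cstdint>
#include <cstring>

#include "arrow/array/data.h"
#include "arrow/util/fixed_width_internal.h"

// Illustrative only: copy the fixed-width value blocks of `source` into `out`.
void CopyFixedWidthValues(const arrow::ArraySpan& source, uint8_t* out) {
  const int64_t width = arrow::util::FixedWidthInBytes(*source.type);
  const uint8_t* values = arrow::util::OffsetPointerOfFixedByteWidthValues(source);
  // Element i occupies one contiguous block of `width` bytes, no matter how
  // deeply the fixed-size lists are nested; null slots still own their block.
  for (int64_t i = 0; i < source.length; ++i) {
    std::memcpy(out + i * width, values + i * width, static_cast<size_t>(width));
  }
}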
+uint8_t* MutableFixedWidthValuesPointer(ArrayData* mutable_array) { + auto* array = mutable_array; + auto type_id = array->type->id(); + while (type_id == Type::FIXED_SIZE_LIST) { + DCHECK_EQ(array->offset, 0); + DCHECK_EQ(array->child_data.size(), 1) << array->type->ToString(true) << " part of " + << mutable_array->type->ToString(true); + array = array->child_data[0].get(); + type_id = array->type->id(); + } + DCHECK_EQ(mutable_array->offset, 0); + // BOOL is allowed here only because the offset is expected to be 0, + // so the byte-aligned pointer also points to the first *bit* of the buffer. + DCHECK(is_fixed_width(type_id)); + return array->GetMutableValues(1, 0); +} + +} // namespace arrow::util diff --git a/cpp/src/arrow/util/fixed_width_internal.h b/cpp/src/arrow/util/fixed_width_internal.h new file mode 100644 index 0000000000000..232411f4c4a56 --- /dev/null +++ b/cpp/src/arrow/util/fixed_width_internal.h @@ -0,0 +1,309 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "arrow/array/data.h" +#include "arrow/type.h" +#include "arrow/type_fwd.h" +#include "arrow/type_traits.h" + +namespace arrow::compute { +// XXX: remove dependency on compute::KernelContext +class KernelContext; +} // namespace arrow::compute + +namespace arrow::util { + +/// \brief Checks if the given array has a fixed-width type or if it's an array of +/// fixed-size list that can be flattened to an array of fixed-width values. +/// +/// Fixed-width types are the ones defined by the is_fixed_width() predicate in +/// type_traits.h. They are all the types that passes any of the following +/// predicates: +/// +/// - is_primitive() +/// - is_fixed_size_binary() +/// - is_dictionary() +/// +/// At least 3 types in this set require special care: +/// - `Type::BOOL` is fixed-width, but it's a 1-bit type and pointers to first bit +/// in boolean buffers are not always aligned to byte boundaries. +/// - `Type::DICTIONARY` is fixed-width because the indices are fixed-width, but the +/// dictionary values are not necessarily fixed-width and have to be managed +/// by separate operations. +/// - Type::FIXED_SIZE_BINARY unlike other fixed-width types, fixed-size binary +/// values are defined by a size attribute that is not known at compile time. +/// The other types have power-of-2 byte widths, while fixed-size binary can +/// have any byte width including 0. 
+/// +/// Additionally, we say that a type is "fixed-width like" if it's a fixed-width as +/// defined above, or if it's a fixed-size list (or nested fixed-size lists) and +/// the innermost type is fixed-width and the following restrictions also apply: +/// - Only the top-level array may have nulls, all the inner array have to be completely +/// free of nulls so we don't need to manage internal validity bitmaps. +/// +/// \param source The array to check +/// \param force_null_count If true, GetNullCount() is used instead of null_count +/// \param exclude_bool_and_dictionary If true, BOOL and DICTIONARY are excluded from +/// the is_fixed_width() types. Default: false. +ARROW_EXPORT bool IsFixedWidthLike(const ArraySpan& source, bool force_null_count = false, + bool exclude_bool_and_dictionary = false); + +// Take the following `fixed_size_list, 3>` array as an +// example: +// +// [ +// [[1, 2], [3, 4], [ 5, 6]], +// null, +// [[7, 8], [9, 10], [11, 12]] +// ] +// +// in memory, it would look like: +// +// { +// type: fixed_size_list, 3>, +// length: 3, +// null_count: 1, +// offset: 0, +// buffers: [ +// 0: [0b00000101] +// ], +// child_data: [ +// 0: { +// type: fixed_size_list, +// length: 9, +// null_count: 0, +// offset: 0, +// buffers: [0: NULL], +// child_data: [ +// 0: { +// type: int32, +// length: 18, +// null_count: 0, +// offset: 0, +// buffers: [ +// 0: NULL, +// 1: [ 1, 2, 3, 4, 5, 6, +// 0, 0, 0, 0, 0, 0 +// 7, 8, 9, 10, 11, 12 ] +// ], +// child_data: [] +// } +// ] +// } +// ] +// } +// +// This layout fits the fixed-width like definition because the innermost type +// is byte-aligned fixed-width (int32 = 4 bytes) and the internal arrays don't +// have nulls. The validity bitmap is only needed at the top-level array. +// +// Writing to this array can be done in the same way writing to a flat fixed-width +// array is done, by: +// 1. Updating the validity bitmap at the top-level array if nulls are present. +// 2. Updating a continuous fixed-width block of memory through a single pointer. +// +// The length of this block of memory is the product of the list sizes in the +// `FixedSizeList` types and the byte width of the innermost fixed-width type: +// +// 3 * 2 * 4 = 24 bytes +// +// Writing the `[[1, 2], [3, 4], [5, 6]]` value at a given index can be done by +// simply setting the validity bit to 1 and writing the 24-byte sequence of +// integers `[1, 2, 3, 4, 5, 6]` to the memory block at `byte_ptr + index * 24`. +// +// The length of the top-level array fully defines the lengths that all the nested +// arrays must have, which makes defining all the lengths as easy as defining the +// length of the top-level array. +// +// length = 3 +// child_data[0].length == 3 * 3 == 9 +// child_data[0].child_data[0].length == 3 * 3 * 2 == 18 +// +// child_data[0].child_data[0].buffers[1].size() >= +// (3 * (3 * 2 * sizeof(int32)) == 3 * 24 == 72) +// +// Dealing with offsets is a bit involved. Let's say the array described above has +// the offsets 2, 5, and 7: +// +// { +// type: fixed_size_list, 3>, +// offset: 2, +// ... +// child_data: [ +// 0: { +// type: fixed_size_list, +// offset: 5, +// ... 
+// child_data: [ +// 0: { +// type: int32, +// offset: 7, +// buffers: [ +// 0: NULL, +// 1: [ 1, 1, 1, 1, 1, 1, 1, // 7 values skipped +// 0,1, 0,1, 0,1, 0,1, 0,1, // 5 [x,x] values skipped +// +// 0,0,0,0,0,1, // +// 0,0,0,0,0,1, // 2 [[x,x], [x,x], [x,x]] values skipped +// +// 1, 2, 3, 4, 5, 6, // +// 0, 0, 0, 0, 0, 0 // the actual values +// 7, 8, 9, 10, 11, 12 // +// ] +// ], +// } +// ] +// } +// ] +// } +// +// The offset of the innermost values buffer, in bytes, is calculated as: +// +// ((2 * 3) + (5 * 2) + 7) * sizeof(int32) = 29 * 4 bytes = 116 bytes +// +// In general, the formula to calculate the offset of the innermost values buffer is: +// +// ((off_0 * fsl_size_0) + (off_1 * fsl_size_1) + ... + innermost_off) +// * sizeof(innermost_type) +// +// `OffsetPointerOfFixedByteWidthValues()` can calculate this byte offset and return +// the pointer to the first relevant byte of the innermost values buffer. + +/// \brief Checks if the given array has a fixed-width type or if it's an array of +/// fixed-size list that can be flattened to an array of fixed-width values. +/// +/// \param source The array to check +/// \param force_null_count If true, GetNullCount() is used instead of null_count +/// \param extra_predicate A DataType predicate that can be used to further +/// restrict the types that are considered fixed-width +template +inline bool IsFixedWidthLike(const ArraySpan& source, bool force_null_count, + ExtraPred extra_predicate) { + const auto* type = source.type; + // BOOL is considered fixed-width if not nested under FIXED_SIZE_LIST. + if (is_fixed_width(type->id()) && extra_predicate(*type)) { + return true; + } + if (type->id() == Type::FIXED_SIZE_LIST) { + // All the inner arrays must not contain any nulls. + const auto* values = &source.child_data[0]; + while ((force_null_count ? values->GetNullCount() : values->null_count) == 0) { + type = values->type; + if (type->id() == Type::FIXED_SIZE_LIST) { + values = &values->child_data[0]; + continue; + } + return is_fixed_width(type->id()) && extra_predicate(*type); + } + } + return false; +} + +/// \brief Get the fixed-width in bytes of a type if it is a fixed-width like +/// type, but not BOOL. +/// +/// If the array is a FixedSizeList (of any level of nesting), the byte width of +/// the values is the product of all fixed-list sizes and the byte width of the +/// innermost fixed-width value type. +/// +/// IsFixedWidthLike(array) performs more checks than this function and should +/// be used to guarantee that, if type is not BOOL, this function will not return -1. +/// +/// NOTE: this function translates `DataType::bit_width()` to bytes differently from +/// `DataType::byte_width()`. `DataType::byte_width()` will return 0 for +/// BOOL, while this function will return `-1`. This is done because 0 is +/// a valid return value for FIXED_SIZE_LIST with size 0 or `FIXED_SIZE_BINARY` with +/// size 0. +/// +/// \pre The instance of the array where this type is from must pass +/// `IsFixedWidthLike(array)` and should not be BOOL. +/// \return The fixed-byte width of the values or -1 if the type is BOOL or not +/// fixed-width like. 0 is a valid return value as fixed-size-lists +/// and fixed-size-binary with size 0 are allowed. +ARROW_EXPORT int64_t FixedWidthInBytes(const DataType& type); + +/// \brief Get the fixed-width in bits of a type if it is a fixed-width like +/// type. 
+/// +/// If the array is a FixedSizeList (of any level of nesting), the bit width of +/// the values is the product of all fixed-list sizes and the bit width of the +/// innermost fixed-width value type. +/// +/// \return The bit-width of the values or -1 +/// \see FixedWidthInBytes +ARROW_EXPORT int64_t FixedWidthInBits(const DataType& type); + +namespace internal { + +/// \brief Allocate an ArrayData for a type that is fixed-width like. +/// +/// This function performs the same checks performed by +/// `IsFixedWidthLike(source, false, false)`. If `source.type` is not a simple +/// fixed-width type, caller should make sure it passes the +/// `IsFixedWidthLike(source)` checks. That guarantees that it's possible to +/// allocate an array that can serve as a destination for a kernel that writes values +/// through a single pointer to fixed-width byte blocks. +/// +/// \param[in] length The length of the array to allocate (unrelated to the length of +/// the source array) +/// \param[in] source The source array that carries the type information and the +/// validity bitmaps that are relevant for the type validation +/// when the source is a FixedSizeList. +/// \see IsFixedWidthLike +ARROW_EXPORT Status PreallocateFixedWidthArrayData(::arrow::compute::KernelContext* ctx, + int64_t length, + const ArraySpan& source, + bool allocate_validity, + ArrayData* out); + +} // namespace internal + +/// \brief Get the 0-7 residual offset in bits and the pointer to the fixed-width +/// values of a fixed-width like array. +/// +/// For byte-aligned types, the offset is always 0. +/// +/// \pre `IsFixedWidthLike(source)` or the more restrictive +/// is_fixed_width(*mutable_array->type) SHOULD be true +/// \return A pair with the residual offset in bits (0-7) and the pointer +/// to the fixed-width values. +ARROW_EXPORT std::pair OffsetPointerOfFixedBitWidthValues( + const ArraySpan& source); + +/// \brief Get the pointer to the fixed-width values of a fixed-width like array. +/// +/// \pre `IsFixedWidthLike(source)` should be true and BOOL should be excluded +/// as each bool is 1-bit width making it impossible to produce a +/// byte-aligned pointer to the values in the general case. +ARROW_EXPORT const uint8_t* OffsetPointerOfFixedByteWidthValues(const ArraySpan& source); + +/// \brief Get the mutable pointer to the fixed-width values of an array +/// allocated by PreallocateFixedWidthArrayData. +/// +/// \pre mutable_array->offset and the offset of child array (if it's a +/// FixedSizeList) MUST be 0 (recursively). +/// \pre IsFixedWidthLike(ArraySpan(mutable_array)) or the more restrictive +/// is_fixed_width(*mutable_array->type) MUST be true +/// \return The mutable pointer to the fixed-width byte blocks of the array. If +/// pre-conditions are not satisfied, the return values is undefined. +ARROW_EXPORT uint8_t* MutableFixedWidthValuesPointer(ArrayData* mutable_array); + +} // namespace arrow::util diff --git a/cpp/src/arrow/util/fixed_width_test.cc b/cpp/src/arrow/util/fixed_width_test.cc new file mode 100644 index 0000000000000..3b35de1b6bbeb --- /dev/null +++ b/cpp/src/arrow/util/fixed_width_test.cc @@ -0,0 +1,214 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// #include +// #include + +#include + +#include "arrow/array/array_base.h" +#include "arrow/array/data.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/type.h" +#include "arrow/util/fixed_width_internal.h" + +namespace arrow::util { + +namespace { +bool NotBool(const DataType& type) { return type.id() != Type::BOOL; } +bool NotInt32(const DataType& type) { return type.id() != Type::INT32; } +} // namespace + +class TestFixedWidth : public ::testing::Test { + protected: + std::shared_ptr bool_array_array_; + std::shared_ptr int_array_array_; + std::shared_ptr fsl_bool_array_; + std::shared_ptr fsl_int_array_; + std::shared_ptr fsl_int_nulls_array_; + std::shared_ptr fsl_int_inner_nulls_array_; + std::shared_ptr dict_string_array_; + + std::shared_ptr fsl(int32_t list_size, + const std::shared_ptr& value_type) { + return fixed_size_list(value_type, list_size); + } + + public: + void SetUp() override { + bool_array_array_ = ArrayFromJSON(boolean(), "[true, false, null]"); + int_array_array_ = ArrayFromJSON(int32(), "[1, 0, null]"); + fsl_bool_array_ = ArrayFromJSON(fsl(2, boolean()), "[[true, false]]"); + fsl_int_array_ = ArrayFromJSON(fsl(2, int32()), "[[1, 0], [2, 3]]"); + fsl_int_nulls_array_ = ArrayFromJSON(fsl(2, int32()), "[[1, 0], null, [1, 2]]"); + fsl_int_inner_nulls_array_ = + ArrayFromJSON(fsl(2, int32()), "[[1, 0], [2, 3], [null, 2]]"); + dict_string_array_ = + ArrayFromJSON(dictionary(int32(), utf8()), R"(["Alice", "Bob", "Alice"])"); + } +}; + +TEST_F(TestFixedWidth, IsFixedWidth) { + auto arr = ArraySpan{*bool_array_array_->data()}; + // force_null_count doesn't matter because nulls at the top-level + // of the array are allowed by IsFixedWidthLike. + ASSERT_TRUE(IsFixedWidthLike(arr, /*force_null_count=*/false)); + ASSERT_TRUE(IsFixedWidthLike(arr, /*force_null_count=*/true)); + + ASSERT_TRUE(IsFixedWidthLike(arr, /*force_null_count=*/false, NotInt32)); + ASSERT_FALSE(IsFixedWidthLike(arr, /*force_null_count=*/false, NotBool)); + + arr = ArraySpan{*int_array_array_->data()}; + ASSERT_TRUE(IsFixedWidthLike(arr, /*force_null_count=*/false)); + ASSERT_TRUE(IsFixedWidthLike(arr, /*force_null_count=*/true)); + ASSERT_TRUE(IsFixedWidthLike(arr, /*force_null_count=*/false, NotBool)); +} + +TEST_F(TestFixedWidth, IsFixedWidthLike) { + auto arr = ArraySpan{*fsl_bool_array_->data()}; + ASSERT_TRUE(IsFixedWidthLike(arr, /*force_null_count=*/false)); + + arr = ArraySpan{*fsl_int_array_->data()}; + ASSERT_TRUE(IsFixedWidthLike(arr, /*force_null_count=*/false)); + arr.null_count = kUnknownNullCount; + // force_null_count=true isn't necessary because nulls at the top-level + // of the array are allowed by IsFixedWidthLike. + ASSERT_TRUE(IsFixedWidthLike(arr, /*force_null_count=*/false)); + + arr.child_data[0].null_count = kUnknownNullCount; + // inner nulls are not allowed by IsFixedWidthLike... + ASSERT_FALSE(IsFixedWidthLike(arr, /*force_null_count=*/false)); + // ...but forcing null counting at on every internal array increases + // the chances of IsFixedWidthLike returning true. 
+ ASSERT_TRUE(IsFixedWidthLike(arr, /*force_null_count=*/true)); + // Excluding INT32 from the internal array checks. + ASSERT_FALSE(IsFixedWidthLike(arr, /*force_null_count=*/true, NotInt32)); + + arr = ArraySpan{*fsl_int_nulls_array_->data()}; + // Nulls at the top-level of the array are allowed by IsFixedWidthLike. + // + // TODO(GH-10157): ArrayFromJSON uses FixedSizeListBuilder which currently + // produces nulls on the child data if one of the list-typed elements is null. + // ASSERT_TRUE(IsFixedWidthLike(arr, /*force_null_count=*/false)); + + arr = ArraySpan{*fsl_int_inner_nulls_array_->data()}; + // Inner nulls are not allowed by IsFixedWidthLike. + ASSERT_FALSE(IsFixedWidthLike(arr, /*force_null_count=*/true)); + + arr = ArraySpan{*dict_string_array_->data()}; + // Dictionaries are considered fixed-width by is_fixed_width(), but excluded + // by IsFixedWidthLike if exclude_bool_and_dictionary=true. + ASSERT_TRUE(IsFixedWidthLike(arr)); + ASSERT_TRUE(IsFixedWidthLike(arr, /*force_null_count=*/false, + /*exclude_bool_and_dictionary=*/false)); + ASSERT_FALSE(IsFixedWidthLike(arr, /*force_null_count=*/false, + /*exclude_bool_and_dictionary=*/true)); +} + +TEST_F(TestFixedWidth, MeasureWidthInBytes) { + auto b = boolean(); + auto i8 = int8(); + auto i32 = int32(); + auto fsb = fixed_size_binary(3); + auto dict = dictionary(int32(), utf8()); + auto varlen = utf8(); + ASSERT_EQ(FixedWidthInBytes(*b), -1); + ASSERT_EQ(FixedWidthInBytes(*i8), 1); + ASSERT_EQ(FixedWidthInBytes(*i32), 4); + ASSERT_EQ(FixedWidthInBytes(*fsb), 3); + ASSERT_EQ(FixedWidthInBytes(*dict), 4); + + ASSERT_EQ(FixedWidthInBytes(*varlen), -1); + ASSERT_EQ(FixedWidthInBytes(*varlen), -1); + + ASSERT_EQ(FixedWidthInBytes(*fsl(0, b)), -1); + ASSERT_EQ(FixedWidthInBytes(*fsl(3, b)), -1); + ASSERT_EQ(FixedWidthInBytes(*fsl(5, b)), -1); + + ASSERT_EQ(FixedWidthInBytes(*fsl(0, i8)), 0); + ASSERT_EQ(FixedWidthInBytes(*fsl(3, i8)), 3); + ASSERT_EQ(FixedWidthInBytes(*fsl(5, i8)), 5); + ASSERT_EQ(FixedWidthInBytes(*fsl(0, i32)), 0); + ASSERT_EQ(FixedWidthInBytes(*fsl(3, i32)), 3 * 4); + ASSERT_EQ(FixedWidthInBytes(*fsl(5, i32)), 5 * 4); + ASSERT_EQ(FixedWidthInBytes(*fsl(5, fsb)), 5 * 3); + ASSERT_EQ(FixedWidthInBytes(*fsl(5, dict)), 5 * 4); + + ASSERT_EQ(FixedWidthInBytes(*fsl(2, fsl(0, i8))), 0); + ASSERT_EQ(FixedWidthInBytes(*fsl(2, fsl(3, i8))), 2 * 3); + ASSERT_EQ(FixedWidthInBytes(*fsl(2, fsl(5, i8))), 2 * 5); + ASSERT_EQ(FixedWidthInBytes(*fsl(2, fsl(0, i32))), 0); + ASSERT_EQ(FixedWidthInBytes(*fsl(2, fsl(3, i32))), 2 * 3 * 4); + ASSERT_EQ(FixedWidthInBytes(*fsl(2, fsl(5, i32))), 2 * 5 * 4); + ASSERT_EQ(FixedWidthInBytes(*fsl(2, fsl(0, fsb))), 0); + ASSERT_EQ(FixedWidthInBytes(*fsl(2, fsl(3, fsb))), 2 * 3 * 3); + ASSERT_EQ(FixedWidthInBytes(*fsl(2, fsl(5, fsb))), 2 * 5 * 3); + ASSERT_EQ(FixedWidthInBytes(*fsl(2, fsl(0, dict))), 0); + ASSERT_EQ(FixedWidthInBytes(*fsl(2, fsl(3, dict))), 2 * 3 * 4); + ASSERT_EQ(FixedWidthInBytes(*fsl(2, fsl(5, dict))), 2 * 5 * 4); + + ASSERT_EQ(FixedWidthInBytes(*fsl(0, varlen)), -1); + ASSERT_EQ(FixedWidthInBytes(*fsl(2, varlen)), -1); +} + +TEST_F(TestFixedWidth, MeasureWidthInBits) { + auto b = boolean(); + auto i8 = int8(); + auto i32 = int32(); + auto fsb = fixed_size_binary(3); + auto dict = dictionary(int32(), utf8()); + auto varlen = utf8(); + ASSERT_EQ(FixedWidthInBits(*b), 1); + ASSERT_EQ(FixedWidthInBits(*i8), 8); + ASSERT_EQ(FixedWidthInBits(*i32), 4 * 8); + ASSERT_EQ(FixedWidthInBits(*fsb), 3 * 8); + ASSERT_EQ(FixedWidthInBits(*dict), 4 * 8); + + 
ASSERT_EQ(FixedWidthInBits(*varlen), -1); + ASSERT_EQ(FixedWidthInBits(*varlen), -1); + + ASSERT_EQ(FixedWidthInBits(*fsl(0, b)), 0); + ASSERT_EQ(FixedWidthInBits(*fsl(3, b)), 3); + ASSERT_EQ(FixedWidthInBits(*fsl(5, b)), 5); + + ASSERT_EQ(FixedWidthInBits(*fsl(0, i8)), 0); + ASSERT_EQ(FixedWidthInBits(*fsl(3, i8)), 3 * 8); + ASSERT_EQ(FixedWidthInBits(*fsl(5, i8)), 5 * 8); + ASSERT_EQ(FixedWidthInBits(*fsl(0, i32)), 0); + ASSERT_EQ(FixedWidthInBits(*fsl(3, i32)), 4 * 3 * 8); + ASSERT_EQ(FixedWidthInBits(*fsl(5, i32)), 4 * 5 * 8); + ASSERT_EQ(FixedWidthInBits(*fsl(5, fsb)), 5 * 3 * 8); + ASSERT_EQ(FixedWidthInBits(*fsl(5, dict)), 5 * 4 * 8); + + ASSERT_EQ(FixedWidthInBits(*fsl(2, fsl(0, i8))), 0); + ASSERT_EQ(FixedWidthInBits(*fsl(2, fsl(3, i8))), 2 * 3 * 8); + ASSERT_EQ(FixedWidthInBits(*fsl(2, fsl(5, i8))), 2 * 5 * 8); + ASSERT_EQ(FixedWidthInBits(*fsl(2, fsl(0, i32))), 0); + ASSERT_EQ(FixedWidthInBits(*fsl(2, fsl(3, i32))), 2 * 3 * 4 * 8); + ASSERT_EQ(FixedWidthInBits(*fsl(2, fsl(5, i32))), 2 * 5 * 4 * 8); + ASSERT_EQ(FixedWidthInBits(*fsl(2, fsl(0, fsb))), 0); + ASSERT_EQ(FixedWidthInBits(*fsl(2, fsl(3, fsb))), 2 * 3 * 3 * 8); + ASSERT_EQ(FixedWidthInBits(*fsl(2, fsl(5, fsb))), 2 * 5 * 3 * 8); + ASSERT_EQ(FixedWidthInBits(*fsl(2, fsl(0, dict))), 0); + ASSERT_EQ(FixedWidthInBits(*fsl(2, fsl(3, dict))), 2 * 3 * 4 * 8); + ASSERT_EQ(FixedWidthInBits(*fsl(2, fsl(5, dict))), 2 * 5 * 4 * 8); + + ASSERT_EQ(FixedWidthInBits(*fsl(0, varlen)), -1); + ASSERT_EQ(FixedWidthInBits(*fsl(2, varlen)), -1); +} + +} // namespace arrow::util diff --git a/cpp/src/arrow/util/fixed_width_test_util.h b/cpp/src/arrow/util/fixed_width_test_util.h new file mode 100644 index 0000000000000..ca141b7ca2c4d --- /dev/null +++ b/cpp/src/arrow/util/fixed_width_test_util.h @@ -0,0 +1,203 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include + +#include "arrow/array/builder_primitive.h" +#include "arrow/builder.h" +#include "arrow/type.h" +#include "arrow/util/checked_cast.h" + +namespace arrow::util::internal { + +class NestedListGenerator { + public: + /// \brief Create a nested FixedSizeListType. + /// + /// \return `fixed_size_list(fixed_size_list(..., sizes[1]), sizes[0])` + static std::shared_ptr NestedFSLType( + const std::shared_ptr& inner_type, const std::vector& sizes) { + auto type = inner_type; + for (auto it = sizes.rbegin(); it != sizes.rend(); it++) { + type = fixed_size_list(std::move(type), *it); + } + return type; + } + + /// \brief Create a nested FixedListType. 
+ /// + /// \return `list(list(...))` + static std::shared_ptr NestedListType( + const std::shared_ptr& inner_type, size_t depth) { + auto list_type = list(inner_type); + for (size_t i = 1; i < depth; i++) { + list_type = list(std::move(list_type)); + } + return list_type; + } + + private: + template + static Status AppendNumeric(ArrayBuilder* builder, int64_t* next_value) { + using NumericBuilder = ::arrow::NumericBuilder; + using value_type = typename NumericBuilder::value_type; + auto* numeric_builder = ::arrow::internal::checked_cast(builder); + auto cast_next_value = + static_cast(*next_value % std::numeric_limits::max()); + RETURN_NOT_OK(numeric_builder->Append(cast_next_value)); + *next_value += 1; + return Status::OK(); + } + + // Append([...[[*next_inner_value++, *next_inner_value++, ...]]...]) + static Status AppendNestedList(ArrayBuilder* nested_builder, const int* list_sizes, + int64_t* next_inner_value) { + using ::arrow::internal::checked_cast; + ArrayBuilder* builder = nested_builder; + auto type = builder->type(); + if (type->id() == Type::FIXED_SIZE_LIST || type->id() == Type::LIST) { + const int list_size = *list_sizes; + if (type->id() == Type::FIXED_SIZE_LIST) { + auto* fsl_builder = checked_cast(builder); + assert(list_size == checked_cast(*type).list_size()); + RETURN_NOT_OK(fsl_builder->Append()); + builder = fsl_builder->value_builder(); + } else { // type->id() == Type::LIST) + auto* list_builder = checked_cast(builder); + RETURN_NOT_OK(list_builder->Append(/*is_valid=*/true, list_size)); + builder = list_builder->value_builder(); + } + list_sizes++; + for (int i = 0; i < list_size; i++) { + RETURN_NOT_OK(AppendNestedList(builder, list_sizes, next_inner_value)); + } + } else { + switch (type->id()) { + case Type::INT8: + RETURN_NOT_OK(AppendNumeric(builder, next_inner_value)); + break; + case Type::INT16: + RETURN_NOT_OK(AppendNumeric(builder, next_inner_value)); + break; + case Type::INT32: + RETURN_NOT_OK(AppendNumeric(builder, next_inner_value)); + break; + case Type::INT64: + RETURN_NOT_OK(AppendNumeric(builder, next_inner_value)); + break; + default: + return Status::NotImplemented("Unsupported type: ", *type); + } + } + return Status::OK(); + } + + static Result> NestedListArray( + ArrayBuilder* nested_builder, const std::vector& list_sizes, int64_t length) { + int64_t next_inner_value = 0; + for (int64_t i = 0; i < length; i++) { + RETURN_NOT_OK( + AppendNestedList(nested_builder, list_sizes.data(), &next_inner_value)); + } + return nested_builder->Finish(); + } + + public: + static Result> NestedFSLArray( + const std::shared_ptr& inner_type, const std::vector& list_sizes, + int64_t length) { + auto nested_type = NestedFSLType(inner_type, list_sizes); + ARROW_ASSIGN_OR_RAISE(auto builder, MakeBuilder(nested_type)); + return NestedListArray(builder.get(), list_sizes, length); + } + + static Result> NestedListArray( + const std::shared_ptr& inner_type, const std::vector& list_sizes, + int64_t length) { + auto nested_type = NestedListType(inner_type, list_sizes.size()); + ARROW_ASSIGN_OR_RAISE(auto builder, MakeBuilder(nested_type)); + return NestedListArray(builder.get(), list_sizes, length); + } + + /// \brief Generate all possible nested list configurations of depth 1 to max_depth. + /// + /// Each configuration consists of a single inner value type and a list of sizes. + /// Both can be used with NestedFSLArray and NestedListArray to generate test data. 
+ /// + /// The product of the list sizes and the size of the inner value type is always a power + /// of 2 no greater than max_power_of_2_size. For max_depth=3 and + /// max_power_of_2_size=32, this generates 108 configurations. + /// + /// \tparam Visit a function type with signature + /// void(const std::shared_ptr& inner_type, + /// const std::vector& list_sizes) + template + static void VisitAllNestedListConfigurations( + const std::vector>& inner_value_types, Visit&& visit, + int max_depth = 3, int max_power_of_2_size = 32) { + for (int depth = 1; depth <= max_depth; depth++) { + for (auto& type : inner_value_types) { + assert(is_fixed_width(*type)); + int value_width = type->byte_width(); + + std::vector list_sizes; // stack of list sizes + auto pop = [&]() { // pop the list_sizes stack + assert(!list_sizes.empty()); + value_width /= list_sizes.back(); + list_sizes.pop_back(); + }; + auto next = [&]() { // double the top of the stack + assert(!list_sizes.empty()); + value_width *= 2; + list_sizes.back() *= 2; + return value_width; + }; + auto push_1s = [&]() { // fill the stack with 1s + while (list_sizes.size() < static_cast(depth)) { + list_sizes.push_back(1); + } + }; + + // Loop invariants: + // value_width == product(list_sizes) * type->byte_width() + // value_width is a power-of-2 (1, 2, 4, 8, 16, max_power_of_2_size=32) + push_1s(); + do { + // for (auto x : list_sizes) printf("%d * ", x); + // printf("(%s) %d = %2d\n", type->name().c_str(), type->byte_width(), + // value_width); + visit(type, list_sizes); + // Advance to the next test case + while (!list_sizes.empty()) { + if (next() <= max_power_of_2_size) { + push_1s(); + break; + } + pop(); + } + } while (!list_sizes.empty()); + } + } + } +}; + +} // namespace arrow::util::internal diff --git a/cpp/src/gandiva/cache.cc b/cpp/src/gandiva/cache.cc index a1333ccdc5d43..2358b08c82424 100644 --- a/cpp/src/gandiva/cache.cc +++ b/cpp/src/gandiva/cache.cc @@ -20,26 +20,41 @@ #include "arrow/result.h" #include "arrow/util/io_util.h" #include "arrow/util/logging.h" +#include "arrow/util/value_parsing.h" namespace gandiva { -static const size_t DEFAULT_CACHE_SIZE = 5000; - -int GetCapacity() { - size_t capacity = DEFAULT_CACHE_SIZE; - auto maybe_env_cache_size = ::arrow::internal::GetEnvVar("GANDIVA_CACHE_SIZE"); - if (maybe_env_cache_size.ok()) { - const auto env_cache_size = *std::move(maybe_env_cache_size); - if (!env_cache_size.empty()) { - capacity = std::atol(env_cache_size.c_str()); - if (capacity <= 0) { - ARROW_LOG(WARNING) << "Invalid cache size provided in GANDIVA_CACHE_SIZE. " - << "Using default cache size: " << DEFAULT_CACHE_SIZE; - capacity = DEFAULT_CACHE_SIZE; - } - } +constexpr auto kCacheCapacityEnvVar = "GANDIVA_CACHE_SIZE"; +constexpr auto kDefaultCacheSize = 5000; + +namespace internal { +int GetCacheCapacityFromEnvVar() { + auto maybe_env_value = ::arrow::internal::GetEnvVar(kCacheCapacityEnvVar); + if (!maybe_env_value.ok()) { + return kDefaultCacheSize; + } + const auto env_value = *std::move(maybe_env_value); + if (env_value.empty()) { + return kDefaultCacheSize; + } + int capacity = 0; + bool ok = ::arrow::internal::ParseValue<::arrow::Int32Type>( + env_value.c_str(), env_value.size(), &capacity); + if (!ok || capacity <= 0) { + ARROW_LOG(WARNING) << "Invalid cache size provided in " << kCacheCapacityEnvVar + << ". Using default cache size: " << kDefaultCacheSize; + return kDefaultCacheSize; } - return static_cast(capacity); + return capacity; +} +} // namespace internal + +// Deprecated in 17.0.0. 
Use GetCacheCapacity instead. +int GetCapacity() { return GetCacheCapacity(); } + +int GetCacheCapacity() { + static const int capacity = internal::GetCacheCapacityFromEnvVar(); + return capacity; } void LogCacheSize(size_t capacity) { diff --git a/cpp/src/gandiva/cache.h b/cpp/src/gandiva/cache.h index 7cff9b02692ae..c19dbb7a0e30e 100644 --- a/cpp/src/gandiva/cache.h +++ b/cpp/src/gandiva/cache.h @@ -20,14 +20,27 @@ #include #include +#include "arrow/util/macros.h" #include "gandiva/lru_cache.h" #include "gandiva/visibility.h" namespace gandiva { +namespace internal { +// Only called once by GetCacheCapacity(). +// Do the actual work of getting the cache capacity from env var. +// Also makes the testing easier. +GANDIVA_EXPORT +int GetCacheCapacityFromEnvVar(); +} // namespace internal + +ARROW_DEPRECATED("Deprecated in 17.0.0. Use GetCacheCapacity instead.") GANDIVA_EXPORT int GetCapacity(); +GANDIVA_EXPORT +int GetCacheCapacity(); + GANDIVA_EXPORT void LogCacheSize(size_t capacity); @@ -36,7 +49,7 @@ class Cache { public: explicit Cache(size_t capacity) : cache_(capacity) { LogCacheSize(capacity); } - Cache() : Cache(GetCapacity()) {} + Cache() : Cache(GetCacheCapacity()) {} ValueType GetObjectCode(const KeyType& cache_key) { std::optional result; diff --git a/cpp/src/gandiva/cache_test.cc b/cpp/src/gandiva/cache_test.cc index a146707079fa6..96cf4a12e587a 100644 --- a/cpp/src/gandiva/cache_test.cc +++ b/cpp/src/gandiva/cache_test.cc @@ -16,10 +16,14 @@ // under the License. #include "gandiva/cache.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/util/io_util.h" +#include "arrow/util/logging.h" #include namespace gandiva { + class TestCacheKey { public: explicit TestCacheKey(int value) : value_(value) {} @@ -38,5 +42,67 @@ TEST(TestCache, TestGetPut) { ASSERT_EQ(cache.GetObjectCode(TestCacheKey(2)), "world"); } -TEST(TestCache, TestGetCacheCapacity) { ASSERT_EQ(GetCapacity(), 5000); } +namespace { +constexpr auto cache_capacity_env_var = "GANDIVA_CACHE_SIZE"; +constexpr auto default_cache_capacity = 5000; +} // namespace + +TEST(TestCache, TestGetCacheCapacityDefault) { + ASSERT_EQ(GetCacheCapacity(), default_cache_capacity); +} + +TEST(TestCache, TestGetCacheCapacityEnvVar) { + using ::arrow::EnvVarGuard; + + // Empty. + { + EnvVarGuard guard(cache_capacity_env_var, ""); + ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), default_cache_capacity); + } + + // Non-number. + { + EnvVarGuard guard(cache_capacity_env_var, "invalid"); + ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), default_cache_capacity); + } + + // Number with invalid suffix. + { + EnvVarGuard guard(cache_capacity_env_var, "42MB"); + ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), default_cache_capacity); + } + + // Valid positive number. + { + EnvVarGuard guard(cache_capacity_env_var, "42"); + ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), 42); + } + + // Int max. + { + auto str = std::to_string(std::numeric_limits::max()); + EnvVarGuard guard(cache_capacity_env_var, str.c_str()); + ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), std::numeric_limits::max()); + } + + // Zero. + { + EnvVarGuard guard(cache_capacity_env_var, "0"); + ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), default_cache_capacity); + } + + // Negative number. + { + EnvVarGuard guard(cache_capacity_env_var, "-1"); + ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), default_cache_capacity); + } + + // Over int max. 
+ { + auto str = std::to_string(static_cast(std::numeric_limits::max()) + 1); + EnvVarGuard guard(cache_capacity_env_var, str.c_str()); + ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), default_cache_capacity); + } +} + } // namespace gandiva diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc index 5aa0eb38eafd7..3849cf7bdf9a5 100644 --- a/cpp/src/gandiva/precompiled/string_ops.cc +++ b/cpp/src/gandiva/precompiled/string_ops.cc @@ -1377,7 +1377,7 @@ gdv_int32 ascii_utf8(const char* data, gdv_int32 data_len) { if (data_len == 0) { return 0; } - return static_cast(data[0]); + return static_cast(static_cast(data[0])); } // Returns the ASCII character having the binary equivalent to A. diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc b/cpp/src/gandiva/precompiled/string_ops_test.cc index 89213592e7ea2..aaa25db0a9f8d 100644 --- a/cpp/src/gandiva/precompiled/string_ops_test.cc +++ b/cpp/src/gandiva/precompiled/string_ops_test.cc @@ -51,6 +51,8 @@ TEST(TestStringOps, TestAscii) { EXPECT_EQ(ascii_utf8("", 0), 0); EXPECT_EQ(ascii_utf8("123", 3), 49); EXPECT_EQ(ascii_utf8("999", 3), 57); + EXPECT_EQ(ascii_utf8("\x80", 1), -128); + EXPECT_EQ(ascii_utf8("\xFF", 1), -1); } TEST(TestStringOps, TestChrBigInt) { diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index d6ad7c25bc7c1..285e2a597389d 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -1043,6 +1043,16 @@ Status FileReaderImpl::GetRecordBatchReader(const std::vector& row_groups, } } + // Check all columns has same row-size + if (!columns.empty()) { + int64_t row_size = columns[0]->length(); + for (size_t i = 1; i < columns.size(); ++i) { + if (columns[i]->length() != row_size) { + return ::arrow::Status::Invalid("columns do not have the same size"); + } + } + } + auto table = ::arrow::Table::Make(batch_schema, std::move(columns)); auto table_reader = std::make_shared<::arrow::TableBatchReader>(*table); diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index eae7ac4252735..a4794c564733a 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -1675,44 +1675,55 @@ class TypedRecordReader : public TypedColumnReaderImpl, // // \return Number of records delimited int64_t DelimitRecords(int64_t num_records, int64_t* values_seen) { - int64_t values_to_read = 0; + if (ARROW_PREDICT_FALSE(num_records == 0 || levels_position_ == levels_written_)) { + *values_seen = 0; + return 0; + } int64_t records_read = 0; - - const int16_t* def_levels = this->def_levels() + levels_position_; - const int16_t* rep_levels = this->rep_levels() + levels_position_; - + const int16_t* const rep_levels = this->rep_levels(); + const int16_t* const def_levels = this->def_levels(); ARROW_DCHECK_GT(this->max_rep_level_, 0); - - // Count logical records and number of values to read - while (levels_position_ < levels_written_) { - const int16_t rep_level = *rep_levels++; - if (rep_level == 0) { - // If at_record_start_ is true, we are seeing the start of a record - // for the second time, such as after repeated calls to - // DelimitRecords. In this case we must continue until we find - // another record start or exhausting the ColumnChunk - if (!at_record_start_) { - // We've reached the end of a record; increment the record count. - ++records_read; - if (records_read == num_records) { - // We've found the number of records we were looking for. 
Set - // at_record_start_ to true and break - at_record_start_ = true; - break; - } - } - } + // If at_record_start_ is true, we are seeing the start of a record + // for the second time, such as after repeated calls to + // DelimitRecords. In this case we must continue until we find + // another record start or exhausting the ColumnChunk + int64_t level = levels_position_; + if (at_record_start_) { + ARROW_DCHECK_EQ(0, rep_levels[levels_position_]); + ++levels_position_; // We have decided to consume the level at this position; therefore we // must advance until we find another record boundary at_record_start_ = false; + } - const int16_t def_level = *def_levels++; - if (def_level == this->max_def_level_) { - ++values_to_read; + // Count logical records and number of non-null values to read + ARROW_DCHECK(!at_record_start_); + // Scan repetition levels to find record end + while (levels_position_ < levels_written_) { + // We use an estimated batch size to simplify branching and + // improve performance in the common case. This might slow + // things down a bit if a single long record remains, though. + int64_t stride = + std::min(levels_written_ - levels_position_, num_records - records_read); + const int64_t position_end = levels_position_ + stride; + for (int64_t i = levels_position_; i < position_end; ++i) { + records_read += rep_levels[i] == 0; + } + levels_position_ = position_end; + if (records_read == num_records) { + // Check last rep_level reaches the boundary and + // pop the last level. + ARROW_CHECK_EQ(rep_levels[levels_position_ - 1], 0); + --levels_position_; + // We've found the number of records we were looking for. Set + // at_record_start_ to true and break + at_record_start_ = true; + break; } - ++levels_position_; } - *values_seen = values_to_read; + // Scan definition levels to find number of physical values + *values_seen = std::count(def_levels + level, def_levels + levels_position_, + this->max_def_level_); return records_read; } diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc index 3da5c64ace5dd..004cb746b3a89 100644 --- a/cpp/src/parquet/encoding.cc +++ b/cpp/src/parquet/encoding.cc @@ -2740,13 +2740,12 @@ class DeltaLengthByteArrayEncoder : public EncoderImpl, : EncoderImpl(descr, Encoding::DELTA_LENGTH_BYTE_ARRAY, pool = ::arrow::default_memory_pool()), sink_(pool), - length_encoder_(nullptr, pool), - encoded_size_{0} {} + length_encoder_(nullptr, pool) {} std::shared_ptr FlushValues() override; int64_t EstimatedDataEncodedSize() override { - return encoded_size_ + length_encoder_.EstimatedDataEncodedSize(); + return sink_.length() + length_encoder_.EstimatedDataEncodedSize(); } using TypedEncoder::Put; @@ -2768,6 +2767,11 @@ class DeltaLengthByteArrayEncoder : public EncoderImpl, return Status::Invalid( "Parquet cannot store strings with size 2GB or more, got: ", view.size()); } + if (ARROW_PREDICT_FALSE( + view.size() + sink_.length() > + static_cast(std::numeric_limits::max()))) { + return Status::Invalid("excess expansion in DELTA_LENGTH_BYTE_ARRAY"); + } length_encoder_.Put({static_cast(view.length())}, 1); PARQUET_THROW_NOT_OK(sink_.Append(view.data(), view.length())); return Status::OK(); @@ -2777,7 +2781,6 @@ class DeltaLengthByteArrayEncoder : public EncoderImpl, ::arrow::BufferBuilder sink_; DeltaBitPackEncoder length_encoder_; - uint32_t encoded_size_; }; template @@ -2803,15 +2806,15 @@ void DeltaLengthByteArrayEncoder::Put(const T* src, int num_values) { const int batch_size = std::min(kBatchSize, num_values - idx); for (int 
j = 0; j < batch_size; ++j) { const int32_t len = src[idx + j].len; - if (AddWithOverflow(total_increment_size, len, &total_increment_size)) { + if (ARROW_PREDICT_FALSE( + AddWithOverflow(total_increment_size, len, &total_increment_size))) { throw ParquetException("excess expansion in DELTA_LENGTH_BYTE_ARRAY"); } lengths[j] = len; } length_encoder_.Put(lengths.data(), batch_size); } - - if (AddWithOverflow(encoded_size_, total_increment_size, &encoded_size_)) { + if (sink_.length() + total_increment_size > std::numeric_limits::max()) { throw ParquetException("excess expansion in DELTA_LENGTH_BYTE_ARRAY"); } PARQUET_THROW_NOT_OK(sink_.Reserve(total_increment_size)); @@ -2850,7 +2853,6 @@ std::shared_ptr DeltaLengthByteArrayEncoder::FlushValues() { std::shared_ptr buffer; PARQUET_THROW_NOT_OK(sink_.Finish(&buffer, true)); - encoded_size_ = 0; return buffer; } @@ -3694,12 +3696,24 @@ class ByteStreamSplitDecoderBase : public DecoderImpl, ByteStreamSplitDecoderBase(const ColumnDescriptor* descr, int byte_width) : DecoderImpl(descr, Encoding::BYTE_STREAM_SPLIT), byte_width_(byte_width) {} - void SetData(int num_values, const uint8_t* data, int len) override { - if (static_cast(num_values) * byte_width_ != len) { - throw ParquetException("Data size (" + std::to_string(len) + - ") does not match number of values in BYTE_STREAM_SPLIT (" + - std::to_string(num_values) + ")"); + void SetData(int num_values, const uint8_t* data, int len) final { + // Check that the data size is consistent with the number of values + // The spec requires that the data size is a multiple of the number of values, + // see: https://github.com/apache/parquet-format/pull/192 . + // GH-41562: passed in `num_values` may include nulls, so we need to check and + // adjust the number of values. + if (static_cast(num_values) * byte_width_ < len) { + throw ParquetException( + "Data size (" + std::to_string(len) + + ") is too small for the number of values in in BYTE_STREAM_SPLIT (" + + std::to_string(num_values) + ")"); + } + if (len % byte_width_ != 0) { + throw ParquetException("ByteStreamSplit data size " + std::to_string(len) + + " not aligned with type " + TypeToString(DType::type_num) + + " and byte_width: " + std::to_string(byte_width_)); } + num_values = len / byte_width_; DecoderImpl::SetData(num_values, data, len); stride_ = num_values_; } diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h index 602009189595e..493c4044ddc1c 100644 --- a/cpp/src/parquet/encoding.h +++ b/cpp/src/parquet/encoding.h @@ -255,6 +255,11 @@ class Decoder { // Sets the data for a new page. This will be called multiple times on the same // decoder and should reset all internal state. + // + // `num_values` comes from the data page header, and may be greater than the number of + // physical values in the data buffer if there are some omitted (null) values. + // `len`, on the other hand, is the size in bytes of the data buffer and + // directly relates to the number of physical values. virtual void SetData(int num_values, const uint8_t* data, int len) = 0; // Returns the number of values left (for the last call to SetData()). 
This is diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc index b91fcb0839cba..78bf26587e3fb 100644 --- a/cpp/src/parquet/encoding_test.cc +++ b/cpp/src/parquet/encoding_test.cc @@ -577,6 +577,11 @@ TEST(PlainEncodingAdHoc, ArrowBinaryDirectPut) { auto decoder = MakeTypedDecoder(Encoding::PLAIN); ASSERT_NO_THROW(encoder->Put(*values)); + // For Plain encoding, the estimated size should be at least the total byte size + auto& string_array = dynamic_cast(*values); + EXPECT_GE(encoder->EstimatedDataEncodedSize(), string_array.total_values_length()) + << "Estimated size should be at least the total byte size"; + auto buf = encoder->FlushValues(); int num_values = static_cast(values->length() - values->null_count()); @@ -1383,7 +1388,7 @@ class TestByteStreamSplitEncoding : public TestEncodingBase { encoder->PutSpaced(draws_, num_values_, valid_bits, valid_bits_offset); encode_buffer_ = encoder->FlushValues(); ASSERT_EQ(encode_buffer_->size(), physical_byte_width() * (num_values_ - null_count)); - decoder->SetData(num_values_ - null_count, encode_buffer_->data(), + decoder->SetData(num_values_, encode_buffer_->data(), static_cast(encode_buffer_->size())); auto values_decoded = decoder->DecodeSpaced(decode_buf_, num_values_, null_count, valid_bits, valid_bits_offset); @@ -1717,7 +1722,7 @@ class TestDeltaBitPackEncoding : public TestEncodingBase { for (size_t i = 0; i < kNumRoundTrips; ++i) { encoder->PutSpaced(draws_, num_values_, valid_bits, valid_bits_offset); encode_buffer_ = encoder->FlushValues(); - decoder->SetData(num_values_ - null_count, encode_buffer_->data(), + decoder->SetData(num_values_, encode_buffer_->data(), static_cast(encode_buffer_->size())); auto values_decoded = decoder->DecodeSpaced(decode_buf_, num_values_, null_count, valid_bits, valid_bits_offset); @@ -2160,6 +2165,10 @@ TEST(DeltaLengthByteArrayEncodingAdHoc, ArrowBinaryDirectPut) { auto CheckSeed = [&](std::shared_ptr<::arrow::Array> values) { ASSERT_NO_THROW(encoder->Put(*values)); + auto* binary_array = checked_cast(values.get()); + // For DeltaLength encoding, the estimated size should be at least the total byte size + EXPECT_GE(encoder->EstimatedDataEncodedSize(), binary_array->total_values_length()) + << "Estimated size should be at least the total byte size"; auto buf = encoder->FlushValues(); int num_values = static_cast(values->length() - values->null_count()); diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index b3dd1d6054ac8..8fcb0870ce4b6 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -215,16 +215,14 @@ class SerializedRowGroup : public RowGroupReader::Contents { std::shared_ptr<::arrow::io::internal::ReadRangeCache> cached_source, int64_t source_size, FileMetaData* file_metadata, int row_group_number, ReaderProperties props, - std::shared_ptr prebuffered_column_chunks_bitmap, - std::shared_ptr file_decryptor = nullptr) + std::shared_ptr prebuffered_column_chunks_bitmap) : source_(std::move(source)), cached_source_(std::move(cached_source)), source_size_(source_size), file_metadata_(file_metadata), properties_(std::move(props)), row_group_ordinal_(row_group_number), - prebuffered_column_chunks_bitmap_(std::move(prebuffered_column_chunks_bitmap)), - file_decryptor_(std::move(file_decryptor)) { + prebuffered_column_chunks_bitmap_(std::move(prebuffered_column_chunks_bitmap)) { row_group_metadata_ = file_metadata->RowGroup(row_group_number); } @@ -263,10 +261,10 @@ class SerializedRowGroup : public 
RowGroupReader::Contents { } // The column is encrypted - std::shared_ptr meta_decryptor = - GetColumnMetaDecryptor(crypto_metadata.get(), file_decryptor_.get()); - std::shared_ptr data_decryptor = - GetColumnDataDecryptor(crypto_metadata.get(), file_decryptor_.get()); + std::shared_ptr meta_decryptor = GetColumnMetaDecryptor( + crypto_metadata.get(), file_metadata_->file_decryptor().get()); + std::shared_ptr data_decryptor = GetColumnDataDecryptor( + crypto_metadata.get(), file_metadata_->file_decryptor().get()); ARROW_DCHECK_NE(meta_decryptor, nullptr); ARROW_DCHECK_NE(data_decryptor, nullptr); @@ -291,7 +289,6 @@ class SerializedRowGroup : public RowGroupReader::Contents { ReaderProperties properties_; int row_group_ordinal_; const std::shared_ptr prebuffered_column_chunks_bitmap_; - std::shared_ptr file_decryptor_; }; // ---------------------------------------------------------------------- @@ -316,7 +313,9 @@ class SerializedFile : public ParquetFileReader::Contents { } void Close() override { - if (file_decryptor_) file_decryptor_->WipeOutDecryptionKeys(); + if (file_metadata_ && file_metadata_->file_decryptor()) { + file_metadata_->file_decryptor()->WipeOutDecryptionKeys(); + } } std::shared_ptr GetRowGroup(int i) override { @@ -330,7 +329,7 @@ class SerializedFile : public ParquetFileReader::Contents { std::unique_ptr contents = std::make_unique( source_, cached_source_, source_size_, file_metadata_.get(), i, properties_, - std::move(prebuffered_column_chunks_bitmap), file_decryptor_); + std::move(prebuffered_column_chunks_bitmap)); return std::make_shared(std::move(contents)); } @@ -346,8 +345,9 @@ class SerializedFile : public ParquetFileReader::Contents { "forget to call ParquetFileReader::Open() first?"); } if (!page_index_reader_) { - page_index_reader_ = PageIndexReader::Make(source_.get(), file_metadata_, - properties_, file_decryptor_.get()); + page_index_reader_ = + PageIndexReader::Make(source_.get(), file_metadata_, properties_, + file_metadata_->file_decryptor().get()); } return page_index_reader_; } @@ -362,8 +362,8 @@ class SerializedFile : public ParquetFileReader::Contents { "forget to call ParquetFileReader::Open() first?"); } if (!bloom_filter_reader_) { - bloom_filter_reader_ = - BloomFilterReader::Make(source_, file_metadata_, properties_, file_decryptor_); + bloom_filter_reader_ = BloomFilterReader::Make(source_, file_metadata_, properties_, + file_metadata_->file_decryptor()); if (bloom_filter_reader_ == nullptr) { throw ParquetException("Cannot create BloomFilterReader"); } @@ -441,10 +441,12 @@ class SerializedFile : public ParquetFileReader::Contents { // Parse the footer depending on encryption type const bool is_encrypted_footer = memcmp(footer_buffer->data() + footer_read_size - 4, kParquetEMagic, 4) == 0; + std::shared_ptr file_decryptor; if (is_encrypted_footer) { // Encrypted file with Encrypted footer. 
const std::pair read_size = - ParseMetaDataOfEncryptedFileWithEncryptedFooter(metadata_buffer, metadata_len); + ParseMetaDataOfEncryptedFileWithEncryptedFooter(metadata_buffer, metadata_len, + &file_decryptor); // Read the actual footer metadata_start = read_size.first; metadata_len = read_size.second; @@ -453,8 +455,8 @@ class SerializedFile : public ParquetFileReader::Contents { // Fall through } - const uint32_t read_metadata_len = - ParseUnencryptedFileMetadata(metadata_buffer, metadata_len); + const uint32_t read_metadata_len = ParseUnencryptedFileMetadata( + metadata_buffer, metadata_len, std::move(file_decryptor)); auto file_decryption_properties = properties_.file_decryption_properties().get(); if (is_encrypted_footer) { // Nothing else to do here. @@ -550,34 +552,37 @@ class SerializedFile : public ParquetFileReader::Contents { // Parse the footer depending on encryption type const bool is_encrypted_footer = memcmp(footer_buffer->data() + footer_read_size - 4, kParquetEMagic, 4) == 0; + std::shared_ptr file_decryptor; if (is_encrypted_footer) { // Encrypted file with Encrypted footer. std::pair read_size; BEGIN_PARQUET_CATCH_EXCEPTIONS - read_size = - ParseMetaDataOfEncryptedFileWithEncryptedFooter(metadata_buffer, metadata_len); + read_size = ParseMetaDataOfEncryptedFileWithEncryptedFooter( + metadata_buffer, metadata_len, &file_decryptor); END_PARQUET_CATCH_EXCEPTIONS // Read the actual footer int64_t metadata_start = read_size.first; metadata_len = read_size.second; return source_->ReadAsync(metadata_start, metadata_len) - .Then([this, metadata_len, is_encrypted_footer]( + .Then([this, metadata_len, is_encrypted_footer, file_decryptor]( const std::shared_ptr<::arrow::Buffer>& metadata_buffer) { // Continue and read the file footer - return ParseMetaDataFinal(metadata_buffer, metadata_len, is_encrypted_footer); + return ParseMetaDataFinal(metadata_buffer, metadata_len, is_encrypted_footer, + file_decryptor); }); } return ParseMetaDataFinal(std::move(metadata_buffer), metadata_len, - is_encrypted_footer); + is_encrypted_footer, std::move(file_decryptor)); } // Continuation - ::arrow::Status ParseMetaDataFinal(std::shared_ptr<::arrow::Buffer> metadata_buffer, - uint32_t metadata_len, - const bool is_encrypted_footer) { + ::arrow::Status ParseMetaDataFinal( + std::shared_ptr<::arrow::Buffer> metadata_buffer, uint32_t metadata_len, + const bool is_encrypted_footer, + std::shared_ptr file_decryptor) { BEGIN_PARQUET_CATCH_EXCEPTIONS - const uint32_t read_metadata_len = - ParseUnencryptedFileMetadata(metadata_buffer, metadata_len); + const uint32_t read_metadata_len = ParseUnencryptedFileMetadata( + metadata_buffer, metadata_len, std::move(file_decryptor)); auto file_decryption_properties = properties_.file_decryption_properties().get(); if (is_encrypted_footer) { // Nothing else to do here. @@ -608,11 +613,11 @@ class SerializedFile : public ParquetFileReader::Contents { // Maps row group ordinal and prebuffer status of its column chunks in the form of a // bitmap buffer. 
std::unordered_map> prebuffered_column_chunks_; - std::shared_ptr file_decryptor_; // \return The true length of the metadata in bytes - uint32_t ParseUnencryptedFileMetadata(const std::shared_ptr& footer_buffer, - const uint32_t metadata_len); + uint32_t ParseUnencryptedFileMetadata( + const std::shared_ptr& footer_buffer, const uint32_t metadata_len, + std::shared_ptr file_decryptor); std::string HandleAadPrefix(FileDecryptionProperties* file_decryption_properties, EncryptionAlgorithm& algo); @@ -624,11 +629,13 @@ class SerializedFile : public ParquetFileReader::Contents { // \return The position and size of the actual footer std::pair ParseMetaDataOfEncryptedFileWithEncryptedFooter( - const std::shared_ptr& crypto_metadata_buffer, uint32_t footer_len); + const std::shared_ptr& crypto_metadata_buffer, uint32_t footer_len, + std::shared_ptr* file_decryptor); }; uint32_t SerializedFile::ParseUnencryptedFileMetadata( - const std::shared_ptr& metadata_buffer, const uint32_t metadata_len) { + const std::shared_ptr& metadata_buffer, const uint32_t metadata_len, + std::shared_ptr file_decryptor) { if (metadata_buffer->size() != metadata_len) { throw ParquetException("Failed reading metadata buffer (requested " + std::to_string(metadata_len) + " bytes but got " + @@ -637,7 +644,7 @@ uint32_t SerializedFile::ParseUnencryptedFileMetadata( uint32_t read_metadata_len = metadata_len; // The encrypted read path falls through to here, so pass in the decryptor file_metadata_ = FileMetaData::Make(metadata_buffer->data(), &read_metadata_len, - properties_, file_decryptor_); + properties_, std::move(file_decryptor)); return read_metadata_len; } @@ -645,7 +652,7 @@ std::pair SerializedFile::ParseMetaDataOfEncryptedFileWithEncryptedFooter( const std::shared_ptr<::arrow::Buffer>& crypto_metadata_buffer, // both metadata & crypto metadata length - const uint32_t footer_len) { + const uint32_t footer_len, std::shared_ptr* file_decryptor) { // encryption with encrypted footer // Check if the footer_buffer contains the entire metadata if (crypto_metadata_buffer->size() != footer_len) { @@ -664,7 +671,7 @@ SerializedFile::ParseMetaDataOfEncryptedFileWithEncryptedFooter( // Handle AAD prefix EncryptionAlgorithm algo = file_crypto_metadata->encryption_algorithm(); std::string file_aad = HandleAadPrefix(file_decryption_properties, algo); - file_decryptor_ = std::make_shared( + *file_decryptor = std::make_shared( file_decryption_properties, file_aad, algo.algorithm, file_crypto_metadata->key_metadata(), properties_.memory_pool()); @@ -683,12 +690,12 @@ void SerializedFile::ParseMetaDataOfEncryptedFileWithPlaintextFooter( EncryptionAlgorithm algo = file_metadata_->encryption_algorithm(); // Handle AAD prefix std::string file_aad = HandleAadPrefix(file_decryption_properties, algo); - file_decryptor_ = std::make_shared( + auto file_decryptor = std::make_shared( file_decryption_properties, file_aad, algo.algorithm, file_metadata_->footer_signing_key_metadata(), properties_.memory_pool()); // set the InternalFileDecryptor in the metadata as well, as it's used // for signature verification and for ColumnChunkMetaData creation. 
- file_metadata_->set_file_decryptor(file_decryptor_); + file_metadata_->set_file_decryptor(std::move(file_decryptor)); if (file_decryption_properties->check_plaintext_footer_integrity()) { if (metadata_len - read_metadata_len != diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 3f101b5ae3ac6..b24883cdc160b 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -826,6 +826,10 @@ class FileMetaData::FileMetaDataImpl { file_decryptor_ = std::move(file_decryptor); } + const std::shared_ptr& file_decryptor() const { + return file_decryptor_; + } + private: friend FileMetaDataBuilder; uint32_t metadata_len_ = 0; @@ -947,6 +951,10 @@ void FileMetaData::set_file_decryptor( impl_->set_file_decryptor(std::move(file_decryptor)); } +const std::shared_ptr& FileMetaData::file_decryptor() const { + return impl_->file_decryptor(); +} + ParquetVersion::type FileMetaData::version() const { switch (impl_->version()) { case 1: diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index 640b898024346..9fc30df58e0d3 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -399,12 +399,14 @@ class PARQUET_EXPORT FileMetaData { private: friend FileMetaDataBuilder; friend class SerializedFile; + friend class SerializedRowGroup; explicit FileMetaData(const void* serialized_metadata, uint32_t* metadata_len, const ReaderProperties& properties, std::shared_ptr file_decryptor = NULLPTR); void set_file_decryptor(std::shared_ptr file_decryptor); + const std::shared_ptr& file_decryptor() const; // PIMPL Idiom FileMetaData(); diff --git a/csharp/Directory.Build.props b/csharp/Directory.Build.props index f6d42241f95cf..3c06d3cd31d90 100644 --- a/csharp/Directory.Build.props +++ b/csharp/Directory.Build.props @@ -37,12 +37,13 @@ latest true $(CSharpDir)ApacheArrow.snk + true The Apache Software Foundation - https://www.apache.org/images/feather.png + feather.png LICENSE.txt https://arrow.apache.org/ @@ -55,6 +56,7 @@ + diff --git a/csharp/feather.png b/csharp/feather.png new file mode 100644 index 0000000000000..7b596e6683ddb Binary files /dev/null and b/csharp/feather.png differ diff --git a/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj b/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj index 0ce8c89bb1d1b..c34d880f90060 100644 --- a/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj +++ b/csharp/src/Apache.Arrow.Compression/Apache.Arrow.Compression.csproj @@ -13,7 +13,7 @@ - + diff --git a/csharp/src/Apache.Arrow.Flight.Sql/Apache.Arrow.Flight.Sql.csproj b/csharp/src/Apache.Arrow.Flight.Sql/Apache.Arrow.Flight.Sql.csproj index 9a3cf190cc376..7314b8207fef6 100644 --- a/csharp/src/Apache.Arrow.Flight.Sql/Apache.Arrow.Flight.Sql.csproj +++ b/csharp/src/Apache.Arrow.Flight.Sql/Apache.Arrow.Flight.Sql.csproj @@ -5,7 +5,7 @@ - + diff --git a/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj b/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj index 04b8a7dc734f0..780da3ad39081 100644 --- a/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj +++ b/csharp/src/Apache.Arrow.Flight/Apache.Arrow.Flight.csproj @@ -7,7 +7,7 @@ - + diff --git a/csharp/src/Apache.Arrow/Arrays/BinaryArray.cs b/csharp/src/Apache.Arrow/Arrays/BinaryArray.cs index 1bd4035d5b9da..bd5d9315e9fc4 100644 --- a/csharp/src/Apache.Arrow/Arrays/BinaryArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/BinaryArray.cs @@ -22,7 +22,7 @@ namespace Apache.Arrow { - public class BinaryArray : Array, IReadOnlyList + 
public class BinaryArray : Array, IReadOnlyList, ICollection { public class Builder : BuilderBase { @@ -380,5 +380,30 @@ IEnumerator IEnumerable.GetEnumerator() } IEnumerator IEnumerable.GetEnumerator() => ((IEnumerable)this).GetEnumerator(); + + int ICollection.Count => Length; + bool ICollection.IsReadOnly => true; + void ICollection.Add(byte[] item) => throw new NotSupportedException("Collection is read-only."); + bool ICollection.Remove(byte[] item) => throw new NotSupportedException("Collection is read-only."); + void ICollection.Clear() => throw new NotSupportedException("Collection is read-only."); + + bool ICollection.Contains(byte[] item) + { + for (int index = 0; index < Length; index++) + { + if (GetBytes(index).SequenceEqual(item)) + return true; + } + + return false; + } + + void ICollection.CopyTo(byte[][] array, int arrayIndex) + { + for (int srcIndex = 0, destIndex = arrayIndex; srcIndex < Length; srcIndex++, destIndex++) + { + array[destIndex] = GetBytes(srcIndex).ToArray(); + } + } } } diff --git a/csharp/src/Apache.Arrow/Arrays/BooleanArray.cs b/csharp/src/Apache.Arrow/Arrays/BooleanArray.cs index e9c5f8979e48f..19d4d0b7ed564 100644 --- a/csharp/src/Apache.Arrow/Arrays/BooleanArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/BooleanArray.cs @@ -21,7 +21,7 @@ namespace Apache.Arrow { - public class BooleanArray: Array, IReadOnlyList + public class BooleanArray: Array, IReadOnlyList, ICollection { public class Builder : IArrowArrayBuilder { @@ -188,7 +188,7 @@ public bool GetBoolean(int index) public bool? GetValue(int index) { return IsNull(index) - ? (bool?)null + ? null : BitUtility.GetBit(ValueBuffer.Span, index + Offset); } @@ -205,5 +205,30 @@ public bool GetBoolean(int index) } IEnumerator IEnumerable.GetEnumerator() => ((IEnumerable)this).GetEnumerator(); + + int ICollection.Count => Length; + bool ICollection.IsReadOnly => true; + void ICollection.Add(bool? item) => throw new NotSupportedException("Collection is read-only."); + bool ICollection.Remove(bool? item) => throw new NotSupportedException("Collection is read-only."); + void ICollection.Clear() => throw new NotSupportedException("Collection is read-only."); + + bool ICollection.Contains(bool? item) + { + for (int index = 0; index < Length; index++) + { + if (GetValue(index).Equals(item)) + return true; + } + + return false; + } + + void ICollection.CopyTo(bool?[] array, int arrayIndex) + { + for (int srcIndex = 0, destIndex = arrayIndex; srcIndex < Length; srcIndex++, destIndex++) + { + array[destIndex] = GetValue(srcIndex); + } + } } } diff --git a/csharp/src/Apache.Arrow/Arrays/Date32Array.cs b/csharp/src/Apache.Arrow/Arrays/Date32Array.cs index 6ab4986f573e2..55864e89e2eb3 100644 --- a/csharp/src/Apache.Arrow/Arrays/Date32Array.cs +++ b/csharp/src/Apache.Arrow/Arrays/Date32Array.cs @@ -23,9 +23,9 @@ namespace Apache.Arrow /// The class holds an array of dates in the Date32 format, where each date is /// stored as the number of days since the dawn of (UNIX) time. 
/// - public class Date32Array : PrimitiveArray, IReadOnlyList + public class Date32Array : PrimitiveArray, IReadOnlyList, ICollection #if NET6_0_OR_GREATER - , IReadOnlyList + , IReadOnlyList, ICollection #endif { private static readonly DateTime _epochDate = new DateTime(1970, 1, 1, 0, 0, 0, DateTimeKind.Unspecified); @@ -40,10 +40,9 @@ public class Builder : DateArrayBuilder { private class DateBuilder : PrimitiveArrayBuilder { - protected override Date32Array Build( - ArrowBuffer valueBuffer, ArrowBuffer nullBitmapBuffer, - int length, int nullCount, int offset) => - new Date32Array(valueBuffer, nullBitmapBuffer, length, nullCount, offset); + protected override Date32Array Build(ArrowBuffer valueBuffer, ArrowBuffer nullBitmapBuffer, int length, + int nullCount, int offset) => + new(valueBuffer, nullBitmapBuffer, length, nullCount, offset); } /// @@ -149,6 +148,31 @@ public Date32Array(ArrayData data) yield return GetDateOnly(index); }; } + + int ICollection.Count => Length; + bool ICollection.IsReadOnly => true; + void ICollection.Add(DateOnly? item) => throw new NotSupportedException("Collection is read-only."); + bool ICollection.Remove(DateOnly? item) => throw new NotSupportedException("Collection is read-only."); + void ICollection.Clear() => throw new NotSupportedException("Collection is read-only."); + + bool ICollection.Contains(DateOnly? item) + { + for (int index = 0; index < Length; index++) + { + if (GetDateOnly(index).Equals(item)) + return true; + } + + return false; + } + + void ICollection.CopyTo(DateOnly?[] array, int arrayIndex) + { + for (int srcIndex = 0, destIndex = arrayIndex; srcIndex < Length; srcIndex++, destIndex++) + { + array[destIndex] = GetDateOnly(srcIndex); + } + } #endif int IReadOnlyCollection.Count => Length; @@ -160,7 +184,32 @@ public Date32Array(ArrayData data) for (int index = 0; index < Length; index++) { yield return GetDateTime(index); - }; + } + } + + int ICollection.Count => Length; + bool ICollection.IsReadOnly => true; + void ICollection.Add(DateTime? item) => throw new NotSupportedException("Collection is read-only."); + bool ICollection.Remove(DateTime? item) => throw new NotSupportedException("Collection is read-only."); + void ICollection.Clear() => throw new NotSupportedException("Collection is read-only."); + + bool ICollection.Contains(DateTime? item) + { + for (int index = 0; index < Length; index++) + { + if (GetDateTime(index).Equals(item)) + return true; + } + + return false; + } + + void ICollection.CopyTo(DateTime?[] array, int arrayIndex) + { + for (int srcIndex = 0, destIndex = arrayIndex; srcIndex < Length; srcIndex++, destIndex++) + { + array[destIndex] = GetDateTime(srcIndex); + } } } } diff --git a/csharp/src/Apache.Arrow/Arrays/Date64Array.cs b/csharp/src/Apache.Arrow/Arrays/Date64Array.cs index 43e698e10b25c..77538ce59ffae 100644 --- a/csharp/src/Apache.Arrow/Arrays/Date64Array.cs +++ b/csharp/src/Apache.Arrow/Arrays/Date64Array.cs @@ -24,9 +24,9 @@ namespace Apache.Arrow /// stored as the number of milliseconds since the dawn of (UNIX) time, excluding leap seconds, in multiples of /// 86400000. 
/// - public class Date64Array : PrimitiveArray, IReadOnlyList + public class Date64Array : PrimitiveArray, IReadOnlyList, ICollection #if NET6_0_OR_GREATER - , IReadOnlyList + , IReadOnlyList, ICollection #endif { private const long MillisecondsPerDay = 86400000; @@ -45,10 +45,9 @@ public class Builder : DateArrayBuilder { private class DateBuilder : PrimitiveArrayBuilder { - protected override Date64Array Build( - ArrowBuffer valueBuffer, ArrowBuffer nullBitmapBuffer, - int length, int nullCount, int offset) => - new Date64Array(valueBuffer, nullBitmapBuffer, length, nullCount, offset); + protected override Date64Array Build(ArrowBuffer valueBuffer, ArrowBuffer nullBitmapBuffer, int length, + int nullCount, int offset) => + new(valueBuffer, nullBitmapBuffer, length, nullCount, offset); } /// @@ -151,6 +150,31 @@ public Date64Array(ArrayData data) yield return GetDateOnly(index); }; } + + int ICollection.Count => Length; + bool ICollection.IsReadOnly => true; + void ICollection.Add(DateOnly? item) => throw new NotSupportedException("Collection is read-only."); + bool ICollection.Remove(DateOnly? item) => throw new NotSupportedException("Collection is read-only."); + void ICollection.Clear() => throw new NotSupportedException("Collection is read-only."); + + bool ICollection.Contains(DateOnly? item) + { + for (int index = 0; index < Length; index++) + { + if (GetDateOnly(index).Equals(item)) + return true; + } + + return false; + } + + void ICollection.CopyTo(DateOnly?[] array, int arrayIndex) + { + for (int srcIndex = 0, destIndex = arrayIndex; srcIndex < Length; srcIndex++, destIndex++) + { + array[destIndex] = GetDateOnly(srcIndex); + } + } #endif int IReadOnlyCollection.Count => Length; @@ -162,7 +186,32 @@ public Date64Array(ArrayData data) for (int index = 0; index < Length; index++) { yield return GetDateTime(index); - }; + } + } + + int ICollection.Count => Length; + bool ICollection.IsReadOnly => true; + void ICollection.Add(DateTime? item) => throw new NotSupportedException("Collection is read-only."); + bool ICollection.Remove(DateTime? item) => throw new NotSupportedException("Collection is read-only."); + void ICollection.Clear() => throw new NotSupportedException("Collection is read-only."); + + bool ICollection.Contains(DateTime? item) + { + for (int index = 0; index < Length; index++) + { + if (GetDateTime(index).Equals(item)) + return true; + } + + return false; + } + + void ICollection.CopyTo(DateTime?[] array, int arrayIndex) + { + for (int srcIndex = 0, destIndex = arrayIndex; srcIndex < Length; srcIndex++, destIndex++) + { + array[destIndex] = GetDateTime(srcIndex); + } } } } diff --git a/csharp/src/Apache.Arrow/Arrays/Decimal256Array.cs b/csharp/src/Apache.Arrow/Arrays/Decimal256Array.cs index fa6f765475240..52bfb9eb20768 100644 --- a/csharp/src/Apache.Arrow/Arrays/Decimal256Array.cs +++ b/csharp/src/Apache.Arrow/Arrays/Decimal256Array.cs @@ -13,6 +13,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#nullable enable + using System; using System.Collections; using System.Collections.Generic; @@ -23,7 +25,7 @@ namespace Apache.Arrow { - public class Decimal256Array : FixedSizeBinaryArray, IReadOnlyList, IReadOnlyList + public class Decimal256Array : FixedSizeBinaryArray, IReadOnlyList, IReadOnlyList { public class Builder : BuilderBase { @@ -178,7 +180,7 @@ public Decimal256Array(ArrayData data) return list; } - public string GetString(int index) + public string? 
GetString(int index) { if (IsNull(index)) { @@ -230,10 +232,10 @@ public bool TryGetSqlDecimal(int index, out SqlDecimal? value) } } - int IReadOnlyCollection.Count => Length; - string? IReadOnlyList.this[int index] => GetString(index); + int IReadOnlyCollection.Count => Length; + string? IReadOnlyList.this[int index] => GetString(index); - IEnumerator IEnumerable.GetEnumerator() + IEnumerator IEnumerable.GetEnumerator() { for (int index = 0; index < Length; index++) { @@ -241,6 +243,6 @@ IEnumerator IEnumerable.GetEnumerator() } } - IEnumerator IEnumerable.GetEnumerator() => ((IEnumerable)this).GetEnumerator(); + IEnumerator IEnumerable.GetEnumerator() => ((IEnumerable)this).GetEnumerator(); } } diff --git a/csharp/src/Apache.Arrow/Arrays/IntervalArray.cs b/csharp/src/Apache.Arrow/Arrays/IntervalArray.cs index de4fc42b4cf92..3949af877b0c5 100644 --- a/csharp/src/Apache.Arrow/Arrays/IntervalArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/IntervalArray.cs @@ -31,7 +31,7 @@ internal static class IntervalArray } public abstract class IntervalArray : PrimitiveArray - where T : struct + where T : struct, IEquatable { protected IntervalArray(ArrayData data) : base(data) diff --git a/csharp/src/Apache.Arrow/Arrays/PrimitiveArray.cs b/csharp/src/Apache.Arrow/Arrays/PrimitiveArray.cs index 0456c5cc65ba4..05d659b5270ad 100644 --- a/csharp/src/Apache.Arrow/Arrays/PrimitiveArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/PrimitiveArray.cs @@ -20,8 +20,8 @@ namespace Apache.Arrow { - public abstract class PrimitiveArray : Array, IReadOnlyList - where T : struct + public abstract class PrimitiveArray : Array, IReadOnlyList, ICollection + where T : struct, IEquatable { protected PrimitiveArray(ArrayData data) : base(data) @@ -40,7 +40,7 @@ protected PrimitiveArray(ArrayData data) { throw new ArgumentOutOfRangeException(nameof(index)); } - return IsValid(index) ? Values[index] : (T?)null; + return IsValid(index) ? Values[index] : null; } public IList ToList(bool includeNulls = false) @@ -86,5 +86,36 @@ IEnumerator IEnumerable.GetEnumerator() yield return IsValid(index) ? Values[index] : null; } } + + int ICollection.Count => Length; + bool ICollection.IsReadOnly => true; + void ICollection.Add(T? item) => throw new NotSupportedException("Collection is read-only."); + bool ICollection.Remove(T? item) => throw new NotSupportedException("Collection is read-only."); + void ICollection.Clear() => throw new NotSupportedException("Collection is read-only."); + + bool ICollection.Contains(T? 
item) + { + if (item == null) + { + return NullCount > 0; + } + + ReadOnlySpan values = Values; + while (values.Length > 0) + { + int index = Values.IndexOf(item.Value); + if (index < 0 || IsValid(index)) { return index >= 0; } + values = values.Slice(index + 1); + } + return false; + } + + void ICollection.CopyTo(T?[] array, int arrayIndex) + { + for (int srcIndex = 0, destIndex = arrayIndex; srcIndex < Length; srcIndex++, destIndex++) + { + array[destIndex] = GetValue(srcIndex); + } + } } } diff --git a/csharp/src/Apache.Arrow/Arrays/PrimitiveArrayBuilder.cs b/csharp/src/Apache.Arrow/Arrays/PrimitiveArrayBuilder.cs index 67fe46633c18f..ae02173fb0df4 100644 --- a/csharp/src/Apache.Arrow/Arrays/PrimitiveArrayBuilder.cs +++ b/csharp/src/Apache.Arrow/Arrays/PrimitiveArrayBuilder.cs @@ -20,7 +20,7 @@ namespace Apache.Arrow { - public abstract class PrimitiveArrayBuilder : IArrowArrayBuilder + public abstract class PrimitiveArrayBuilder : IArrowArrayBuilder where TTo : struct where TArray : IArrowArray where TBuilder : class, IArrowArrayBuilder diff --git a/csharp/src/Apache.Arrow/Arrays/StringArray.cs b/csharp/src/Apache.Arrow/Arrays/StringArray.cs index a3ec596adc7ba..ab44805d8d1e9 100644 --- a/csharp/src/Apache.Arrow/Arrays/StringArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/StringArray.cs @@ -22,7 +22,7 @@ namespace Apache.Arrow { - public class StringArray: BinaryArray, IReadOnlyList + public class StringArray: BinaryArray, IReadOnlyList, ICollection { public static readonly Encoding DefaultEncoding = Encoding.UTF8; @@ -164,5 +164,30 @@ IEnumerator IEnumerable.GetEnumerator() } IEnumerator IEnumerable.GetEnumerator() => ((IEnumerable)this).GetEnumerator(); + + int ICollection.Count => Length; + bool ICollection.IsReadOnly => true; + void ICollection.Add(string item) => throw new NotSupportedException("Collection is read-only."); + bool ICollection.Remove(string item) => throw new NotSupportedException("Collection is read-only."); + void ICollection.Clear() => throw new NotSupportedException("Collection is read-only."); + + bool ICollection.Contains(string item) + { + for (int index = 0; index < Length; index++) + { + if (GetString(index) == item) + return true; + } + + return false; + } + + void ICollection.CopyTo(string[] array, int arrayIndex) + { + for (int srcIndex = 0, destIndex = arrayIndex; srcIndex < Length; srcIndex++, destIndex++) + { + array[destIndex] = GetString(srcIndex); + } + } } } diff --git a/csharp/src/Apache.Arrow/Arrays/Time32Array.cs b/csharp/src/Apache.Arrow/Arrays/Time32Array.cs index e9c2d7a4d9b28..63c0898935ba5 100644 --- a/csharp/src/Apache.Arrow/Arrays/Time32Array.cs +++ b/csharp/src/Apache.Arrow/Arrays/Time32Array.cs @@ -26,7 +26,7 @@ namespace Apache.Arrow /// public class Time32Array : PrimitiveArray #if NET6_0_OR_GREATER - , IReadOnlyList + , IReadOnlyList, ICollection #endif { /// @@ -171,6 +171,31 @@ public Time32Array(ArrayData data) yield return GetTime(index); }; } + + int ICollection.Count => Length; + bool ICollection.IsReadOnly => true; + void ICollection.Add(TimeOnly? item) => throw new NotSupportedException("Collection is read-only."); + bool ICollection.Remove(TimeOnly? item) => throw new NotSupportedException("Collection is read-only."); + void ICollection.Clear() => throw new NotSupportedException("Collection is read-only."); + + bool ICollection.Contains(TimeOnly? 
item) + { + for (int index = 0; index < Length; index++) + { + if (GetTime(index).Equals(item)) + return true; + } + + return false; + } + + void ICollection.CopyTo(TimeOnly?[] array, int arrayIndex) + { + for (int srcIndex = 0, destIndex = arrayIndex; srcIndex < Length; srcIndex++, destIndex++) + { + array[destIndex] = GetTime(srcIndex); + } + } #endif } } diff --git a/csharp/src/Apache.Arrow/Arrays/Time64Array.cs b/csharp/src/Apache.Arrow/Arrays/Time64Array.cs index fc18dfb8bf726..5518462952050 100644 --- a/csharp/src/Apache.Arrow/Arrays/Time64Array.cs +++ b/csharp/src/Apache.Arrow/Arrays/Time64Array.cs @@ -26,7 +26,7 @@ namespace Apache.Arrow /// public class Time64Array : PrimitiveArray #if NET6_0_OR_GREATER - , IReadOnlyList + , IReadOnlyList, ICollection #endif { /// @@ -162,6 +162,31 @@ public Time64Array(ArrayData data) yield return GetTime(index); }; } + + int ICollection.Count => Length; + bool ICollection.IsReadOnly => true; + void ICollection.Add(TimeOnly? item) => throw new NotSupportedException("Collection is read-only."); + bool ICollection.Remove(TimeOnly? item) => throw new NotSupportedException("Collection is read-only."); + void ICollection.Clear() => throw new NotSupportedException("Collection is read-only."); + + bool ICollection.Contains(TimeOnly? item) + { + for (int index = 0; index < Length; index++) + { + if (GetTime(index).Equals(item)) + return true; + } + + return false; + } + + void ICollection.CopyTo(TimeOnly?[] array, int arrayIndex) + { + for (int srcIndex = 0, destIndex = arrayIndex; srcIndex < Length; srcIndex++, destIndex++) + { + array[destIndex] = GetTime(srcIndex); + } + } #endif } } diff --git a/csharp/src/Apache.Arrow/Arrays/TimestampArray.cs b/csharp/src/Apache.Arrow/Arrays/TimestampArray.cs index ccb656854a5df..b83860584707e 100644 --- a/csharp/src/Apache.Arrow/Arrays/TimestampArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/TimestampArray.cs @@ -21,7 +21,7 @@ namespace Apache.Arrow { - public class TimestampArray : PrimitiveArray, IReadOnlyList + public class TimestampArray : PrimitiveArray, IReadOnlyList, ICollection { private static readonly DateTimeOffset s_epoch = new DateTimeOffset(1970, 1, 1, 0, 0, 0, 0, TimeSpan.Zero); @@ -157,5 +157,30 @@ public DateTimeOffset GetTimestampUnchecked(int index) yield return GetTimestamp(index); }; } + + int ICollection.Count => Length; + bool ICollection.IsReadOnly => true; + void ICollection.Add(DateTimeOffset? item) => throw new NotSupportedException("Collection is read-only."); + bool ICollection.Remove(DateTimeOffset? item) => throw new NotSupportedException("Collection is read-only."); + void ICollection.Clear() => throw new NotSupportedException("Collection is read-only."); + + bool ICollection.Contains(DateTimeOffset? 
item) + { + for (int index = 0; index < Length; index++) + { + if (GetTimestamp(index).Equals(item)) + return true; + } + + return false; + } + + void ICollection.CopyTo(DateTimeOffset?[] array, int arrayIndex) + { + for (int srcIndex = 0, destIndex = arrayIndex; srcIndex < Length; srcIndex++, destIndex++) + { + array[destIndex] = GetTimestamp(srcIndex); + } + } } } diff --git a/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs b/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs index b11479c0d4460..c66569afeba85 100644 --- a/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs +++ b/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs @@ -148,7 +148,7 @@ public void VisitArray(IArrowArray array) public void Visit(MonthDayNanosecondIntervalArray array) => VisitPrimitiveArray(array); private void VisitPrimitiveArray(PrimitiveArray array) - where T : struct + where T : struct, IEquatable { _buffers.Add(CreateBitmapBuffer(array.NullBitmapBuffer, array.Offset, array.Length)); _buffers.Add(CreateSlicedBuffer(array.ValueBuffer, array.Offset, array.Length)); diff --git a/csharp/src/Apache.Arrow/Memory/NativeMemoryManager.cs b/csharp/src/Apache.Arrow/Memory/NativeMemoryManager.cs index 8f0210b28240f..d42ee5279e795 100644 --- a/csharp/src/Apache.Arrow/Memory/NativeMemoryManager.cs +++ b/csharp/src/Apache.Arrow/Memory/NativeMemoryManager.cs @@ -40,10 +40,12 @@ internal NativeMemoryManager(INativeAllocationOwner owner, IntPtr ptr, int offse _owner = owner; } +#pragma warning disable CA2015 // TODO: is this correct? ~NativeMemoryManager() { Dispose(false); } +#pragma warning restore CA2015 public override unsafe Span GetSpan() { diff --git a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj index b386ccf79c12c..2b1720561004e 100644 --- a/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj +++ b/csharp/test/Apache.Arrow.Compression.Tests/Apache.Arrow.Compression.Tests.csproj @@ -8,8 +8,8 @@ - - + + diff --git a/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj b/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj index ae6f9f1e69667..c8fb40f2d6702 100644 --- a/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj +++ b/csharp/test/Apache.Arrow.Flight.Sql.Tests/Apache.Arrow.Flight.Sql.Tests.csproj @@ -7,8 +7,8 @@ - - + + diff --git a/csharp/test/Apache.Arrow.Flight.Sql.Tests/FlightSqlServerTests.cs b/csharp/test/Apache.Arrow.Flight.Sql.Tests/FlightSqlServerTests.cs index 4ad5bde0874a8..e5e64b073f799 100644 --- a/csharp/test/Apache.Arrow.Flight.Sql.Tests/FlightSqlServerTests.cs +++ b/csharp/test/Apache.Arrow.Flight.Sql.Tests/FlightSqlServerTests.cs @@ -14,6 +14,7 @@ // limitations under the License. 
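As an illustrative sketch of what the C# hunks above add (a hedged example, not part of the patch: it assumes only the existing Int64Array builder API that the tests further below also exercise, and the program itself is invented for illustration), the new read-only ICollection view on a primitive array behaves roughly like this:

using System;
using System.Collections.Generic;
using Apache.Arrow;

internal static class CollectionSketch
{
    public static void Main()
    {
        // Logical contents: { 1, null, 2 }. The null slot is a cleared validity
        // bit plus a default value in the underlying value buffer.
        Int64Array array = new Int64Array.Builder().Append(1).AppendNull().Append(2).Build();
        var collection = (ICollection<long?>)array;   // surface added via PrimitiveArray<T>

        Console.WriteLine(collection.Count);        // 3, same as array.Length
        Console.WriteLine(collection.IsReadOnly);   // True
        Console.WriteLine(collection.Contains(2));  // True

        // Arrow arrays are immutable, so the mutating members throw.
        try { collection.Add(3); }
        catch (NotSupportedException e) { Console.WriteLine(e.Message); }  // "Collection is read-only."

        // CopyTo materializes nullable values starting at the given offset.
        long?[] dest = new long?[4];
        collection.CopyTo(dest, 1);                 // dest is now { null, 1, null, 2 }
    }
}

All of the new members are explicit interface implementations, so they extend what the arrays can be assigned to without changing the arrays' public API shape.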
#nullable enable + using System; using System.Collections.Generic; using System.Collections.ObjectModel; @@ -65,7 +66,7 @@ public async Task EnsureTheCorrectActionsAreGiven() var streamWriter = new MockServerStreamWriter(); //When - await producer.ListActions(streamWriter, new MockServerCallContext()).ConfigureAwait(false); + await producer.ListActions(streamWriter, new MockServerCallContext()); var actions = streamWriter.Messages.ToArray(); Assert.Equal(FlightSqlUtils.FlightSqlActions, actions); @@ -115,7 +116,7 @@ public void EnsureTableSchemaIsCorrectWithoutTableSchema(bool includeTableSchema [InlineData(typeof(CommandGetImportedKeys), "GetImportedKeysFlightInfo")] [InlineData(typeof(CommandGetCrossReference), "GetCrossReferenceFlightInfo")] [InlineData(typeof(CommandGetXdbcTypeInfo), "GetXdbcTypeFlightInfo")] - public async void EnsureGetFlightInfoIsCorrectlyRoutedForCommand(Type commandType, string expectedResult) + public async Task EnsureGetFlightInfoIsCorrectlyRoutedForCommand(Type commandType, string expectedResult) { //Given var command = (IMessage) Activator.CreateInstance(commandType)!; @@ -131,7 +132,7 @@ public async void EnsureGetFlightInfoIsCorrectlyRoutedForCommand(Type commandTyp [Fact] - public async void EnsureAnInvalidOperationExceptionIsThrownWhenACommandIsNotSupportedAndHasNoDescriptor() + public async Task EnsureAnInvalidOperationExceptionIsThrownWhenACommandIsNotSupportedAndHasNoDescriptor() { //Given var producer = new TestFlightSqlSever(); @@ -145,7 +146,7 @@ public async void EnsureAnInvalidOperationExceptionIsThrownWhenACommandIsNotSupp } [Fact] - public async void EnsureAnInvalidOperationExceptionIsThrownWhenACommandIsNotSupported() + public async Task EnsureAnInvalidOperationExceptionIsThrownWhenACommandIsNotSupported() { //Given var producer = new TestFlightSqlSever(); @@ -175,7 +176,7 @@ public async void EnsureAnInvalidOperationExceptionIsThrownWhenACommandIsNotSupp [InlineData(typeof(CommandGetImportedKeys), "DoGetImportedKeys")] [InlineData(typeof(CommandGetCrossReference), "DoGetCrossReference")] [InlineData(typeof(CommandGetXdbcTypeInfo), "DoGetXbdcTypeInfo")] - public async void EnsureDoGetIsCorrectlyRoutedForADoGetCommand(Type commandType, string expectedResult) + public async Task EnsureDoGetIsCorrectlyRoutedForADoGetCommand(Type commandType, string expectedResult) { //Given var producer = new TestFlightSqlSever(); @@ -192,7 +193,7 @@ public async void EnsureDoGetIsCorrectlyRoutedForADoGetCommand(Type commandType, } [Fact] - public async void EnsureAnInvalidOperationExceptionIsThrownWhenADoGetCommandIsNotSupported() + public async Task EnsureAnInvalidOperationExceptionIsThrownWhenADoGetCommandIsNotSupported() { //Given var producer = new TestFlightSqlSever(); @@ -213,7 +214,7 @@ public async void EnsureAnInvalidOperationExceptionIsThrownWhenADoGetCommandIsNo [InlineData(SqlAction.CloseRequest, typeof(ActionClosePreparedStatementRequest), "ClosePreparedStatement")] [InlineData(SqlAction.CreateRequest, typeof(ActionCreatePreparedStatementRequest), "CreatePreparedStatement")] [InlineData("BadCommand", typeof(ActionCreatePreparedStatementRequest), "Action type BadCommand not supported", true)] - public async void EnsureDoActionIsCorrectlyRoutedForAnActionRequest(string actionType, Type actionBodyType, string expectedResponse, bool isException = false) + public async Task EnsureDoActionIsCorrectlyRoutedForAnActionRequest(string actionType, Type actionBodyType, string expectedResponse, bool isException = false) { //Given var producer = new 
TestFlightSqlSever(); @@ -237,19 +238,19 @@ public async void EnsureDoActionIsCorrectlyRoutedForAnActionRequest(string actio [InlineData(typeof(CommandPreparedStatementQuery), "PutPreparedStatementQuery")] [InlineData(typeof(CommandPreparedStatementUpdate), "PutPreparedStatementUpdate")] [InlineData(typeof(CommandGetXdbcTypeInfo), "Command CommandGetXdbcTypeInfo not supported", true)] - public async void EnsureDoPutIsCorrectlyRoutedForTheCommand(Type commandType, string expectedResponse, bool isException = false) + public async Task EnsureDoPutIsCorrectlyRoutedForTheCommand(Type commandType, string expectedResponse, bool isException = false) { //Given var command = (IMessage) Activator.CreateInstance(commandType)!; var producer = new TestFlightSqlSever(); var descriptor = FlightDescriptor.CreateCommandDescriptor(command.PackAndSerialize().ToArray()); var recordBatch = new RecordBatch(new Schema(new List(), null), System.Array.Empty(), 0); - var reader = new MockStreamReader(await recordBatch.ToFlightData(descriptor).ConfigureAwait(false)); + var reader = new MockStreamReader(await recordBatch.ToFlightData(descriptor)); var batchReader = new FlightServerRecordBatchStreamReader(reader); var mockStreamWriter = new MockServerStreamWriter(); //When - async Task Act() => await producer.DoPut(batchReader, mockStreamWriter, new MockServerCallContext()).ConfigureAwait(false); + async Task Act() => await producer.DoPut(batchReader, mockStreamWriter, new MockServerCallContext()); var exception = await Record.ExceptionAsync(Act); string? actualMessage = isException ? exception?.Message : mockStreamWriter.Messages[0].ApplicationMetadata.ToStringUtf8(); @@ -271,7 +272,7 @@ private class MockServerCallContext : ServerCallContext protected override CancellationToken CancellationTokenCore => default; protected override Metadata ResponseTrailersCore => new(); protected override Status StatusCore { get; set; } - protected override WriteOptions WriteOptionsCore { get; set; } = WriteOptions.Default; + protected override WriteOptions? WriteOptionsCore { get; set; } = WriteOptions.Default; protected override AuthContext AuthContextCore => new("", new Dictionary>()); } } @@ -325,7 +326,7 @@ public static async Task GetSchema(this IEnumerable flightDa public static async Task> ToFlightData(this RecordBatch recordBatch, FlightDescriptor? 
descriptor = null) { var responseStream = new MockFlightServerRecordBatchStreamWriter(); - await responseStream.WriteRecordBatchAsync(recordBatch).ConfigureAwait(false); + await responseStream.WriteRecordBatchAsync(recordBatch); if (descriptor == null) { return responseStream.FlightData; diff --git a/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj b/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj index ed158ca8656d3..ba60451f25f68 100644 --- a/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj +++ b/csharp/test/Apache.Arrow.Flight.Tests/Apache.Arrow.Flight.Tests.csproj @@ -7,8 +7,8 @@ - - + + diff --git a/csharp/test/Apache.Arrow.Flight.Tests/FlightTests.cs b/csharp/test/Apache.Arrow.Flight.Tests/FlightTests.cs index ebc38354b5c28..aac4e4209240a 100644 --- a/csharp/test/Apache.Arrow.Flight.Tests/FlightTests.cs +++ b/csharp/test/Apache.Arrow.Flight.Tests/FlightTests.cs @@ -288,9 +288,9 @@ public async Task TestHandshake() { var duplexStreamingCall = _flightClient.Handshake(); - await duplexStreamingCall.RequestStream.WriteAsync(new FlightHandshakeRequest(ByteString.Empty)).ConfigureAwait(false); - await duplexStreamingCall.RequestStream.CompleteAsync().ConfigureAwait(false); - var results = await duplexStreamingCall.ResponseStream.ToListAsync().ConfigureAwait(false); + await duplexStreamingCall.RequestStream.WriteAsync(new FlightHandshakeRequest(ByteString.Empty)); + await duplexStreamingCall.RequestStream.CompleteAsync(); + var results = await duplexStreamingCall.ResponseStream.ToListAsync(); Assert.Single(results); Assert.Equal("Done", results.First().Payload.ToStringUtf8()); @@ -303,10 +303,10 @@ public async Task TestSingleExchange() var duplexStreamingCall = _flightClient.DoExchange(flightDescriptor); var expectedBatch = CreateTestBatch(0, 100); - await duplexStreamingCall.RequestStream.WriteAsync(expectedBatch).ConfigureAwait(false); - await duplexStreamingCall.RequestStream.CompleteAsync().ConfigureAwait(false); + await duplexStreamingCall.RequestStream.WriteAsync(expectedBatch); + await duplexStreamingCall.RequestStream.CompleteAsync(); - var results = await duplexStreamingCall.ResponseStream.ToListAsync().ConfigureAwait(false); + var results = await duplexStreamingCall.ResponseStream.ToListAsync(); Assert.Single(results); ArrowReaderVerifier.CompareBatches(expectedBatch, results.FirstOrDefault()); @@ -320,11 +320,11 @@ public async Task TestMultipleExchange() var expectedBatch1 = CreateTestBatch(0, 100); var expectedBatch2 = CreateTestBatch(100, 100); - await duplexStreamingCall.RequestStream.WriteAsync(expectedBatch1).ConfigureAwait(false); - await duplexStreamingCall.RequestStream.WriteAsync(expectedBatch2).ConfigureAwait(false); - await duplexStreamingCall.RequestStream.CompleteAsync().ConfigureAwait(false); + await duplexStreamingCall.RequestStream.WriteAsync(expectedBatch1); + await duplexStreamingCall.RequestStream.WriteAsync(expectedBatch2); + await duplexStreamingCall.RequestStream.CompleteAsync(); - var results = await duplexStreamingCall.ResponseStream.ToListAsync().ConfigureAwait(false); + var results = await duplexStreamingCall.ResponseStream.ToListAsync(); ArrowReaderVerifier.CompareBatches(expectedBatch1, results[0]); ArrowReaderVerifier.CompareBatches(expectedBatch2, results[1]); @@ -338,8 +338,8 @@ public async Task TestExchangeWithMetadata() var expectedBatch = CreateTestBatch(0, 100); var expectedMetadata = ByteString.CopyFromUtf8("test metadata"); - await 
duplexStreamingCall.RequestStream.WriteAsync(expectedBatch, expectedMetadata).ConfigureAwait(false); - await duplexStreamingCall.RequestStream.CompleteAsync().ConfigureAwait(false); + await duplexStreamingCall.RequestStream.WriteAsync(expectedBatch, expectedMetadata); + await duplexStreamingCall.RequestStream.CompleteAsync(); List actualMetadata = new List(); List actualBatch = new List(); @@ -358,9 +358,9 @@ public async Task TestHandshakeWithSpecificMessage() { var duplexStreamingCall = _flightClient.Handshake(); - await duplexStreamingCall.RequestStream.WriteAsync(new FlightHandshakeRequest(ByteString.CopyFromUtf8("Hello"))).ConfigureAwait(false); - await duplexStreamingCall.RequestStream.CompleteAsync().ConfigureAwait(false); - var results = await duplexStreamingCall.ResponseStream.ToListAsync().ConfigureAwait(false); + await duplexStreamingCall.RequestStream.WriteAsync(new FlightHandshakeRequest(ByteString.CopyFromUtf8("Hello"))); + await duplexStreamingCall.RequestStream.CompleteAsync(); + var results = await duplexStreamingCall.ResponseStream.ToListAsync(); Assert.Single(results); Assert.Equal("Hello handshake", results.First().Payload.ToStringUtf8()); diff --git a/csharp/test/Apache.Arrow.IntegrationTest/JsonFile.cs b/csharp/test/Apache.Arrow.IntegrationTest/JsonFile.cs index 31a5676f01315..7232f74b8bec6 100644 --- a/csharp/test/Apache.Arrow.IntegrationTest/JsonFile.cs +++ b/csharp/test/Apache.Arrow.IntegrationTest/JsonFile.cs @@ -908,8 +908,8 @@ private static byte[] ConvertHexStringToByteArray(string hexString) }; private void GenerateArray(Func createArray) + where T : struct, IEquatable where TArray : PrimitiveArray - where T : struct { ArrowBuffer validityBuffer = GetValidityBuffer(out int nullCount); @@ -929,8 +929,8 @@ private void GenerateArray(Func(Func createArray, Func parse) + where T : struct, IEquatable where TArray : PrimitiveArray - where T : struct { ArrowBuffer validityBuffer = GetValidityBuffer(out int nullCount); @@ -950,8 +950,8 @@ private void GenerateLongArray(Func(Func createArray, Func construct) + where T : struct, IEquatable where TArray : PrimitiveArray - where T : struct { ArrowBuffer validityBuffer = GetValidityBuffer(out int nullCount); diff --git a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj index 06fb44e0a0e88..90b498d4e9b03 100644 --- a/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj +++ b/csharp/test/Apache.Arrow.Tests/Apache.Arrow.Tests.csproj @@ -17,8 +17,8 @@ - - + + all runtime; build; native; contentfiles; analyzers diff --git a/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs b/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs index 682ebec323dc0..c3c21c412d20d 100644 --- a/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs +++ b/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs @@ -101,9 +101,9 @@ public void EnumerateArray() { var array = new Int64Array.Builder().Append(1).Append(2).Build(); - foreach(long? foo in (IEnumerable)array) + foreach(long? 
foo in array) { - Assert.InRange(foo.Value, 1, 2); + Assert.InRange(foo!.Value, 1, 2); } foreach (object foo in (IEnumerable)array) @@ -115,12 +115,145 @@ public void EnumerateArray() [Fact] public void ArrayAsReadOnlyList() { - Int64Array array = new Int64Array.Builder().Append(1).Append(2).Build(); - var readOnlyList = (IReadOnlyList)array; + TestArrayAsReadOnlyList([1, 2]); + TestArrayAsReadOnlyList([1, 2]); + TestArrayAsReadOnlyList([true, false]); + TestArrayAsReadOnlyList([DateTime.MinValue.Date, DateTime.MaxValue.Date]); + TestArrayAsReadOnlyList([DateTime.MinValue.Date, DateTime.MaxValue.Date]); + TestArrayAsReadOnlyList([DateTimeOffset.MinValue, DateTimeOffset.MinValue.AddYears(100)]); + +#if NET5_0_OR_GREATER + TestArrayAsReadOnlyList([DateOnly.MinValue, DateOnly.MaxValue]); + TestArrayAsReadOnlyList([DateOnly.MinValue, DateOnly.MaxValue]); + TestArrayAsReadOnlyList([TimeOnly.MinValue, TimeOnly.MinValue.AddHours(23)]); + TestArrayAsReadOnlyList([TimeOnly.MinValue, TimeOnly.MaxValue]); + TestArrayAsReadOnlyList([(Half)1.1, (Half)2.2f]); +#endif + } + + // Parameter 'values' must contain two distinct values + private static void TestArrayAsReadOnlyList(IReadOnlyList values) + where T : struct + where TArray : IArrowArray + where TArrayBuilder : IArrowArrayBuilder, new() + { + Assert.Equal(2, values.Count); + TArray array = new TArrayBuilder().Append(values[0]).AppendNull().Append(values[1]).Build(default); + Assert.NotNull(array); + var readOnlyList = (IReadOnlyList)array; Assert.Equal(array.Length, readOnlyList.Count); - Assert.Equal(readOnlyList[0], 1); - Assert.Equal(readOnlyList[1], 2); + Assert.Equal(3, readOnlyList.Count); + Assert.Equal(values[0], readOnlyList[0]); + Assert.Null(readOnlyList[1]); + Assert.Equal(values[1], readOnlyList[2]); + } + + [Fact] + public void ArrayAsCollection() + { + TestPrimitiveArrayAsCollection([1, 2, 3, 4]); + TestPrimitiveArrayAsCollection([1, 2, 3, 4]); + TestPrimitiveArrayAsCollection([true, true, true, false]); + TestPrimitiveArrayAsCollection([DateTime.MinValue.Date, DateTime.MaxValue.Date, DateTime.Today, DateTime.Today]); + TestPrimitiveArrayAsCollection([DateTime.MinValue.Date, DateTime.MaxValue.Date, DateTime.Today, DateTime.Today]); + TestPrimitiveArrayAsCollection([DateTimeOffset.MinValue, DateTimeOffset.MinValue.AddYears(100), DateTimeOffset.Now, DateTimeOffset.UtcNow]); + +#if NET5_0_OR_GREATER + TestPrimitiveArrayAsCollection([DateOnly.MinValue, DateOnly.MaxValue, DateOnly.FromDayNumber(1), DateOnly.FromDayNumber(2)]); + TestPrimitiveArrayAsCollection([DateOnly.MinValue, DateOnly.MaxValue, DateOnly.FromDayNumber(1), DateOnly.FromDayNumber(2)]); + TestPrimitiveArrayAsCollection([TimeOnly.MinValue, TimeOnly.MinValue.AddHours(23), TimeOnly.MinValue.AddHours(1), TimeOnly.MinValue.AddHours(2)]); + TestPrimitiveArrayAsCollection([TimeOnly.MinValue, TimeOnly.MaxValue, TimeOnly.MinValue.AddHours(1), TimeOnly.MinValue.AddHours(2)]); + TestPrimitiveArrayAsCollection([(Half)1.1, (Half)2.2f, (Half)3.3f, (Half)4.4f]); +#endif + + byte[][] byteArrs = [new byte[1], [], [255], new byte[2]]; + TestObjectArrayAsCollection(new BinaryArray.Builder().Append(byteArrs[0].AsEnumerable()).AppendNull().Append(byteArrs[1].AsEnumerable()).Append(byteArrs[0].AsEnumerable()).Build(), System.Array.Empty(), byteArrs); + + string[] strings = ["abc", "abd", "acd", "adc"]; + TestObjectArrayAsCollection(new StringArray.Builder().Append(strings[0]).AppendNull().Append(strings[1]).Append(strings[0]).Build(), null, strings); + } + + // Parameter 'values' must contain 
four values. The last value must be distinct from the rest. + private static void TestPrimitiveArrayAsCollection(IReadOnlyList values) + where T : struct + where TArray : IArrowArray, ICollection + where TArrayBuilder : IArrowArrayBuilder, new() + { + Assert.Equal(4, values.Count); + TArray array = new TArrayBuilder().Append(values[0]).AppendNull().Append(values[1]).Append(values[0]).Build(default); + Assert.NotNull(array); + var collection = (ICollection)array; + + Assert.Equal(array.Length, collection.Count); + Assert.Equal(4, collection.Count); + Assert.True(collection.IsReadOnly); + + Assert.Equal("Collection is read-only.", Assert.Throws(() => collection.Add(values[3])).Message); + Assert.Equal("Collection is read-only.", Assert.Throws(() => collection.Remove(values[3])).Message); + Assert.Equal("Collection is read-only.", Assert.Throws(collection.Clear).Message); + + Assert.True(collection.Contains(values[0])); + Assert.True(collection.Contains(values[1])); + Assert.True(collection.Contains(default)); + Assert.False(collection.Contains(values[3])); + + T sentinel = values[2]; + T?[] destArr = { sentinel, sentinel, sentinel, sentinel, sentinel, sentinel }; + collection.CopyTo(destArr, 1); + Assert.Equal(sentinel, destArr[0]); + Assert.Equal(values[0], destArr[1]); + Assert.Null(destArr[2]); + Assert.Equal(values[1], destArr[3]); + Assert.Equal(values[0], destArr[4]); + Assert.Equal(sentinel, destArr[0]); + } + + // Parameter 'values' must contain four values. The last value must be distinct from the rest. + private static void TestObjectArrayAsCollection(TArray array, T nullValue, IReadOnlyList values) + where T : class + where TArray : IArrowArray, ICollection + { + Assert.NotNull(array); + Assert.Equal(4, values.Count); + var collection = (ICollection)array; + + Assert.Equal(array.Length, collection.Count); + Assert.Equal(4, collection.Count); + Assert.True(collection.IsReadOnly); + + Assert.Equal("Collection is read-only.", Assert.Throws(() => collection.Add(values[3])).Message); + Assert.Equal("Collection is read-only.", Assert.Throws(() => collection.Remove(values[3])).Message); + Assert.Equal("Collection is read-only.", Assert.Throws(collection.Clear).Message); + + Assert.True(collection.Contains(values[0])); + Assert.True(collection.Contains(values[1])); + Assert.True(collection.Contains(default)); + Assert.False(collection.Contains(values[3])); + + T sentinel = values[2]; + T[] destArr = { sentinel, sentinel, sentinel, sentinel, sentinel, sentinel }; + collection.CopyTo(destArr, 1); + Assert.Equal(sentinel, destArr[0]); + Assert.Equal(values[0], destArr[1]); + Assert.Equal(nullValue, destArr[2]); + Assert.Equal(values[1], destArr[3]); + Assert.Equal(values[0], destArr[4]); + Assert.Equal(sentinel, destArr[0]); + } + + [Fact] + public void ContainsDoesNotMatchDefaultValueInArrayWithNullValue() + { + Int64Array array = new Int64Array.Builder().Append(1).Append(2).AppendNull().Build(); + Assert.NotNull(array); + var collection = (ICollection)array; + + Assert.True(collection.Contains(1)); + Assert.True(collection.Contains(2)); + Assert.True(collection.Contains(default)); + // A null value is stored as a null bit in the null bitmap, and a default value in the value buffer. Check that we do not match the default value. 
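In other words (a small hedged sketch, not part of the patch, mirroring the assertion that follows): for an Int64Array built as { 1, 2, null }, the zero that physically backs the null slot must not be reported as a contained value, while the null itself is.

using System;
using System.Collections.Generic;
using Apache.Arrow;

// { 1, 2, null }: the null slot is a cleared validity bit plus a default 0 in the value buffer.
Int64Array array = new Int64Array.Builder().Append(1).Append(2).AppendNull().Build();
var collection = (ICollection<long?>)array;
Console.WriteLine(collection.Contains(null));  // True  (a null slot exists)
Console.WriteLine(collection.Contains(0));     // False (0 is only the placeholder behind that null)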
+ Assert.False(collection.Contains(0)); } [Fact] diff --git a/csharp/test/Apache.Arrow.Tests/Date32ArrayTests.cs b/csharp/test/Apache.Arrow.Tests/Date32ArrayTests.cs index 2a674b942c17b..6e4742cad06f2 100644 --- a/csharp/test/Apache.Arrow.Tests/Date32ArrayTests.cs +++ b/csharp/test/Apache.Arrow.Tests/Date32ArrayTests.cs @@ -131,7 +131,7 @@ public void AppendGivesUtcDate(DateTimeOffset dateTimeOffset) public class AppendDateOnly { [Theory] - [MemberData(nameof(GetDateOnlyData), MemberType = typeof(Date64ArrayTests))] + [MemberData(nameof(GetDateOnlyData), MemberType = typeof(Date32ArrayTests))] public void AppendDateGivesSameDate(DateOnly date) { // Arrange diff --git a/csharp/test/Apache.Arrow.Tests/DurationArrayTests.cs b/csharp/test/Apache.Arrow.Tests/DurationArrayTests.cs index 59080d739b10b..412f67de5f0fb 100644 --- a/csharp/test/Apache.Arrow.Tests/DurationArrayTests.cs +++ b/csharp/test/Apache.Arrow.Tests/DurationArrayTests.cs @@ -115,7 +115,7 @@ public void AppendTimeSpanGivesSameTimeSpan(TimeSpan? timeSpan, DurationType typ Assert.Equal(timeSpan, array.GetTimeSpan(0)); IReadOnlyList asList = array; - Assert.Equal(1, asList.Count); + Assert.Single(asList); Assert.Equal(timeSpan, asList[0]); } } diff --git a/csharp/test/Apache.Arrow.Tests/Extensions/DateTimeOffsetExtensions.cs b/csharp/test/Apache.Arrow.Tests/Extensions/DateTimeOffsetExtensions.cs index 4375c39cdfaf6..01809735d14c9 100644 --- a/csharp/test/Apache.Arrow.Tests/Extensions/DateTimeOffsetExtensions.cs +++ b/csharp/test/Apache.Arrow.Tests/Extensions/DateTimeOffsetExtensions.cs @@ -14,8 +14,6 @@ // limitations under the License. using System; -using System.Collections.Generic; -using System.Text; namespace Apache.Arrow.Tests { diff --git a/csharp/test/Apache.Arrow.Tests/UnionArrayTests.cs b/csharp/test/Apache.Arrow.Tests/UnionArrayTests.cs index 712a87a252b6c..c603ef63a4d3e 100644 --- a/csharp/test/Apache.Arrow.Tests/UnionArrayTests.cs +++ b/csharp/test/Apache.Arrow.Tests/UnionArrayTests.cs @@ -110,7 +110,7 @@ private static void CompareValue(UnionArray originalArray, int originalIndex, Un } private static void CompareFieldValue(byte typeId, UnionArray originalArray, int originalIndex, UnionArray slicedArray, int sliceIndex) - where T: struct + where T : struct, IEquatable where TArray : PrimitiveArray { if (originalArray is DenseUnionArray denseOriginalArray) diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py index 5fa41e28a3208..cd746f9c4499a 100644 --- a/dev/archery/archery/cli.py +++ b/dev/archery/archery/cli.py @@ -261,6 +261,7 @@ def build(ctx, src, build_dir, force, targets, **kwargs): "Check all sources files for license texts via Apache RAT."), LintCheck('r', "Lint R files."), LintCheck('docker', "Lint Dockerfiles with hadolint."), + LintCheck('docs', "Lint docs with sphinx-lint."), ] @@ -285,9 +286,10 @@ def decorate_lint_command(cmd): help="Run IWYU on all C++ files if enabled") @click.option("-a", "--all", is_flag=True, default=False, help="Enable all checks.") +@click.argument("path", required=False) @decorate_lint_command @click.pass_context -def lint(ctx, src, fix, iwyu_all, **checks): +def lint(ctx, src, fix, iwyu_all, path, **checks): if checks.pop('all'): # "--all" is given => enable all non-selected checks for k, v in checks.items(): @@ -297,7 +299,7 @@ def lint(ctx, src, fix, iwyu_all, **checks): raise click.UsageError( "Need to enable at least one lint check (try --help)") try: - linter(src, fix, iwyu_all=iwyu_all, **checks) + linter(src, fix, iwyu_all=iwyu_all, path=path, **checks) 
except LintValidationException: sys.exit(1) @@ -736,6 +738,9 @@ def _set_default(opt, default): help='Include JavaScript in integration tests') @click.option('--with-go', type=bool, default=False, help='Include Go in integration tests') +@click.option('--with-nanoarrow', type=bool, default=False, + help='Include nanoarrow in integration tests', + envvar="ARCHERY_INTEGRATION_WITH_NANOARROW") @click.option('--with-rust', type=bool, default=False, help='Include Rust in integration tests', envvar="ARCHERY_INTEGRATION_WITH_RUST") @@ -774,7 +779,7 @@ def integration(with_all=False, random_seed=12345, **args): gen_path = args['write_generated_json'] - languages = ['cpp', 'csharp', 'java', 'js', 'go', 'rust'] + languages = ['cpp', 'csharp', 'java', 'js', 'go', 'nanoarrow', 'rust'] formats = ['ipc', 'flight', 'c_data'] enabled_languages = 0 diff --git a/dev/archery/archery/docker/core.py b/dev/archery/archery/docker/core.py index 7376bb0a3b72d..cb831060022a4 100644 --- a/dev/archery/archery/docker/core.py +++ b/dev/archery/archery/docker/core.py @@ -371,6 +371,10 @@ def run(self, service_name, command=None, *, env=None, volumes=None, v = "{}:{}".format(v['source'], v['target']) args.extend(['-v', v]) + # append capabilities from the compose conf + for c in service.get('cap_add', []): + args.extend([f'--cap-add={c}']) + # infer whether an interactive shell is desired or not if command in ['cmd.exe', 'bash', 'sh', 'powershell']: args.append('-it') diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py index 5cae907a4aa71..f6302165cd5a0 100644 --- a/dev/archery/archery/integration/datagen.py +++ b/dev/archery/archery/integration/datagen.py @@ -1928,17 +1928,20 @@ def _temp_path(): .skip_tester('C#') .skip_tester('Java') .skip_tester('JS') + .skip_tester('nanoarrow') .skip_tester('Rust'), generate_binary_view_case() .skip_tester('Java') .skip_tester('JS') + .skip_tester('nanoarrow') .skip_tester('Rust'), generate_list_view_case() .skip_tester('C#') # Doesn't support large list views .skip_tester('Java') .skip_tester('JS') + .skip_tester('nanoarrow') .skip_tester('Rust'), generate_extension_case() diff --git a/dev/archery/archery/integration/runner.py b/dev/archery/archery/integration/runner.py index 5b66842b25926..0ea244720cc1d 100644 --- a/dev/archery/archery/integration/runner.py +++ b/dev/archery/archery/integration/runner.py @@ -36,6 +36,7 @@ from .tester_java import JavaTester from .tester_js import JSTester from .tester_csharp import CSharpTester +from .tester_nanoarrow import NanoarrowTester from .util import guid, printer from .util import SKIP_C_ARRAY, SKIP_C_SCHEMA, SKIP_FLIGHT, SKIP_IPC from ..utils.source import ARROW_ROOT_DEFAULT @@ -541,8 +542,8 @@ def get_static_json_files(): def run_all_tests(with_cpp=True, with_java=True, with_js=True, with_csharp=True, with_go=True, with_rust=False, - run_ipc=False, run_flight=False, run_c_data=False, - tempdir=None, **kwargs): + with_nanoarrow=False, run_ipc=False, run_flight=False, + run_c_data=False, tempdir=None, **kwargs): tempdir = tempdir or tempfile.mkdtemp(prefix='arrow-integration-') testers: List[Tester] = [] @@ -562,6 +563,9 @@ def run_all_tests(with_cpp=True, with_java=True, with_js=True, if with_go: testers.append(GoTester(**kwargs)) + if with_nanoarrow: + testers.append(NanoarrowTester(**kwargs)) + if with_rust: testers.append(RustTester(**kwargs)) diff --git a/dev/archery/archery/integration/tester_java.py b/dev/archery/archery/integration/tester_java.py index 8e7a0bb99f9de..ccc807410a848 
100644 --- a/dev/archery/archery/integration/tester_java.py +++ b/dev/archery/archery/integration/tester_java.py @@ -18,17 +18,23 @@ import contextlib import functools import os +from pathlib import Path import subprocess from . import cdata from .tester import Tester, CDataExporter, CDataImporter from .util import run_cmd, log -from ..utils.source import ARROW_ROOT_DEFAULT + + +ARROW_BUILD_ROOT = os.environ.get( + 'ARROW_BUILD_ROOT', + Path(__file__).resolve().parents[5] +) def load_version_from_pom(): import xml.etree.ElementTree as ET - tree = ET.parse(os.path.join(ARROW_ROOT_DEFAULT, 'java', 'pom.xml')) + tree = ET.parse(os.path.join(ARROW_BUILD_ROOT, 'java', 'pom.xml')) tag_pattern = '{http://maven.apache.org/POM/4.0.0}version' version_tag = list(tree.getroot().findall(tag_pattern))[0] return version_tag.text @@ -48,7 +54,7 @@ def load_version_from_pom(): _ARROW_TOOLS_JAR = os.environ.get( "ARROW_JAVA_INTEGRATION_JAR", os.path.join( - ARROW_ROOT_DEFAULT, + ARROW_BUILD_ROOT, "java/tools/target", f"arrow-tools-{_arrow_version}-jar-with-dependencies.jar" ) @@ -56,7 +62,7 @@ def load_version_from_pom(): _ARROW_C_DATA_JAR = os.environ.get( "ARROW_C_DATA_JAVA_INTEGRATION_JAR", os.path.join( - ARROW_ROOT_DEFAULT, + ARROW_BUILD_ROOT, "java/c/target", f"arrow-c-data-{_arrow_version}.jar" ) @@ -64,7 +70,7 @@ def load_version_from_pom(): _ARROW_FLIGHT_JAR = os.environ.get( "ARROW_FLIGHT_JAVA_INTEGRATION_JAR", os.path.join( - ARROW_ROOT_DEFAULT, + ARROW_BUILD_ROOT, "java/flight/flight-integration-tests/target", f"flight-integration-tests-{_arrow_version}-jar-with-dependencies.jar" ) diff --git a/dev/archery/archery/integration/tester_js.py b/dev/archery/archery/integration/tester_js.py index c7f363ba54687..3d1a229931cde 100644 --- a/dev/archery/archery/integration/tester_js.py +++ b/dev/archery/archery/integration/tester_js.py @@ -16,13 +16,17 @@ # under the License. import os +from pathlib import Path from .tester import Tester from .util import run_cmd, log -from ..utils.source import ARROW_ROOT_DEFAULT -ARROW_JS_ROOT = os.path.join(ARROW_ROOT_DEFAULT, 'js') +ARROW_BUILD_ROOT = os.environ.get( + 'ARROW_BUILD_ROOT', + Path(__file__).resolve().parents[5] +) +ARROW_JS_ROOT = os.path.join(ARROW_BUILD_ROOT, 'js') _EXE_PATH = os.path.join(ARROW_JS_ROOT, 'bin') _VALIDATE = os.path.join(_EXE_PATH, 'integration.ts') _JSON_TO_ARROW = os.path.join(_EXE_PATH, 'json-to-arrow.ts') diff --git a/dev/archery/archery/integration/tester_nanoarrow.py b/dev/archery/archery/integration/tester_nanoarrow.py new file mode 100644 index 0000000000000..30ff1bb6e50a7 --- /dev/null +++ b/dev/archery/archery/integration/tester_nanoarrow.py @@ -0,0 +1,148 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import functools +import os + +from . 
import cdata +from .tester import Tester, CDataExporter, CDataImporter +from ..utils.source import ARROW_ROOT_DEFAULT + + +_NANOARROW_PATH = os.environ.get( + "ARROW_NANOARROW_PATH", + os.path.join(ARROW_ROOT_DEFAULT, "nanoarrow/cdata"), +) + +_INTEGRATION_DLL = os.path.join( + _NANOARROW_PATH, "libnanoarrow_c_data_integration" + cdata.dll_suffix +) + + +class NanoarrowTester(Tester): + PRODUCER = False + CONSUMER = False + FLIGHT_SERVER = False + FLIGHT_CLIENT = False + C_DATA_SCHEMA_EXPORTER = True + C_DATA_ARRAY_EXPORTER = True + C_DATA_SCHEMA_IMPORTER = True + C_DATA_ARRAY_IMPORTER = True + + name = "nanoarrow" + + def validate(self, json_path, arrow_path, quirks=None): + raise NotImplementedError() + + def json_to_file(self, json_path, arrow_path): + raise NotImplementedError() + + def stream_to_file(self, stream_path, file_path): + raise NotImplementedError() + + def file_to_stream(self, file_path, stream_path): + raise NotImplementedError() + + def make_c_data_exporter(self): + return NanoarrowCDataExporter(self.debug, self.args) + + def make_c_data_importer(self): + return NanoarrowCDataImporter(self.debug, self.args) + + +_nanoarrow_c_data_entrypoints = """ + const char* nanoarrow_CDataIntegration_ExportSchemaFromJson( + const char* json_path, struct ArrowSchema* out); + + const char* nanoarrow_CDataIntegration_ImportSchemaAndCompareToJson( + const char* json_path, struct ArrowSchema* schema); + + const char* nanoarrow_CDataIntegration_ExportBatchFromJson( + const char* json_path, int num_batch, struct ArrowArray* out); + + const char* nanoarrow_CDataIntegration_ImportBatchAndCompareToJson( + const char* json_path, int num_batch, struct ArrowArray* batch); + + int64_t nanoarrow_BytesAllocated(void); + """ + + +@functools.lru_cache +def _load_ffi(ffi, lib_path=_INTEGRATION_DLL): + ffi.cdef(_nanoarrow_c_data_entrypoints) + dll = ffi.dlopen(lib_path) + return dll + + +class _CDataBase: + def __init__(self, debug, args): + self.debug = debug + self.args = args + self.ffi = cdata.ffi() + self.dll = _load_ffi(self.ffi) + + def _check_nanoarrow_error(self, na_error): + """ + Check a `const char*` error return from an integration entrypoint. + + A null means success, a non-empty string is an error message. + The string is statically allocated on the nanoarrow side and does not + need to be released. 
+ """ + assert self.ffi.typeof(na_error) is self.ffi.typeof("const char*") + if na_error != self.ffi.NULL: + error = self.ffi.string(na_error).decode("utf8", errors="replace") + raise RuntimeError(f"nanoarrow C Data Integration call failed: {error}") + + +class NanoarrowCDataExporter(CDataExporter, _CDataBase): + def export_schema_from_json(self, json_path, c_schema_ptr): + na_error = self.dll.nanoarrow_CDataIntegration_ExportSchemaFromJson( + str(json_path).encode(), c_schema_ptr + ) + self._check_nanoarrow_error(na_error) + + def export_batch_from_json(self, json_path, num_batch, c_array_ptr): + na_error = self.dll.nanoarrow_CDataIntegration_ExportBatchFromJson( + str(json_path).encode(), num_batch, c_array_ptr + ) + self._check_nanoarrow_error(na_error) + + @property + def supports_releasing_memory(self): + return True + + def record_allocation_state(self): + return self.dll.nanoarrow_BytesAllocated() + + +class NanoarrowCDataImporter(CDataImporter, _CDataBase): + def import_schema_and_compare_to_json(self, json_path, c_schema_ptr): + na_error = self.dll.nanoarrow_CDataIntegration_ImportSchemaAndCompareToJson( + str(json_path).encode(), c_schema_ptr + ) + self._check_nanoarrow_error(na_error) + + def import_batch_and_compare_to_json(self, json_path, num_batch, c_array_ptr): + na_error = self.dll.nanoarrow_CDataIntegration_ImportBatchAndCompareToJson( + str(json_path).encode(), num_batch, c_array_ptr + ) + self._check_nanoarrow_error(na_error) + + @property + def supports_releasing_memory(self): + return True diff --git a/dev/archery/archery/utils/lint.py b/dev/archery/archery/utils/lint.py index 15f22ca2e6e5c..108c9ded361e7 100644 --- a/dev/archery/archery/utils/lint.py +++ b/dev/archery/archery/utils/lint.py @@ -436,10 +436,55 @@ def docker_linter(src): cwd=src.path)) -def linter(src, fix=False, *, clang_format=False, cpplint=False, +class SphinxLint(Command): + def __init__(self, src, path=None, sphinx_lint_bin=None, disable=None, enable=None): + self.src = src + self.path = path + self.bin = default_bin(sphinx_lint_bin, "sphinx-lint") + self.disable = disable or "all" + self.enable = enable + + def lint(self, *args, check=False): + docs_path = os.path.join(self.src.path, "docs") + + args = [] + + if self.disable: + args.extend(["--disable", self.disable]) + + if self.enable: + args.extend(["--enable", self.enable]) + + if self.path is not None: + args.extend([self.path]) + else: + args.extend([docs_path]) + + return self.run(*args, check=check) + + +def docs_linter(src, path=None): + """Run sphinx-lint on docs.""" + logger.info("Running docs linter (sphinx-lint)") + + sphinx_lint = SphinxLint( + src, + path=path, + disable="all", + enable="trailing-whitespace,missing-final-newline" + ) + + if not sphinx_lint.available: + logger.error("sphinx-lint linter requested but sphinx-lint binary not found") + return + + yield LintResult.from_cmd(sphinx_lint.lint()) + + +def linter(src, fix=False, path=None, *, clang_format=False, cpplint=False, clang_tidy=False, iwyu=False, iwyu_all=False, python=False, numpydoc=False, cmake_format=False, rat=False, - r=False, docker=False): + r=False, docker=False, docs=False): """Run all linters.""" with tmpdir(prefix="arrow-lint-") as root: build_dir = os.path.join(root, "cpp-build") @@ -481,6 +526,9 @@ def linter(src, fix=False, *, clang_format=False, cpplint=False, if docker: results.extend(docker_linter(src)) + if docs: + results.extend(docs_linter(src, path)) + # Raise error if one linter failed, ensuring calling code can exit with # non-zero. 
for result in results: diff --git a/dev/archery/setup.py b/dev/archery/setup.py index 23a1600910d04..cd3e2e9ca0834 100755 --- a/dev/archery/setup.py +++ b/dev/archery/setup.py @@ -41,7 +41,7 @@ 'integration': ['cffi'], 'integration-java': ['jpype1'], 'lint': ['numpydoc==1.1.0', 'autopep8', 'flake8==6.1.0', 'cython-lint', - 'cmake_format==0.6.13'], + 'cmake_format==0.6.13', 'sphinx-lint==0.9.1'], 'numpydoc': ['numpydoc==1.1.0'], 'release': ['pygithub', jinja_req, 'jira', 'semver', 'gitpython'], } diff --git a/dev/conbench_envs/benchmarks.env b/dev/conbench_envs/benchmarks.env index 2a5a9c32a86ec..3af29491a8345 100644 --- a/dev/conbench_envs/benchmarks.env +++ b/dev/conbench_envs/benchmarks.env @@ -31,7 +31,6 @@ ARROW_HOME=$CONDA_PREFIX ARROW_INSTALL_NAME_RPATH=ON ARROW_JEMALLOC=OFF ARROW_MIMALLOC=ON -ARROW_NO_DEPRECATED_API=ON ARROW_ORC=ON ARROW_PARQUET=ON ARROW_PYTHON=ON diff --git a/dev/conbench_envs/hooks.sh b/dev/conbench_envs/hooks.sh index a77189764aed3..0745357d2c0d3 100755 --- a/dev/conbench_envs/hooks.sh +++ b/dev/conbench_envs/hooks.sh @@ -59,7 +59,8 @@ build_arrow_cpp() { } build_arrow_python() { - ci/scripts/python_build.sh $(pwd) $(pwd) + mkdir -p /tmp/arrow + ci/scripts/python_build.sh $(pwd) /tmp/arrow } build_arrow_r() { @@ -69,7 +70,8 @@ build_arrow_r() { } build_arrow_java() { - ci/scripts/java_build.sh $(pwd) $(pwd) + mkdir -p /tmp/arrow + ci/scripts/java_build.sh $(pwd) /tmp/arrow } install_archery() { diff --git a/dev/release/binary-task.rb b/dev/release/binary-task.rb index 8fcdcf1f5f442..c2386a1f52f21 100644 --- a/dev/release/binary-task.rb +++ b/dev/release/binary-task.rb @@ -1083,7 +1083,6 @@ def apt_release_repositories_dir def available_apt_targets [ - ["debian", "bullseye", "main"], ["debian", "bookworm", "main"], ["debian", "trixie", "main"], ["ubuntu", "focal", "main"], @@ -2111,8 +2110,6 @@ def apt_test_targets_default # Disable arm64 targets by default for now # because they require some setups on host. 
[ - "debian-bullseye", - # "debian-bullseye-arm64", "debian-bookworm", # "debian-bookworm-arm64", "debian-trixie", diff --git a/dev/release/post-08-docs.sh b/dev/release/post-08-docs.sh index c59f9b96857a6..58a462551f199 100755 --- a/dev/release/post-08-docs.sh +++ b/dev/release/post-08-docs.sh @@ -72,13 +72,28 @@ fi # delete current stable docs and restore all previous versioned docs rm -rf docs/* git checkout "${versioned_paths[@]}" +# Download and untar released docs in a temp folder +rm -rf docs_new +mkdir docs_new +pushd docs_new curl \ --fail \ --location \ --remote-name \ https://apache.jfrog.io/artifactory/arrow/docs/${version}/docs.tar.gz tar xvf docs.tar.gz -rm -f docs.tar.gz +# Update DOCUMENTATION_OPTIONS.show_version_warning_banner +find docs \ + -type f \ + -exec \ + sed -i.bak \ + -e "s/DOCUMENTATION_OPTIONS.show_version_warning_banner = true/DOCUMENTATION_OPTIONS.show_version_warning_banner = false/g" \ + {} \; +find ./ -name '*.bak' -delete +popd +mv docs_new/docs/* docs/ +rm -rf docs_new + if [ "$is_major_release" = "yes" ] ; then previous_series=${previous_version%.*} mv docs_temp docs/${previous_series} diff --git a/dev/release/verify-apt.sh b/dev/release/verify-apt.sh index cbb6d93823b21..8c54fe5c11cf1 100755 --- a/dev/release/verify-apt.sh +++ b/dev/release/verify-apt.sh @@ -80,12 +80,6 @@ esac workaround_missing_packages=() case "${distribution}-${code_name}" in - debian-bullseye) - sed \ - -i"" \ - -e "s/ main$/ main contrib non-free/g" \ - /etc/apt/sources.list - ;; debian-*) sed \ -i"" \ diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index 95be4800f7ffd..3ed871bd5305b 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -185,9 +185,7 @@ test_binary() { test_apt() { show_header "Testing APT packages" - for target in "debian:bullseye" \ - "arm64v8/debian:bullseye" \ - "debian:bookworm" \ + for target in "debian:bookworm" \ "arm64v8/debian:bookworm" \ "debian:trixie" \ "arm64v8/debian:trixie" \ diff --git a/dev/tasks/java-jars/github.yml b/dev/tasks/java-jars/github.yml index 0437ee7864979..eb9478ebaa6ef 100644 --- a/dev/tasks/java-jars/github.yml +++ b/dev/tasks/java-jars/github.yml @@ -80,7 +80,7 @@ jobs: fail-fast: false matrix: platform: - - { runs_on: ["macos-latest"], arch: "x86_64"} + - { runs_on: ["macos-13"], arch: "x86_64"} - { runs_on: ["macos-14"], arch: "aarch_64" } env: MACOSX_DEPLOYMENT_TARGET: "10.15" diff --git a/dev/tasks/linux-packages/apache-arrow-apt-source/apt/debian-bullseye/Dockerfile b/dev/tasks/linux-packages/apache-arrow-apt-source/apt/debian-bullseye/Dockerfile deleted file mode 100644 index b0842a0c0d6ff..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow-apt-source/apt/debian-bullseye/Dockerfile +++ /dev/null @@ -1,41 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. - -FROM debian:bullseye - -RUN \ - echo "debconf debconf/frontend select Noninteractive" | \ - debconf-set-selections - -RUN \ - echo 'APT::Install-Recommends "false";' > \ - /etc/apt/apt.conf.d/disable-install-recommends - -ARG DEBUG - -RUN \ - quiet=$([ "${DEBUG}" = "yes" ] || echo "-qq") && \ - apt update ${quiet} && \ - apt install -y -V ${quiet} \ - build-essential \ - debhelper \ - devscripts \ - fakeroot \ - gnupg \ - lsb-release && \ - apt clean && \ - rm -rf /var/lib/apt/lists/* diff --git a/dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog b/dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog index 60e745301d9db..04aa586dc3c96 100644 --- a/dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog +++ b/dev/tasks/linux-packages/apache-arrow-apt-source/debian/changelog @@ -1,3 +1,9 @@ +apache-arrow-apt-source (16.1.0-1) unstable; urgency=low + + * New upstream release. + + -- Raúl Cumplido Thu, 09 May 2024 07:21:29 -0000 + apache-arrow-apt-source (16.0.0-1) unstable; urgency=low * New upstream release. diff --git a/dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in b/dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in index 676c9e0d16dea..f0eb785dd6bc7 100644 --- a/dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in +++ b/dev/tasks/linux-packages/apache-arrow-release/yum/apache-arrow-release.spec.in @@ -102,6 +102,9 @@ else fi %changelog +* Thu May 09 2024 Raúl Cumplido - 16.1.0-1 +- New upstream release. + * Tue Apr 16 2024 Raúl Cumplido - 16.0.0-1 - New upstream release. diff --git a/dev/tasks/linux-packages/apache-arrow/apt/debian-bullseye-arm64/from b/dev/tasks/linux-packages/apache-arrow/apt/debian-bullseye-arm64/from deleted file mode 100644 index 34187b2af5a74..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/apt/debian-bullseye-arm64/from +++ /dev/null @@ -1,18 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -arm64v8/debian:bullseye diff --git a/dev/tasks/linux-packages/apache-arrow/apt/debian-bullseye/Dockerfile b/dev/tasks/linux-packages/apache-arrow/apt/debian-bullseye/Dockerfile deleted file mode 100644 index 2edcd4d5ed216..0000000000000 --- a/dev/tasks/linux-packages/apache-arrow/apt/debian-bullseye/Dockerfile +++ /dev/null @@ -1,87 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -ARG FROM=debian:bullseye -FROM ${FROM} - -RUN \ - echo "debconf debconf/frontend select Noninteractive" | \ - debconf-set-selections - -RUN \ - echo "deb http://deb.debian.org/debian bullseye-backports main" > \ - /etc/apt/sources.list.d/backports.list - -RUN \ - echo 'APT::Install-Recommends "false";' > \ - /etc/apt/apt.conf.d/disable-install-recommends - -RUN sed -i'' -e 's/main$/main contrib non-free/g' /etc/apt/sources.list - -ARG DEBUG -RUN \ - quiet=$([ "${DEBUG}" = "yes" ] || echo "-qq") && \ - apt update ${quiet} && \ - apt install -y -V ${quiet} \ - build-essential \ - clang \ - cmake \ - debhelper \ - devscripts \ - git \ - libboost-filesystem-dev \ - libboost-system-dev \ - libbrotli-dev \ - libbz2-dev \ - libc-ares-dev \ - libcurl4-openssl-dev \ - libgirepository1.0-dev \ - libglib2.0-doc \ - libgmock-dev \ - libgoogle-glog-dev \ - libgrpc++-dev \ - libgtest-dev \ - liblz4-dev \ - libprotobuf-dev \ - libprotoc-dev \ - libre2-dev \ - libsnappy-dev \ - libssl-dev \ - libthrift-dev \ - libutf8proc-dev \ - libzstd-dev \ - llvm-dev \ - lsb-release \ - ninja-build \ - nlohmann-json3-dev \ - pkg-config \ - protobuf-compiler-grpc \ - python3-dev \ - python3-pip \ - rapidjson-dev \ - tzdata \ - valac \ - zlib1g-dev && \ - if apt list | grep '^nvidia-cuda-toolkit/'; then \ - apt install -y -V ${quiet} nvidia-cuda-toolkit; \ - fi && \ - apt install -y -V -t bullseye-backports ${quiet} \ - meson && \ - pip3 install gi-docgen && \ - ln -fs /usr/local/bin/gi-docgen /usr/bin && \ - apt clean && \ - rm -rf /var/lib/apt/lists/* diff --git a/dev/tasks/linux-packages/apache-arrow/debian/changelog b/dev/tasks/linux-packages/apache-arrow/debian/changelog index e255e84096e4e..35cc598fe6f87 100644 --- a/dev/tasks/linux-packages/apache-arrow/debian/changelog +++ b/dev/tasks/linux-packages/apache-arrow/debian/changelog @@ -1,3 +1,9 @@ +apache-arrow (16.1.0-1) unstable; urgency=low + + * New upstream release. + + -- Raúl Cumplido Thu, 09 May 2024 07:21:29 -0000 + apache-arrow (16.0.0-1) unstable; urgency=low * New upstream release. diff --git a/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in b/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in index 3ede1814b865d..c6148e9260586 100644 --- a/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in +++ b/dev/tasks/linux-packages/apache-arrow/yum/arrow.spec.in @@ -881,6 +881,9 @@ Documentation for Apache Parquet GLib. %endif %changelog +* Thu May 09 2024 Raúl Cumplido - 16.1.0-1 +- New upstream release. + * Tue Apr 16 2024 Raúl Cumplido - 16.0.0-1 - New upstream release. diff --git a/dev/tasks/linux-packages/package-task.rb b/dev/tasks/linux-packages/package-task.rb index 3a9e5e48b4585..6bcc397277e3a 100644 --- a/dev/tasks/linux-packages/package-task.rb +++ b/dev/tasks/linux-packages/package-task.rb @@ -267,8 +267,6 @@ def apt_targets_default # Disable arm64 targets by default for now # because they require some setups on host. 
[ - "debian-bullseye", - # "debian-bullseye-arm64", "debian-bookworm", # "debian-bookworm-arm64", "debian-trixie", diff --git a/dev/tasks/matlab/github.yml b/dev/tasks/matlab/github.yml index 13fa36b501125..963c85f6e11bf 100644 --- a/dev/tasks/matlab/github.yml +++ b/dev/tasks/matlab/github.yml @@ -29,9 +29,9 @@ jobs: - name: Install ninja-build run: sudo apt-get update && sudo apt-get install ninja-build - name: Install MATLAB - uses: matlab-actions/setup-matlab@v1 + uses: matlab-actions/setup-matlab@v2 with: - release: R2023a + release: R2024a - name: Build MATLAB Interface env: {{ macros.github_set_sccache_envvars()|indent(8) }} @@ -66,9 +66,9 @@ jobs: - name: Install ninja-build run: brew install ninja - name: Install MATLAB - uses: matlab-actions/setup-matlab@v1 + uses: matlab-actions/setup-matlab@v2 with: - release: R2023a + release: R2024a - name: Build MATLAB Interface env: {{ macros.github_set_sccache_envvars()|indent(8) }} @@ -101,9 +101,9 @@ jobs: steps: {{ macros.github_checkout_arrow()|indent }} - name: Install MATLAB - uses: matlab-actions/setup-matlab@v1 + uses: matlab-actions/setup-matlab@v2 with: - release: R2023a + release: R2024a - name: Install sccache shell: bash run: arrow/ci/scripts/install_sccache.sh pc-windows-msvc $(pwd)/sccache @@ -147,16 +147,16 @@ jobs: cp arrow/LICENSE.txt arrow/matlab/install/arrow_matlab/LICENSE.txt cp arrow/NOTICE.txt arrow/matlab/install/arrow_matlab/NOTICE.txt - name: Install MATLAB - uses: matlab-actions/setup-matlab@v1 + uses: matlab-actions/setup-matlab@v2 with: - release: R2023a + release: R2024a - name: Run commands env: MATLABPATH: arrow/matlab/tools ARROW_MATLAB_TOOLBOX_FOLDER: arrow/matlab/install/arrow_matlab ARROW_MATLAB_TOOLBOX_OUTPUT_FOLDER: artifacts/matlab-dist ARROW_MATLAB_TOOLBOX_VERSION: {{ arrow.no_rc_version }} - uses: matlab-actions/run-command@v1 + uses: matlab-actions/run-command@v2 with: command: packageMatlabInterface {{ macros.github_upload_releases(["artifacts/matlab-dist/*.mltbx"])|indent }} diff --git a/dev/tasks/r/azure.linux.yml b/dev/tasks/r/azure.linux.yml index e26a59629fa1a..28893a81728c3 100644 --- a/dev/tasks/r/azure.linux.yml +++ b/dev/tasks/r/azure.linux.yml @@ -38,7 +38,6 @@ jobs: export R_ORG={{ r_org }} export R_IMAGE={{ r_image }} export R_TAG={{ r_tag }} - export DEVTOOLSET_VERSION={{ devtoolset_version|default("") }} export R_CUSTOM_CCACHE={{ r_custom_ccache|default("false") }} docker-compose pull --ignore-pull-failures r docker-compose build r diff --git a/dev/tasks/r/github.packages.yml b/dev/tasks/r/github.packages.yml index 9ca7e59a957de..41d8b230f8bf4 100644 --- a/dev/tasks/r/github.packages.yml +++ b/dev/tasks/r/github.packages.yml @@ -299,14 +299,14 @@ jobs: # choosing a binary on this OS. If libarrow_binary is TRUE, we're on # an OS that is not in the allowlist, so we have to opt-in to use the # binary. Other env vars used in r_docker_configure.sh can be added - # here (like devtoolset) and wired up in the later steps. + # here and wired up in the later steps. 
- {image: "rhub/ubuntu-clang", libarrow_binary: "TRUE"} # fedora-clang-devel cannot use binaries bc of libc++ (uncomment to see the error) # - {image: "rhub/fedora-clang-devel", libarrow_binary: "TRUE"} - {image: "rhub/ubuntu-release"} # currently ubuntu-22.04 - {image: "rocker/r-ver:4.0.0"} # ubuntu-20.04 - - {image: "rstudio/r-base:4.1-focal"} # ubuntu-20.04 - - {image: "rstudio/r-base:4.2-centos7", devtoolset: "8"} + - {image: "rstudio/r-base:4.1-focal"} + - {image: "rstudio/r-base:4.2-jammy"} - {image: "rstudio/r-base:4.3-noble"} steps: # Get the arrow checkout just for the docker config scripts @@ -317,7 +317,6 @@ jobs: - name: Install system requirements env: ARROW_R_DEV: "TRUE" # To install curl/openssl in r_docker_configure.sh - DEVTOOLSET_VERSION: {{ '${{ matrix.config.devtoolset }}' }} shell: bash run: | # Make sure R is on the path for the R-hub devel versions (where RPREFIX is set in its dockerfile) diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 52a235c688eda..d8e09ec2070bb 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -65,7 +65,7 @@ groups: - r-binary-packages - ubuntu-* - wheel-* - - test-ubuntu-*-docs + - test-debian-*-docs {############################# Testing tasks #################################} @@ -409,7 +409,7 @@ tasks: arrow_jemalloc: "ON" python_version: "{{ python_version }}" macos_deployment_target: "{{ macos_version }}" - runs_on: "macos-latest" + runs_on: "macos-13" vcpkg_arch: "amd64" artifacts: - pyarrow-{no_rc_version}-{{ python_tag }}-{{ abi_tag }}-{{ platform_tag }}.whl @@ -451,8 +451,7 @@ tasks: {############################## Linux PKGS ####################################} -{% for target in ["debian-bullseye", - "debian-bookworm", +{% for target in ["debian-bookworm", "debian-trixie", "ubuntu-focal", "ubuntu-jammy", @@ -1359,7 +1358,7 @@ tasks: {% for r_org, r_image, r_tag in [("rhub", "ubuntu-release", "latest"), ("rocker", "r-ver", "latest"), ("rstudio", "r-base", "4.2-focal"), - ("rstudio", "r-base", "4.1-opensuse153")] %} + ("rstudio", "r-base", "4.1-opensuse155")] %} test-r-{{ r_org }}-{{ r_image }}-{{ r_tag }}: ci: azure template: r/azure.linux.yml @@ -1411,15 +1410,6 @@ tasks: GCC_VERSION: 12 image: ubuntu-r-only-r - test-r-rstudio-r-base-4.2-centos7-devtoolset-8: - ci: azure - template: r/azure.linux.yml - params: - r_org: rstudio - r_image: r-base - r_tag: 4.2-centos7 - devtoolset_version: 8 - test-r-minimal-build: ci: azure template: r/azure.linux.yml @@ -1437,13 +1427,13 @@ tasks: R_PRUNE_DEPS: TRUE image: ubuntu-r-sanitizer - test-fedora-r-clang-sanitizer: + test-r-clang-sanitizer: ci: github template: docker-tests/github.linux.yml params: env: R_PRUNE_DEPS: TRUE - image: fedora-r-clang-sanitizer + image: r-clang-sanitizer {% for go_version, staticcheck in [("1.21", "v0.4.7"), ("1.22", "latest")] %} test-debian-12-go-{{ go_version }}: @@ -1458,15 +1448,15 @@ tasks: {% endfor %} # be sure to update binary-task.rb when upgrading ubuntu - test-ubuntu-22.04-docs: + test-debian-12-docs: ci: github template: docs/github.linux.yml params: env: - UBUNTU: 22.04 + JDK: 17 pr_number: Unset flags: "-v $PWD/build/:/build/" - image: ubuntu-docs + image: debian-docs publish: false artifacts: - docs.tar.gz @@ -1594,8 +1584,8 @@ tasks: template: docs/github.linux.yml params: env: - UBUNTU: 22.04 + JDK: 17 pr_number: Unset flags: "-v $PWD/build/:/build/" - image: ubuntu-docs + image: debian-docs publish: true diff --git a/dev/tasks/verify-rc/github.linux.amd64.docker.yml b/dev/tasks/verify-rc/github.linux.amd64.docker.yml index 
65b30b5c8d4df..7a28ba705dd50 100644 --- a/dev/tasks/verify-rc/github.linux.amd64.docker.yml +++ b/dev/tasks/verify-rc/github.linux.amd64.docker.yml @@ -43,7 +43,7 @@ jobs: -e TEST_{{ target|upper }}=1 \ {{ distro }}-verify-rc - {% if arrow.is_default_branch() %} + {% if arrow.is_default_branch() and distro != "conda" %} {{ macros.github_login_dockerhub()|indent }} - name: Push Docker Image shell: bash diff --git a/docker-compose.yml b/docker-compose.yml index d771fc2d22a35..a1d8f60a268d8 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -131,7 +131,8 @@ x-hierarchy: - debian-cpp: - debian-c-glib: - debian-ruby - - debian-python + - debian-python: + - debian-docs - debian-go: - debian-go-cgo - debian-go-cgo-python @@ -145,8 +146,7 @@ x-hierarchy: - ubuntu-c-glib: - ubuntu-ruby - ubuntu-lint - - ubuntu-python: - - ubuntu-docs + - ubuntu-python - ubuntu-python-sdist-test - ubuntu-r - ubuntu-r-only-r @@ -162,7 +162,7 @@ x-hierarchy: - ubuntu-r-valgrind - ubuntu-swift - ubuntu-verify-rc - - fedora-r-clang-sanitizer + - r-clang-sanitizer - r - r-revdepcheck # helper services @@ -1228,6 +1228,8 @@ services: # We should extend the list of enabled rules after adding this build to # the CI pipeline. image: ${REPO}:${ARCH}-conda-python-${PYTHON}-pandas-${PANDAS} + cap_add: + - SYS_ADMIN environment: <<: [*common, *ccache] ARROW_SUBSTRAIT: "ON" @@ -1378,7 +1380,7 @@ services: /arrow/ci/scripts/python_build.sh /arrow /build && /arrow/ci/scripts/java_jni_build.sh /arrow $${ARROW_HOME} /build /tmp/dist/java/ && /arrow/ci/scripts/java_build.sh /arrow /build /tmp/dist/java && - /arrow/ci/scripts/java_cdata_integration.sh /arrow /tmp/dist/java" ] + /arrow/ci/scripts/java_cdata_integration.sh /arrow /build" ] conda-python-cython2: # Usage: @@ -1470,7 +1472,6 @@ services: args: base: ${R_ORG}/${R_IMAGE}:${R_TAG} r_dev: ${ARROW_R_DEV} - devtoolset_version: ${DEVTOOLSET_VERSION} tz: ${TZ} r_prune_deps: ${R_PRUNE_DEPS} r_custom_ccache: ${R_CUSTOM_CCACHE} @@ -1482,7 +1483,6 @@ services: ARROW_R_DEV: ${ARROW_R_DEV} # To test for CRAN release, delete ^^ these two env vars so we download the Apache release ARROW_USE_PKG_CONFIG: "false" - devtoolset_version: ${DEVTOOLSET_VERSION} volumes: - .:/arrow:delegated command: > @@ -1515,19 +1515,17 @@ services: /bin/bash -c " /arrow/ci/scripts/r_sanitize.sh /arrow" - fedora-r-clang-sanitizer: - image: ${REPO}:r-rhub-fedora-clang-devel-latest + r-clang-sanitizer: + image: ${REPO}:r-rhub-clang-devel-latest build: context: . 
dockerfile: ci/docker/linux-r.dockerfile cache_from: - - ${REPO}:r-rhub-fedora-clang-devel-latest + - ${REPO}:r-rhub-clang-devel-latest args: - # TODO: change this to rhub/clang-asan - base: rhub/fedora-clang-devel-san + base: rhub/clang-asan r_dev: ${ARROW_R_DEV} - devtoolset_version: ${DEVTOOLSET_VERSION} - r_bin: RDsan + r_bin: R tz: ${TZ} r_prune_deps: ${R_PRUNE_DEPS} shm_size: *shm-size @@ -1680,7 +1678,7 @@ services: command: &js-command > /bin/bash -c " /arrow/ci/scripts/js_build.sh /arrow /build && - /arrow/ci/scripts/js_test.sh /arrow" + /arrow/ci/scripts/js_test.sh /arrow /build" #################################### C# ##################################### @@ -1749,9 +1747,11 @@ services: volumes: *conda-volumes environment: <<: [*common, *ccache] + ARCHERY_INTEGRATION_WITH_NANOARROW: 0 ARCHERY_INTEGRATION_WITH_RUST: 0 # Tell Archery where Arrow binaries are located ARROW_CPP_EXE_PATH: /build/cpp/debug + ARROW_NANOARROW_PATH: /build/nanoarrow ARROW_RUST_EXE_PATH: /build/rust/debug command: ["/arrow/ci/scripts/integration_arrow_build.sh /arrow /build && @@ -1759,29 +1759,34 @@ services: ################################ Docs ####################################### - ubuntu-docs: + debian-docs: # Usage: - # docker-compose build ubuntu-cpp - # docker-compose build ubuntu-python - # docker-compose build ubuntu-docs - # docker-compose run --rm ubuntu-docs - image: ${REPO}:${ARCH}-ubuntu-${UBUNTU}-docs + # docker-compose build debian-cpp + # docker-compose build debian-python + # docker-compose build debian-docs + # docker-compose run --rm debian-docs + image: ${REPO}:${ARCH}-debian-${DEBIAN}-docs build: context: . dockerfile: ci/docker/linux-apt-docs.dockerfile cache_from: - - ${REPO}:${ARCH}-ubuntu-${UBUNTU}-docs + - ${REPO}:${ARCH}-debian-${DEBIAN}-docs args: r: ${R} jdk: ${JDK} maven: ${MAVEN} node: ${NODE} - base: ${REPO}:${ARCH}-ubuntu-${UBUNTU}-python-3 + base: ${REPO}:${ARCH}-debian-${DEBIAN}-python-3 + # This is for Chromium used by Mermaid. Chromium uses namespace + # isolation for security by default. 
+ cap_add: + - SYS_ADMIN environment: <<: [*common, *ccache] ARROW_CUDA: "ON" ARROW_CXX_FLAGS_DEBUG: "-g1" ARROW_C_FLAGS_DEBUG: "-g1" + ARROW_HOME: "/tmp/local" ARROW_JAVA_SKIP_GIT_PLUGIN: ARROW_SUBSTRAIT: "ON" BUILD_DOCS_C_GLIB: "ON" @@ -1790,9 +1795,11 @@ services: BUILD_DOCS_JS: "ON" BUILD_DOCS_PYTHON: "ON" BUILD_DOCS_R: "ON" - volumes: *ubuntu-volumes - command: &docs-command > + volumes: *debian-volumes + command: > /bin/bash -c " + sudo mkdir -p /build /ccache && + sudo chown -R `id --user --name`: /build /ccache && /arrow/ci/scripts/cpp_build.sh /arrow /build && /arrow/ci/scripts/python_build.sh /arrow /build && /arrow/ci/scripts/c_glib_build.sh /arrow /build && diff --git a/docs/requirements.txt b/docs/requirements.txt index 252344a74a58f..afb252e17457b 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -8,7 +8,9 @@ myst-parser[linkify] numpydoc pydata-sphinx-theme~=0.14 sphinx-autobuild -sphinx-design sphinx-copybutton +sphinx-design +sphinx-lint +sphinxcontrib-mermaid sphinx==6.2 pandas diff --git a/docs/source/_static/versions.json b/docs/source/_static/versions.json index f8ff19095b3fd..e879fc69138d0 100644 --- a/docs/source/_static/versions.json +++ b/docs/source/_static/versions.json @@ -5,11 +5,16 @@ "url": "https://arrow.apache.org/docs/dev/" }, { - "name": "16.0 (stable)", + "name": "16.1 (stable)", "version": "", "url": "https://arrow.apache.org/docs/", "preferred": true }, + { + "name": "16.0", + "version": "16.0/", + "url": "https://arrow.apache.org/docs/16.0/" + }, { "name": "15.0", "version": "15.0/", diff --git a/docs/source/conf.py b/docs/source/conf.py index 05340dc923c89..1e6c113e33188 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -125,6 +125,7 @@ 'sphinx.ext.intersphinx', 'sphinx.ext.mathjax', 'sphinx.ext.viewcode', + 'sphinxcontrib.mermaid', ] # Show members for classes in .. autosummary @@ -137,7 +138,9 @@ } # Breathe configuration -breathe_projects = {"arrow_cpp": "../../cpp/apidoc/xml"} +breathe_projects = { + "arrow_cpp": os.environ.get("ARROW_CPP_DOXYGEN_XML", "../../cpp/apidoc/xml"), +} breathe_default_project = "arrow_cpp" # Overridden conditionally below @@ -532,7 +535,7 @@ # # latex_appendices = [] -# It false, will not define \strong, \code, itleref, \crossref ... but only +# It false, will not define \strong, \code, \titleref, \crossref ... but only # \sphinxstrong, ..., \sphinxtitleref, ... To help avoid clash with user added # packages. # @@ -584,6 +587,9 @@ # # texinfo_no_detailmenu = False +# -- Options for mermaid output ------------------------------------------- + +mermaid_output_format = 'svg' def setup(app): # Use a config value to indicate whether CUDA API docs can be generated. diff --git a/docs/source/cpp/acero/developer_guide.rst b/docs/source/cpp/acero/developer_guide.rst index 331cd833b58af..7dd08fe3ce2ce 100644 --- a/docs/source/cpp/acero/developer_guide.rst +++ b/docs/source/cpp/acero/developer_guide.rst @@ -187,7 +187,7 @@ Examples task (described below) as completed which allows the plan to finish. * The ``fetch`` node, in ``InputReceived``, may decide that it has all the data it needs. It can then call ``StopProducing`` on its input. - + Initialization / Construction / Destruction ------------------------------------------- @@ -271,7 +271,7 @@ distributed systems. Once that has been done then it should be possible to do a meaning exchanging between multiple exec plan instances on a single system) if desired. .. 
figure:: dist_plan.svg - + A distributed plan can provide parallelism even if the plans themselves run serially Pipeline Parallelism @@ -327,8 +327,8 @@ An engine could choose to create a thread task for every execution of a node. H this leads to problems with cache locality. For example, let's assume we have a basic plan consisting of three exec nodes, scan, project, and then filter (this is a very common use case). Now let's assume there are 100 batches. In a task-per-operator model we would have tasks like "Scan Batch 5", "Project Batch 5", and "Filter Batch 5". Each -of those tasks is potentially going to access the same data. For example, maybe the `project` and `filter` nodes need -to read the same column. A column which is intially created in a decode phase of the `scan` node. To maximize cache +of those tasks is potentially going to access the same data. For example, maybe the ``project`` and ``filter`` nodes need +to read the same column. A column which is intially created in a decode phase of the ``scan`` node. To maximize cache utilization we would need to carefully schedule our tasks to ensure that all three of those tasks are run consecutively and assigned to the same CPU core. @@ -412,7 +412,7 @@ Ordered Execution ================= Some nodes either establish an ordering to their outgoing batches or they need to be able to process batches in order. -Acero handles ordering using the `batch_index` property on an ExecBatch. If a node has a deterministic output order +Acero handles ordering using the ``batch_index`` property on an ExecBatch. If a node has a deterministic output order then it should apply a batch index on batches that it emits. For example, the OrderByNode applies a new ordering to batches (regardless of the incoming ordering). The scan node is able to attach an implicit ordering to batches which reflects the order of the rows in the files being scanned. @@ -461,8 +461,8 @@ Acero's tracing is currently half-implemented and there are major gaps in profil effort at tracing with open telemetry and most of the necessary pieces are in place. The main thing currently lacking is some kind of effective visualization of the tracing results. -In order to use the tracing that is present today you will need to build with Arrow with `ARROW_WITH_OPENTELEMETRY=ON`. -Then you will need to set the environment variable `ARROW_TRACING_BACKEND=otlp_http`. This will configure open telemetry +In order to use the tracing that is present today you will need to build with Arrow with ``ARROW_WITH_OPENTELEMETRY=ON``. +Then you will need to set the environment variable ``ARROW_TRACING_BACKEND=otlp_http``. This will configure open telemetry to export trace results (as OTLP) to the HTTP endpoint http://localhost:4318/v1/traces. You will need to configure an open telemetry collector to collect results on that endpoint and you will need to configure a trace viewer of some kind such as Jaeger: https://www.jaegertracing.io/docs/1.21/opentelemetry/ @@ -472,7 +472,7 @@ Benchmarking The most complete macro benchmarking for Acero is provided by https://github.com/voltrondata-labs/arrowbench These include a set of TPC-H benchmarks, executed from the R-dplyr integration, which are run on every Arrow commit and -reported to Conbench at https://conbench.ursa.dev/ +reported to Conbench at https://conbench.ursa.dev/ In addition to these TPC-H benchmarks there are a number of micro-benchmarks for various nodes (hash-join, asof-join, etc.) 
Finally, the compute functions themselves should mostly have micro-benchmarks. For more on micro benchmarks you diff --git a/docs/source/cpp/acero/overview.rst b/docs/source/cpp/acero/overview.rst index c569f82b099b6..34e0b143bc2ce 100644 --- a/docs/source/cpp/acero/overview.rst +++ b/docs/source/cpp/acero/overview.rst @@ -206,19 +206,19 @@ is very similar to a RecordBatch. It can have zero or more columns and all of t must have the same length. There are a few key differences from ExecBatch: .. figure:: rb_vs_eb.svg - + Both the record batch and the exec batch have strong ownership of the arrays & buffers -* An `ExecBatch` does not have a schema. This is because an `ExecBatch` is assumed to be +* An ``ExecBatch`` does not have a schema. This is because an ``ExecBatch`` is assumed to be part of a stream of batches and the stream is assumed to have a consistent schema. So - the schema for an `ExecBatch` is typically stored in the ExecNode. -* Columns in an `ExecBatch` are either an `Array` or a `Scalar`. When a column is a `Scalar` - this means that the column has a single value for every row in the batch. An `ExecBatch` + the schema for an ``ExecBatch`` is typically stored in the ExecNode. +* Columns in an ``ExecBatch`` are either an ``Array`` or a ``Scalar``. When a column is a ``Scalar`` + this means that the column has a single value for every row in the batch. An ``ExecBatch`` also has a length property which describes how many rows are in a batch. So another way to - view a `Scalar` is a constant array with `length` elements. -* An `ExecBatch` contains additional information used by the exec plan. For example, an - `index` can be used to describe a batch's position in an ordered stream. We expect - that `ExecBatch` will also evolve to contain additional fields such as a selection vector. + view a ``Scalar`` is a constant array with ``length`` elements. +* An ``ExecBatch`` contains additional information used by the exec plan. For example, an + ``index`` can be used to describe a batch's position in an ordered stream. We expect + that ``ExecBatch`` will also evolve to contain additional fields such as a selection vector. .. figure:: scalar_vs_array.svg @@ -231,8 +231,8 @@ only zero copy if there are no scalars in the exec batch. .. note:: Both Acero and the compute module have "lightweight" versions of batches and arrays. - In the compute module these are called `BatchSpan`, `ArraySpan`, and `BufferSpan`. In - Acero the concept is called `KeyColumnArray`. These types were developed concurrently + In the compute module these are called ``BatchSpan``, ``ArraySpan``, and ``BufferSpan``. In + Acero the concept is called ``KeyColumnArray``. These types were developed concurrently and serve the same purpose. They aim to provide an array container that can be completely stack allocated (provided the data type is non-nested) in order to avoid heap allocation overhead. Ideally these two concepts will be merged someday. @@ -247,9 +247,9 @@ execution of the nodes. Both ExecPlan and ExecNode are tied to the lifecycle of They have state and are not expected to be restartable. .. warning:: - The structures within Acero, including `ExecBatch`, are still experimental. The `ExecBatch` - class should not be used outside of Acero. Instead, an `ExecBatch` should be converted to - a more standard structure such as a `RecordBatch`. + The structures within Acero, including ``ExecBatch``, are still experimental. The ``ExecBatch`` + class should not be used outside of Acero. 
Instead, an ``ExecBatch`` should be converted to + a more standard structure such as a ``RecordBatch``. Similarly, an ExecPlan is an internal concept. Users creating plans should be using Declaration objects. APIs for consuming and executing plans should abstract away the details of the underlying @@ -266,5 +266,5 @@ various query representations (e.g. Substrait). The Declaration objects are the with the DeclarationToXyz methods, are the current public API for Acero. .. figure:: decl_vs_ep.svg - - A declaration is a blueprint that is used to instantiate exec plan instances \ No newline at end of file + + A declaration is a blueprint that is used to instantiate exec plan instances diff --git a/docs/source/cpp/acero/substrait.rst b/docs/source/cpp/acero/substrait.rst index 797b2407f93cd..a5532733627c1 100644 --- a/docs/source/cpp/acero/substrait.rst +++ b/docs/source/cpp/acero/substrait.rst @@ -111,7 +111,7 @@ Aggregate Relations * Each measure's arguments must be direct references. * A measure may not have a filter * A measure may not have sorts -* A measure's invocation must be AGGREGATION_INVOCATION_ALL or +* A measure's invocation must be AGGREGATION_INVOCATION_ALL or AGGREGATION_INVOCATION_UNSPECIFIED * A measure's phase must be AGGREGATION_PHASE_INITIAL_TO_RESULT @@ -146,73 +146,73 @@ Types - Caveat * - boolean - boolean - - + - * - i8 - int8 - - + - * - i16 - int16 - - + - * - i32 - int32 - - + - * - i64 - int64 - - + - * - fp32 - float32 - - + - * - fp64 - float64 - - + - * - string - string - - + - * - binary - binary - - + - * - timestamp - timestamp - - + - * - timestamp_tz - timestamp - - + - * - date - date32 - - + - * - time - time64 - - + - * - interval_year - - + - - Not currently supported * - interval_day - - + - - Not currently supported * - uuid - - + - - Not currently supported * - FIXEDCHAR - - + - - Not currently supported * - VARCHAR - - + - - Not currently supported * - FIXEDBINARY - fixed_size_binary - - + - * - DECIMAL - decimal128 - - + - * - STRUCT - struct - Arrow struct fields will have no name (empty string) * - NSTRUCT - - + - - Not currently supported * - LIST - list - - + - * - MAP - map - K must not be nullable diff --git a/docs/source/cpp/acero/user_guide.rst b/docs/source/cpp/acero/user_guide.rst index eca1a0104708b..0271be2180e99 100644 --- a/docs/source/cpp/acero/user_guide.rst +++ b/docs/source/cpp/acero/user_guide.rst @@ -32,14 +32,14 @@ Using Acero The basic workflow for Acero is this: #. First, create a graph of :class:`Declaration` objects describing the plan - + #. Call one of the DeclarationToXyz methods to execute the Declaration. a. A new ExecPlan is created from the graph of Declarations. Each Declaration will correspond to one ExecNode in the plan. In addition, a sink node will be added, depending on which DeclarationToXyz method was used. - b. The ExecPlan is executed. Typically this happens as part of the DeclarationToXyz call but in + b. The ExecPlan is executed. Typically this happens as part of the DeclarationToXyz call but in DeclarationToReader the reader is returned before the plan is finished executing. c. Once the plan is finished it is destroyed @@ -315,7 +315,7 @@ of a specific execution node. ``source`` ---------- -A ``source`` operation can be considered as an entry point to create a streaming execution plan. +A ``source`` operation can be considered as an entry point to create a streaming execution plan. :class:`SourceNodeOptions` are used to create the ``source`` operation. 
The ``source`` operation is the most generic and flexible type of source currently available but it can be quite tricky to configure. First you should review the other source node types to ensure there @@ -326,7 +326,7 @@ function should take no arguments and should return an ``arrow::Future>``. This function might be reading a file, iterating through an in memory structure, or receiving data from a network connection. The arrow library refers to these functions as ``arrow::AsyncGenerator`` -and there are a number of utilities for working with these functions. For this example we use +and there are a number of utilities for working with these functions. For this example we use a vector of record batches that we've already stored in memory. In addition, the schema of the data must be known up front. Acero must know the schema of the data at each stage of the execution graph before any processing has begun. This means we must supply the @@ -368,10 +368,10 @@ Example of using ``source`` (usage of sink is explained in detail in :ref:`sink< In the previous example, :ref:`source node `, a source node was used to input the data. But when developing an application, if the data is already in memory as a table, it is much easier, and more performant to use :class:`TableSourceNodeOptions`. -Here the input data can be passed as a ``std::shared_ptr`` along with a ``max_batch_size``. +Here the input data can be passed as a ``std::shared_ptr`` along with a ``max_batch_size``. The ``max_batch_size`` is to break up large record batches so that they can be processed in parallel. It is important to note that the table batches will not get merged to form larger batches when the source -table has a smaller batch size. +table has a smaller batch size. Example of using ``table_source`` @@ -387,7 +387,7 @@ Example of using ``table_source`` ``filter`` ---------- -``filter`` operation, as the name suggests, provides an option to define data filtering +``filter`` operation, as the name suggests, provides an option to define data filtering criteria. It selects rows where the given expression evaluates to true. Filters can be written using :class:`arrow::compute::Expression`, and the expression should have a return type of boolean. For example, if we wish to keep rows where the value @@ -415,7 +415,7 @@ functions, i.e. elementwise functions that return one value for each input row independent of the value of all other rows). This is exposed via :class:`ProjectNodeOptions` which requires, an :class:`arrow::compute::Expression` and name for each of the output columns (if names are not -provided, the string representations of exprs will be used). +provided, the string representations of exprs will be used). Project example: @@ -455,8 +455,8 @@ can be selected from :ref:`this list of aggregation functions will be added which should alleviate this constraint. The aggregation can provide results as a group or scalar. For instances, -an operation like `hash_count` provides the counts per each unique record -as a grouped result while an operation like `sum` provides a single record. +an operation like ``hash_count`` provides the counts per each unique record +as a grouped result while an operation like ``sum`` provides a single record. Scalar Aggregation example: @@ -481,16 +481,16 @@ Group Aggregation example: ``sink`` -------- -``sink`` operation provides output and is the final node of a streaming -execution definition. 
:class:`SinkNodeOptions` interface is used to pass +``sink`` operation provides output and is the final node of a streaming +execution definition. :class:`SinkNodeOptions` interface is used to pass the required options. Similar to the source operator the sink operator exposes the output with a function that returns a record batch future each time it is called. It is expected the caller will repeatedly call this function until the generator function is exhausted (returns ``std::optional::nullopt``). If this function is not called often enough then record batches will accumulate in memory. An execution plan should only have one -"terminal" node (one sink node). An :class:`ExecPlan` can terminate early due to cancellation or +"terminal" node (one sink node). An :class:`ExecPlan` can terminate early due to cancellation or an error, before the output is fully consumed. However, the plan can be safely destroyed independently -of the sink, which will hold the unconsumed batches by `exec_plan->finished()`. +of the sink, which will hold the unconsumed batches by ``exec_plan->finished()``. As a part of the Source Example, the Sink operation is also included; @@ -515,7 +515,7 @@ The consuming function may be called before a previous invocation has completed. function does not run quickly enough then many concurrent executions could pile up, blocking the CPU thread pool. The execution plan will not be marked finished until all consuming function callbacks have been completed. -Once all batches have been delivered the execution plan will wait for the `finish` future to complete +Once all batches have been delivered the execution plan will wait for the ``finish`` future to complete before marking the execution plan finished. This allows for workflows where the consumption function converts batches into async tasks (this is currently done internally for the dataset write node). @@ -526,12 +526,12 @@ Example:: arrow::Future<> finish = arrow::Future<>::Make(); struct CustomSinkNodeConsumer : public cp::SinkNodeConsumer { - CustomSinkNodeConsumer(std::atomic *batches_seen, arrow::Future<>finish): + CustomSinkNodeConsumer(std::atomic *batches_seen, arrow::Future<>finish): batches_seen(batches_seen), finish(std::move(finish)) {} // Consumption logic can be written here arrow::Status Consume(cp::ExecBatch batch) override { // data can be consumed in the expected way - // transfer to another system or just do some work + // transfer to another system or just do some work // and write to disk (*batches_seen)++; return arrow::Status::OK(); @@ -541,9 +541,9 @@ Example:: std::atomic *batches_seen; arrow::Future<> finish; - + }; - + std::shared_ptr consumer = std::make_shared(&batches_seen, finish); @@ -567,14 +567,14 @@ Consuming-Sink example: ``order_by_sink`` ----------------- -``order_by_sink`` operation is an extension to the ``sink`` operation. -This operation provides the ability to guarantee the ordering of the -stream by providing the :class:`OrderBySinkNodeOptions`. -Here the :class:`arrow::compute::SortOptions` are provided to define which columns +``order_by_sink`` operation is an extension to the ``sink`` operation. +This operation provides the ability to guarantee the ordering of the +stream by providing the :class:`OrderBySinkNodeOptions`. +Here the :class:`arrow::compute::SortOptions` are provided to define which columns are used for sorting and whether to sort by ascending or descending values. .. note:: This node is a "pipeline breaker" and will fully materialize the dataset in memory. 
- In the future, spillover mechanisms will be added which should alleviate this + In the future, spillover mechanisms will be added which should alleviate this constraint. @@ -593,14 +593,14 @@ Order-By-Sink example: ``select_k_sink`` ----------------- -``select_k_sink`` option enables selecting the top/bottom K elements, -similar to a SQL ``ORDER BY ... LIMIT K`` clause. -:class:`SelectKOptions` which is a defined by -using :struct:`OrderBySinkNode` definition. This option returns a sink node that receives +``select_k_sink`` option enables selecting the top/bottom K elements, +similar to a SQL ``ORDER BY ... LIMIT K`` clause. +:class:`SelectKOptions` which is a defined by +using :struct:`OrderBySinkNode` definition. This option returns a sink node that receives inputs and then compute top_k/bottom_k. .. note:: This node is a "pipeline breaker" and will fully materialize the input in memory. - In the future, spillover mechanisms will be added which should alleviate this + In the future, spillover mechanisms will be added which should alleviate this constraint. SelectK example: @@ -617,7 +617,7 @@ SelectK example: .. _stream_execution_table_sink_docs: -The ``table_sink`` node provides the ability to receive the output as an in-memory table. +The ``table_sink`` node provides the ability to receive the output as an in-memory table. This is simpler to use than the other sink nodes provided by the streaming execution engine but it only makes sense when the output fits comfortably in memory. The node is created using :class:`TableSinkNodeOptions`. @@ -637,7 +637,7 @@ Example of using ``table_sink`` --------- ``scan`` is an operation used to load and process datasets. It should be preferred over the -more generic ``source`` node when your input is a dataset. The behavior is defined using +more generic ``source`` node when your input is a dataset. The behavior is defined using :class:`arrow::dataset::ScanNodeOptions`. More information on datasets and the various scan options can be found in :doc:`../dataset`. @@ -683,10 +683,10 @@ Write example: ``union`` ------------- -``union`` merges multiple data streams with the same schema into one, similar to +``union`` merges multiple data streams with the same schema into one, similar to a SQL ``UNION ALL`` clause. -The following example demonstrates how this can be achieved using +The following example demonstrates how this can be achieved using two data sources. Union example: @@ -704,15 +704,15 @@ Union example: ------------- ``hash_join`` operation provides the relational algebra operation, join using hash-based -algorithm. :class:`HashJoinNodeOptions` contains the options required in -defining a join. The hash_join supports +algorithm. :class:`HashJoinNodeOptions` contains the options required in +defining a join. The hash_join supports `left/right/full semi/anti/outerjoins -`_. +`_. Also the join-key (i.e. the column(s) to join on), and suffixes (i.e a suffix term like "_x" -which can be appended as a suffix for column names duplicated in both left and right -relations.) can be set via the join options. +which can be appended as a suffix for column names duplicated in both left and right +relations.) can be set via the join options. `Read more on hash-joins -`_. +`_. Hash-Join example: @@ -726,7 +726,7 @@ Hash-Join example: Summary ======= -There are examples of these nodes which can be found in +There are examples of these nodes which can be found in ``cpp/examples/arrow/execution_plan_documentation_examples.cc`` in the Arrow source. 
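
The node vocabulary discussed in these user guide hunks (``table_source``, ``filter``, ``project``, ``hash_join``, ...) is also exposed through the ``pyarrow.acero`` Declaration bindings, so a quick way to experiment with such plans, assuming a recent pyarrow release, is a sketch like::

    import pyarrow as pa
    import pyarrow.compute as pc
    import pyarrow.acero as acero

    table = pa.table({"a": [1, 2, 3], "b": ["x", "y", "z"]})

    plan = acero.Declaration.from_sequence([
        # table_source: feed an in-memory table into the plan
        acero.Declaration("table_source", acero.TableSourceNodeOptions(table)),
        # filter: keep rows where a > 1
        acero.Declaration("filter", acero.FilterNodeOptions(pc.field("a") > 1)),
        # project: select/compute the output columns
        acero.Declaration("project",
                          acero.ProjectNodeOptions([pc.field("b")], names=["b"])),
    ])
    print(plan.to_table())
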
Complete Example: diff --git a/docs/source/cpp/api/filesystem.rst b/docs/source/cpp/api/filesystem.rst index 02b12668327f2..599e9fedb60f9 100644 --- a/docs/source/cpp/api/filesystem.rst +++ b/docs/source/cpp/api/filesystem.rst @@ -97,3 +97,12 @@ Google Cloud Storage filesystem .. doxygenclass:: arrow::fs::GcsFileSystem :members: + +Azure filesystem +---------------- + +.. doxygenstruct:: arrow::fs::AzureOptions + :members: + +.. doxygenclass:: arrow::fs::AzureFileSystem + :members: diff --git a/docs/source/cpp/api/flightsql.rst b/docs/source/cpp/api/flightsql.rst index 565b605108d9f..0f49a76f20687 100644 --- a/docs/source/cpp/api/flightsql.rst +++ b/docs/source/cpp/api/flightsql.rst @@ -22,8 +22,6 @@ Arrow Flight SQL ================ -.. note:: Flight SQL is currently experimental and APIs are subject to change. - Common Types ============ diff --git a/docs/source/cpp/api/scalar.rst b/docs/source/cpp/api/scalar.rst index 04e78450d7744..be9f9686bf110 100644 --- a/docs/source/cpp/api/scalar.rst +++ b/docs/source/cpp/api/scalar.rst @@ -44,4 +44,4 @@ Utilities .. doxygenclass:: arrow::ScalarVisitor :project: arrow_cpp :members: - :undoc-members: \ No newline at end of file + :undoc-members: diff --git a/docs/source/cpp/build_system.rst b/docs/source/cpp/build_system.rst index 60df117eb510e..e80bca4c949dc 100644 --- a/docs/source/cpp/build_system.rst +++ b/docs/source/cpp/build_system.rst @@ -47,7 +47,7 @@ file into an executable linked with the Arrow C++ shared library: .. code-block:: cmake cmake_minimum_required(VERSION 3.16) - + project(MyExample) find_package(Arrow REQUIRED) @@ -167,7 +167,7 @@ file into an executable linked with the Arrow C++ shared library: .. code-block:: makefile my_example: my_example.cc - $(CXX) -o $@ $(CXXFLAGS) $< $$(pkg-config --cflags --libs arrow) + $(CXX) -o $@ $(CXXFLAGS) $< $$(pkg-config --cflags --libs arrow) Many build systems support pkg-config. For example: diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index e7310d2c0c711..701c7d573ac0e 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -49,8 +49,8 @@ Computation inputs are represented as a general :class:`Datum` class, which is a tagged union of several shapes of data such as :class:`Scalar`, :class:`Array` and :class:`ChunkedArray`. Many compute functions support both array (chunked or not) and scalar inputs, however some will mandate -particular input types. For example, while ``array_sort_indices`` requires its -first and only input to be an array, the generalized ``sort_indices`` +particular input types. For example, while ``array_sort_indices`` requires its +first and only input to be an array, the generalized ``sort_indices`` function accepts an array, chunked array, record batch or table. .. _invoking-compute-functions: @@ -514,8 +514,8 @@ Mixed time resolution temporal inputs will be cast to finest input resolution. +------------+---------------------------------------------+ It's compatible with Redshift's decimal promotion rules. All decimal digits - are preserved for `add`, `subtract` and `multiply` operations. The result - precision of `divide` is at least the sum of precisions of both operands with + are preserved for ``add``, ``subtract`` and ``multiply`` operations. The result + precision of ``divide`` is at least the sum of precisions of both operands with enough scale kept. Error is returned if the result precision is beyond the decimal value range. @@ -572,28 +572,28 @@ representation based on the rounding criterion. 
| trunc | Unary | Numeric | Float32/Float64/Decimal | | | +-------------------+------------+-------------+-------------------------+----------------------------------+--------+ -* \(1) By default rounding functions change a value to the nearest - integer using HALF_TO_EVEN to resolve ties. Options are available to control - the rounding criterion. All ``round`` functions have the +* \(1) By default rounding functions change a value to the nearest + integer using HALF_TO_EVEN to resolve ties. Options are available to control + the rounding criterion. All ``round`` functions have the ``round_mode`` option to set the rounding mode. * \(2) Round to a number of digits where the ``ndigits`` option of :struct:`RoundOptions` specifies the rounding precision in terms of number of digits. A negative value corresponds to digits in the non-fractional part. For example, -2 corresponds to rounding to the nearest multiple of 100 (zeroing the ones and tens digits). Default value of ``ndigits`` is 0 - which rounds to the nearest integer. For integer inputs a non-negative + which rounds to the nearest integer. For integer inputs a non-negative ``ndigits`` value is ignored and the input is returned unchanged. For integer - inputs, if ``-ndigits`` is larger than the maximum number of digits the + inputs, if ``-ndigits`` is larger than the maximum number of digits the input type can hold, an error is returned. * \(3) Round to a multiple where the ``multiple`` option of :struct:`RoundToMultipleOptions` specifies the rounding scale. The rounding - multiple has to be a positive value and can be casted to input type. - For example, 100 corresponds to rounding to the nearest multiple of 100 - (zeroing the ones and tens digits). Default value of ``multiple`` is 1 which + multiple has to be a positive value and can be casted to input type. + For example, 100 corresponds to rounding to the nearest multiple of 100 + (zeroing the ones and tens digits). Default value of ``multiple`` is 1 which rounds to the nearest integer. * \(4) Round the first input to multiple of the second input. The rounding - multiple has to be a positive value and can be casted to the first input type. - For example, 100 corresponds to rounding to the nearest multiple of 100 + multiple has to be a positive value and can be casted to the first input type. + For example, 100 corresponds to rounding to the nearest multiple of 100 (zeroing the ones and tens digits). For ``round`` functions, the following rounding modes are available. @@ -634,8 +634,8 @@ The example values are given for default values of ``ndigits`` and ``multiple``. | | | -3.5 -> -3, -4.5 -> -5 | +-----------------------+--------------------------------------------------------------+---------------------------+ -The following table gives examples of how ``ndigits`` (for the ``round`` -and ``round_binary`` functions) and ``multiple`` (for ``round_to_multiple``) +The following table gives examples of how ``ndigits`` (for the ``round`` +and ``round_binary`` functions) and ``multiple`` (for ``round_to_multiple``) influence the operation performed, respectively. +--------------------+-------------------+---------------------------+ @@ -1029,7 +1029,7 @@ These functions trim off characters on both sides (trim), or the left (ltrim) or +--------------------------+------------+-------------------------+---------------------+----------------------------------------+---------+ * \(1) Only characters specified in :member:`TrimOptions::characters` will be - trimmed off. 
Both the input string and the `characters` argument are + trimmed off. Both the input string and the ``characters`` argument are interpreted as ASCII characters. * \(2) Only trim off ASCII whitespace characters (``'\t'``, ``'\n'``, ``'\v'``, @@ -1570,7 +1570,7 @@ is the same, even though the UTC years would be different. Timezone handling ~~~~~~~~~~~~~~~~~ -`assume_timezone` function is meant to be used when an external system produces +``assume_timezone`` function is meant to be used when an external system produces "timezone-naive" timestamps which need to be converted to "timezone-aware" timestamps (see for example the `definition `__ @@ -1581,11 +1581,11 @@ Input timestamps are assumed to be relative to the timezone given in UTC-relative timestamps with the timezone metadata set to the above value. An error is returned if the timestamps already have the timezone metadata set. -`local_timestamp` function converts UTC-relative timestamps to local "timezone-naive" +``local_timestamp`` function converts UTC-relative timestamps to local "timezone-naive" timestamps. The timezone is taken from the timezone metadata of the input -timestamps. This function is the inverse of `assume_timezone`. Please note: +timestamps. This function is the inverse of ``assume_timezone``. Please note: **all temporal functions already operate on timestamps as if they were in local -time of the metadata provided timezone**. Using `local_timestamp` is only meant to be +time of the metadata provided timezone**. Using ``local_timestamp`` is only meant to be used when an external system expects local timestamps. +-----------------+-------+-------------+---------------+---------------------------------+-------+ @@ -1621,12 +1621,12 @@ Array-wise ("vector") functions Cumulative Functions ~~~~~~~~~~~~~~~~~~~~ -Cumulative functions are vector functions that perform a running accumulation on -their input using a given binary associative operation with an identity element -(a monoid) and output an array containing the corresponding intermediate running -values. The input is expected to be of numeric type. By default these functions -do not detect overflow. They are also available in an overflow-checking variant, -suffixed ``_checked``, which returns an ``Invalid`` :class:`Status` when +Cumulative functions are vector functions that perform a running accumulation on +their input using a given binary associative operation with an identity element +(a monoid) and output an array containing the corresponding intermediate running +values. The input is expected to be of numeric type. By default these functions +do not detect overflow. They are also available in an overflow-checking variant, +suffixed ``_checked``, which returns an ``Invalid`` :class:`Status` when overflow is detected. +-------------------------+-------+-------------+-------------+--------------------------------+-----------+ @@ -1649,8 +1649,8 @@ overflow is detected. * \(1) CumulativeOptions has two optional parameters. The first parameter :member:`CumulativeOptions::start` is a starting value for the running - accumulation. It has a default value of 0 for `sum`, 1 for `prod`, min of - input type for `max`, and max of input type for `min`. Specified values of + accumulation. It has a default value of 0 for ``sum``, 1 for ``prod``, min of + input type for ``max``, and max of input type for ``min``. Specified values of ``start`` must be castable to the input type. The second parameter :member:`CumulativeOptions::skip_nulls` is a boolean. 
When set to false (the default), the first encountered null is propagated. When set to @@ -1861,9 +1861,9 @@ replaced, based on the remaining inputs. Pairwise functions ~~~~~~~~~~~~~~~~~~~~ -Pairwise functions are unary vector functions that perform a binary operation on +Pairwise functions are unary vector functions that perform a binary operation on a pair of elements in the input array, typically on adjacent elements. The n-th -output is computed by applying the binary operation to the n-th and (n-p)-th inputs, +output is computed by applying the binary operation to the n-th and (n-p)-th inputs, where p is the period. The default period is 1, in which case the binary operation is applied to adjacent pairs of inputs. The period can also be negative, in which case the n-th output is computed by applying the binary @@ -1877,9 +1877,9 @@ operation to the n-th and (n+abs(p))-th inputs. | pairwise_diff_checked | Unary | Numeric/Temporal | Numeric/Temporal | :struct:`PairwiseOptions` | \(1)(3) | +------------------------+-------+----------------------+----------------------+--------------------------------+----------+ -* \(1) Computes the first order difference of an array, It internally calls - the scalar function ``Subtract`` (or the checked variant) to compute - differences, so its behavior and supported types are the same as - ``Subtract``. The period can be specified in :struct:`PairwiseOptions`. +* \(1) Computes the first order difference of an array, It internally calls + the scalar function ``Subtract`` (or the checked variant) to compute + differences, so its behavior and supported types are the same as + ``Subtract``. The period can be specified in :struct:`PairwiseOptions`. * \(2) Wraps around the result when overflow is detected. * \(3) Returns an ``Invalid`` :class:`Status` when overflow is detected. diff --git a/docs/source/cpp/dataset.rst b/docs/source/cpp/dataset.rst index 1f5d0476c2889..a64b73b61c05d 100644 --- a/docs/source/cpp/dataset.rst +++ b/docs/source/cpp/dataset.rst @@ -378,28 +378,28 @@ Partitioning performance considerations Partitioning datasets has two aspects that affect performance: it increases the number of files and it creates a directory structure around the files. Both of these have benefits -as well as costs. Depending on the configuration and the size of your dataset, the costs -can outweigh the benefits. +as well as costs. Depending on the configuration and the size of your dataset, the costs +can outweigh the benefits. -Because partitions split up the dataset into multiple files, partitioned datasets can be -read and written with parallelism. However, each additional file adds a little overhead in -processing for filesystem interaction. It also increases the overall dataset size since +Because partitions split up the dataset into multiple files, partitioned datasets can be +read and written with parallelism. However, each additional file adds a little overhead in +processing for filesystem interaction. It also increases the overall dataset size since each file has some shared metadata. For example, each parquet file contains the schema and -group-level statistics. The number of partitions is a floor for the number of files. If -you partition a dataset by date with a year of data, you will have at least 365 files. If -you further partition by another dimension with 1,000 unique values, you will have up to +group-level statistics. The number of partitions is a floor for the number of files. 
If +you partition a dataset by date with a year of data, you will have at least 365 files. If +you further partition by another dimension with 1,000 unique values, you will have up to 365,000 files. This fine of partitioning often leads to small files that mostly consist of metadata. -Partitioned datasets create nested folder structures, and those allow us to prune which +Partitioned datasets create nested folder structures, and those allow us to prune which files are loaded in a scan. However, this adds overhead to discovering files in the dataset, as we'll need to recursively "list directory" to find the data files. Too fine partitions can cause problems here: Partitioning a dataset by date for a years worth -of data will require 365 list calls to find all the files; adding another column with +of data will require 365 list calls to find all the files; adding another column with cardinality 1,000 will make that 365,365 calls. The most optimal partitioning layout will depend on your data, access patterns, and which -systems will be reading the data. Most systems, including Arrow, should work across a +systems will be reading the data. Most systems, including Arrow, should work across a range of file sizes and partitioning layouts, but there are extremes you should avoid. These guidelines can help avoid some known worst cases: diff --git a/docs/source/cpp/datatypes.rst b/docs/source/cpp/datatypes.rst index 4e1fe76b4d6f2..7eb70936f4e1d 100644 --- a/docs/source/cpp/datatypes.rst +++ b/docs/source/cpp/datatypes.rst @@ -72,8 +72,8 @@ To instantiate data types, it is recommended to call the provided Type Traits ----------- -Writing code that can handle concrete :class:`arrow::DataType` subclasses would -be verbose, if it weren't for type traits. Arrow's type traits map the Arrow +Writing code that can handle concrete :class:`arrow::DataType` subclasses would +be verbose, if it weren't for type traits. Arrow's type traits map the Arrow data types to the specialized array, scalar, builder, and other associated types. For example, the Boolean type has traits: @@ -96,7 +96,7 @@ For example, the Boolean type has traits: See the :ref:`type-traits` for an explanation of each of these fields. Using type traits, one can write template functions that can handle a variety -of Arrow types. For example, to write a function that creates an array of +of Arrow types. For example, to write a function that creates an array of Fibonacci values for any Arrow numeric type: .. code-block:: cpp @@ -128,7 +128,7 @@ For some common cases, there are type associations on the classes themselves. Us Similar to the type traits provided in `std::type_traits `_, -Arrow provides type predicates such as ``is_number_type`` as well as +Arrow provides type predicates such as ``is_number_type`` as well as corresponding templates that wrap ``std::enable_if_t`` such as ``enable_if_number``. These can constrain template functions to only compile for relevant types, which is useful if other overloads need to be implemented. 
For example, to write a sum @@ -176,20 +176,20 @@ here is how one might sum across columns of arbitrary numeric types: class TableSummation { double partial = 0.0; public: - + arrow::Result Compute(std::shared_ptr batch) { for (std::shared_ptr array : batch->columns()) { ARROW_RETURN_NOT_OK(arrow::VisitArrayInline(*array, this)); } return partial; } - + // Default implementation arrow::Status Visit(const arrow::Array& array) { return arrow::Status::NotImplemented("Cannot compute sum for array of type ", array.type()->ToString()); } - + template arrow::enable_if_number Visit(const ArrayType& array) { for (std::optional value : array) { diff --git a/docs/source/cpp/env_vars.rst b/docs/source/cpp/env_vars.rst index 116c151824c75..0a082b0a5d859 100644 --- a/docs/source/cpp/env_vars.rst +++ b/docs/source/cpp/env_vars.rst @@ -181,6 +181,10 @@ that changing their value later will have an effect. The number of entries to keep in the Gandiva JIT compilation cache. The cache is in-memory and does not persist across processes. + The default cache size is 5000. The value of this environment variable + should be a positive integer and should not exceed the maximum value + of int32. Otherwise the default value is used. + .. envvar:: HADOOP_HOME The path to the Hadoop installation. diff --git a/docs/source/cpp/examples/compute_and_write_example.rst b/docs/source/cpp/examples/compute_and_write_example.rst index e66d3ced55d0c..a4b619f7ffff3 100644 --- a/docs/source/cpp/examples/compute_and_write_example.rst +++ b/docs/source/cpp/examples/compute_and_write_example.rst @@ -21,8 +21,8 @@ Compute and Write CSV Example ============================= -The file ``cpp/examples/arrow/compute_and_write_csv_example.cc`` located inside -the source tree contains an example of creating a table of two numerical columns -and then comparing the magnitudes of the entries in the columns and writing out to +The file ``cpp/examples/arrow/compute_and_write_csv_example.cc`` located inside +the source tree contains an example of creating a table of two numerical columns +and then comparing the magnitudes of the entries in the columns and writing out to a CSV file with the column entries and their comparisons. The code in the example is documented. diff --git a/docs/source/cpp/flight.rst b/docs/source/cpp/flight.rst index e07a84e91ee4f..a1e9420bfd34e 100644 --- a/docs/source/cpp/flight.rst +++ b/docs/source/cpp/flight.rst @@ -350,10 +350,10 @@ Closing unresponsive connections calls Cancel() on a timer, with the main thread resetting the timer every time an operation completes successfully. For a fully-worked out example, see the Cookbook. - + .. note:: There is a long standing ticket for a per-write/per-read timeout instead of a per call timeout (ARROW-6062_), but this is not (easily) - possible to implement with the blocking gRPC API. + possible to implement with the blocking gRPC API. .. _best gRPC practices: https://grpc.io/docs/guides/performance/#general .. _gRPC keys: https://grpc.github.io/grpc/cpp/group__grpc__arg__keys.html diff --git a/docs/source/cpp/gandiva.rst b/docs/source/cpp/gandiva.rst index 07b07bee7ac4e..f60d1fc8ac8d9 100644 --- a/docs/source/cpp/gandiva.rst +++ b/docs/source/cpp/gandiva.rst @@ -29,8 +29,8 @@ Gandiva only handles projections and filters; for other transformations, see :ref:`Compute Functions `. Gandiva was designed to take advantage of the Arrow memory format and modern -hardware. 
From the Arrow memory model, since Arrow arrays have separate buffers for values and -validity bitmaps, values and their null status can often be processed +hardware. From the Arrow memory model, since Arrow arrays have separate buffers for values and +validity bitmaps, values and their null status can often be processed independently, allowing for better instruction pipelining. On modern hardware, compiling expressions using LLVM allows the execution to be optimized to the local runtime environment and hardware, including available SIMD @@ -42,25 +42,25 @@ pre-compiled into LLVM IR (intermediate representation). Expression, Projector and Filter ================================ -To effectively utilize Gandiva, you will construct expression trees with ``TreeExprBuilder``, -including the creation of function nodes, if-else logic, and boolean expressions. +To effectively utilize Gandiva, you will construct expression trees with ``TreeExprBuilder``, +including the creation of function nodes, if-else logic, and boolean expressions. Subsequently, leverage ``Projector`` or ``Filter`` execution kernels to efficiently evaluate these expressions. -See :doc:`./gandiva/expr_projector_filter` for more details. +See :doc:`./gandiva/expr_projector_filter` for more details. External Functions Development ============================== -Gandiva offers the capability of integrating external functions, encompassing -both C functions and IR functions. This feature broadens the spectrum of -functions that can be applied within Gandiva expressions. For developers -looking to customize and enhance their computational solutions, -Gandiva provides the opportunity to develop and register their own external -functions, thus allowing for a more tailored and flexible use of the Gandiva +Gandiva offers the capability of integrating external functions, encompassing +both C functions and IR functions. This feature broadens the spectrum of +functions that can be applied within Gandiva expressions. For developers +looking to customize and enhance their computational solutions, +Gandiva provides the opportunity to develop and register their own external +functions, thus allowing for a more tailored and flexible use of the Gandiva environment. -See :doc:`./gandiva/external_func` for more details. +See :doc:`./gandiva/external_func` for more details. .. toctree:: :maxdepth: 2 gandiva/expr_projector_filter - gandiva/external_func \ No newline at end of file + gandiva/external_func diff --git a/docs/source/cpp/gandiva/expr_projector_filter.rst b/docs/source/cpp/gandiva/expr_projector_filter.rst index c960d1d869fe5..9d58b185032e3 100644 --- a/docs/source/cpp/gandiva/expr_projector_filter.rst +++ b/docs/source/cpp/gandiva/expr_projector_filter.rst @@ -30,7 +30,7 @@ literal values, created by :func:`TreeExprBuilder::MakeLiteral`. Nodes can be combined into more complex expression trees using: * :func:`TreeExprBuilder::MakeFunction` to create a function - node. (You can call :func:`GetRegisteredFunctionSignatures` to + node. (You can call :func:`GetRegisteredFunctionSignatures` to get a list of valid function signatures.) * :func:`TreeExprBuilder::MakeIf` to create if-else logic. * :func:`TreeExprBuilder::MakeAnd` and :func:`TreeExprBuilder::MakeOr` @@ -39,7 +39,7 @@ can be combined into more complex expression trees using: functions to create set membership tests. Each of these functions create new composite nodes, which contain the leaf nodes -(literals and field references) or other composite nodes as children. 
By +(literals and field references) or other composite nodes as children. By composing these, you can create arbitrarily complex expression trees. Once an expression tree is built, they are wrapped in either :class:`Expression` @@ -84,7 +84,7 @@ reused to process distinct record batches in parallel. Evaluating projections ---------------------- -Execution is performed with :func:`Projector::Evaluate`. This outputs +Execution is performed with :func:`Projector::Evaluate`. This outputs a vector of arrays, which can be passed along with the output schema to :func:`arrow::RecordBatch::Make()`. @@ -99,14 +99,14 @@ Evaluating filters :func:`Filter::Evaluate` produces :class:`SelectionVector`, a vector of row indices that matched the filter condition. The selection vector -is a wrapper around an arrow integer array, parameterized by bitwidth. When -creating the selection vector (you must initialize it *before* passing to -``Evaluate()``), you must choose the bitwidth, which determines the max index +is a wrapper around an arrow integer array, parameterized by bitwidth. When +creating the selection vector (you must initialize it *before* passing to +``Evaluate()``), you must choose the bitwidth, which determines the max index value it can hold, and the max number of slots, which determines how many indices -it may contain. In general, the max number of slots should be set to your batch -size and the bitwidth the smallest integer size that can represent all integers -less than the batch size. For example, if your batch size is 100k, set the -maximum number of slots to 100k and the bitwidth to 32 (since 2^16 = 64k which +it may contain. In general, the max number of slots should be set to your batch +size and the bitwidth the smallest integer size that can represent all integers +less than the batch size. For example, if your batch size is 100k, set the +maximum number of slots to 100k and the bitwidth to 32 (since 2^16 = 64k which would be too small). Once ``Evaluate()`` has been run and the :class:`SelectionVector` is @@ -123,10 +123,10 @@ output record batch. Evaluating projections and filters ---------------------------------- -Finally, you can also project while apply a selection vector, with +Finally, you can also project while apply a selection vector, with :func:`Projector::Evaluate()`. To do so, first make sure to initialize the :class:`Projector` with :func:`SelectionVector::GetMode()` so that the projector -compiles with the correct bitwidth. Then you can pass the +compiles with the correct bitwidth. Then you can pass the :class:`SelectionVector` into the :func:`Projector::Evaluate()` method. @@ -134,4 +134,4 @@ compiles with the correct bitwidth. Then you can pass the :language: cpp :start-after: (Doc section: Evaluate filter and projection) :end-before: (Doc section: Evaluate filter and projection) - :dedent: 2 \ No newline at end of file + :dedent: 2 diff --git a/docs/source/cpp/gandiva/external_func.rst b/docs/source/cpp/gandiva/external_func.rst index cdd8fc82e59db..f8bdde83d96e6 100644 --- a/docs/source/cpp/gandiva/external_func.rst +++ b/docs/source/cpp/gandiva/external_func.rst @@ -79,7 +79,7 @@ The ``NativeFunction`` class is used to define the metadata for an external func * ``ResultNullableType::kResultNullIfNull``: result validity is an intersection of the validity of the children. * ``ResultNullableType::kResultNullNever``: result is always valid. * ``ResultNullableType::kResultNullInternal``: result validity depends on some internal logic. 
-* ``pc_name``: The name of the corresponding precompiled function. +* ``pc_name``: The name of the corresponding precompiled function. * Typically, this name follows the convention ``{base_name}`` + ``_{param1_type}`` + ``{param2_type}`` + ... + ``{paramN_type}``. For example, if the base name is ``add`` and the function takes two ``int32`` parameters and returns an ``int32``, the precompiled function name would be ``add_int32_int32``, but this convention is not mandatory as long as you can guarantee its uniqueness. * ``flags``: Optional flags for additional function attributes (default is 0). Please check out ``NativeFunction::kNeedsContext``, ``NativeFunction::kNeedsFunctionHolder``, and ``NativeFunction::kCanReturnErrors`` for more details. @@ -153,10 +153,10 @@ Not all Arrow data types are supported in Gandiva. The following table lists the | utf8 (as return type) | int64_t context, | | | const char*, | | | uint32_t* | -| | [see next section]| +| | [see next section]| +-------------------------------------+-------------------+ -Handling arrow::StringType (utf8 type) and arrow::BinaryType +Handling arrow::StringType (utf8 type) and arrow::BinaryType ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Both ``arrow::StringType`` and ``arrow::BinaryType`` are variable-length types. And they are handled similarly in external functions. Since ``arrow::StringType`` (utf8 type) is more commonly used, we will use it below as the example to explain how to handle variable-length types in external functions. @@ -179,7 +179,7 @@ When ``arrow::StringType`` (``utf8`` type) is used as the return type in a funct 2. **Function Parameters:** * **Context Parameter**: The C function should begin with an additional parameter, ``int64_t context``. This parameter is crucial for context management within the function. * **String Length Output Parameter**: The function should also include a ``uint32_t*`` parameter at the end. This output parameter will store the length of the returned string data. -3. **Return Value**: The function should return a ``const char*`` pointer, pointing to the string data. +3. **Return Value**: The function should return a ``const char*`` pointer, pointing to the string data. 4. **Function Implementation:** * **Memory Allocation and Error Messaging:** Within the function's implementation, use ``gdv_fn_context_arena_malloc`` and ``gdv_fn_context_set_error_msg`` for memory allocation and error messaging, respectively. Both functions take ``int64_t context`` as their first parameter, facilitating efficient context utilization. @@ -200,10 +200,10 @@ You can use ``gandiva::FunctionRegistry``'s APIs to register external C function NativeFunction func, void* c_function_ptr, std::optional function_holder_maker = std::nullopt); -The above API allows you to register an external C function. +The above API allows you to register an external C function. -* The ``NativeFunction`` object describes the metadata of the external C function. -* The ``c_function_ptr`` is the function pointer to the external C function's implementation. +* The ``NativeFunction`` object describes the metadata of the external C function. +* The ``c_function_ptr`` is the function pointer to the external C function's implementation. * The optional ``function_holder_maker`` is used to create a function holder for the external C function if the external C function requires a function holder. Check out the ``gandiva::FunctionHolder`` class and its several sub-classes for more details. 
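Putting the pieces above together, the following hedged sketch registers a trivial external C function that needs neither a context argument nor a function holder. The function name, parameter types, and ``pc_name`` are purely illustrative, and the exact ``NativeFunction`` constructor arguments may differ slightly between Gandiva versions.

.. code-block:: cpp

   #include <cstdint>

   #include "arrow/api.h"
   #include "gandiva/function_registry.h"
   #include "gandiva/native_function.h"

   // A trivial external C function: no context argument, no function holder.
   extern "C" int32_t multiply_by_two_int32(int32_t value) { return value * 2; }

   arrow::Status RegisterMultiplyByTwo(gandiva::FunctionRegistry& registry) {
     // Metadata describing the function to Gandiva; see the field
     // descriptions above. Constructor details are version-dependent.
     gandiva::NativeFunction multiply_by_two(
         /*base_name=*/"multiply_by_two", /*aliases=*/{},
         /*param_types=*/{arrow::int32()}, /*ret_type=*/arrow::int32(),
         /*result_nullable_type=*/gandiva::ResultNullableType::kResultNullIfNull,
         /*pc_name=*/"multiply_by_two_int32");
     return registry.Register(multiply_by_two,
                              reinterpret_cast<void*>(multiply_by_two_int32));
   }

Once registered, and assuming the registry is passed along when building the projector or filter, the function can be referenced from expression trees by its base name like any built-in function.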
External IR functions diff --git a/docs/source/cpp/getting_started.rst b/docs/source/cpp/getting_started.rst index 89bd4559ef1e6..2cab5d1581c1c 100644 --- a/docs/source/cpp/getting_started.rst +++ b/docs/source/cpp/getting_started.rst @@ -24,17 +24,17 @@ Getting Started The following articles demonstrate installation, use, and a basic understanding of Arrow. These articles will get you setup quickly using Arrow and give you a taste of what the library is capable of. -Specifically, it contains: an installation and linking guide; documentation of conventions used -in the codebase and suggested for users; and tutorials, including: +Specifically, it contains: an installation and linking guide; documentation of conventions used +in the codebase and suggested for users; and tutorials, including: -* Building Arrow arrays and tabular structures +* Building Arrow arrays and tabular structures * Reading and writing Parquet, Arrow, and CSV files * Executing compute kernels on arrays * Reading and writing multi-file partitioned datasets Start here to gain a basic understanding of Arrow, and move on to the :doc:`/cpp/user_guide` to -explore more specific topics and underlying concepts, or the :doc:`/cpp/api` to explore Arrow's -API. +explore more specific topics and underlying concepts, or the :doc:`/cpp/api` to explore Arrow's +API. .. toctree:: @@ -44,5 +44,3 @@ API. tutorials/io_tutorial.rst tutorials/compute_tutorial.rst tutorials/datasets_tutorial.rst - - diff --git a/docs/source/cpp/memory.rst b/docs/source/cpp/memory.rst index ad8276e3728a2..33907b5580f61 100644 --- a/docs/source/cpp/memory.rst +++ b/docs/source/cpp/memory.rst @@ -205,7 +205,7 @@ simply do:: Memory Profiling ================ -On Linux, detailed profiles of memory allocations can be generated using +On Linux, detailed profiles of memory allocations can be generated using ``perf record``, without any need to modify the binaries. These profiles can show the traceback in addition to allocation size. This does require debug symbols, from either a debug build or a release with debug symbols build. @@ -234,14 +234,14 @@ recorded allocations, so we can correlate them with the call to free/de-allocate .. tab-set:: .. tab-item:: jemalloc - + .. code-block:: shell - perf probe -x libarrow.so je_arrow_mallocx '$params' - perf probe -x libarrow.so je_arrow_mallocx%return '$retval' - perf probe -x libarrow.so je_arrow_rallocx '$params' - perf probe -x libarrow.so je_arrow_rallocx%return '$retval' - perf probe -x libarrow.so je_arrow_dallocx '$params' + perf probe -x libarrow.so je_arrow_mallocx '$params' + perf probe -x libarrow.so je_arrow_mallocx%return '$retval' + perf probe -x libarrow.so je_arrow_rallocx '$params' + perf probe -x libarrow.so je_arrow_rallocx%return '$retval' + perf probe -x libarrow.so je_arrow_dallocx '$params' PROBE_ARGS="-e probe_libarrow:je_arrow_mallocx \ -e probe_libarrow:je_arrow_mallocx__return \ -e probe_libarrow:je_arrow_rallocx \ @@ -249,13 +249,13 @@ recorded allocations, so we can correlate them with the call to free/de-allocate -e probe_libarrow:je_arrow_dallocx" .. tab-item:: mimalloc - + .. 
code-block:: shell - perf probe -x libarrow.so mi_malloc_aligned '$params' - perf probe -x libarrow.so mi_malloc_aligned%return '$retval' - perf probe -x libarrow.so mi_realloc_aligned '$params' - perf probe -x libarrow.so mi_realloc_aligned%return '$retval' + perf probe -x libarrow.so mi_malloc_aligned '$params' + perf probe -x libarrow.so mi_malloc_aligned%return '$retval' + perf probe -x libarrow.so mi_realloc_aligned '$params' + perf probe -x libarrow.so mi_realloc_aligned%return '$retval' perf probe -x libarrow.so mi_free '$params' PROBE_ARGS="-e probe_libarrow:mi_malloc_aligned \ -e probe_libarrow:mi_malloc_aligned__return \ @@ -277,9 +277,9 @@ If you want to profile a running process, you can run ``perf record -p `` and it will record until you interrupt with CTRL+C. Alternatively, you can do ``perf record -P sleep 10`` to record for 10 seconds. -The resulting data can be processed with standard tools to work with perf or +The resulting data can be processed with standard tools to work with perf or ``perf script`` can be used to pipe a text format of the data to custom scripts. -The following script parses ``perf script`` output and prints the output in +The following script parses ``perf script`` output and prints the output in new lines delimited JSON for easier processing. .. code-block:: python @@ -354,7 +354,7 @@ Here's an example invocation of that script, with a preview of output data: From there one can answer a number of questions. For example, the following -script will find which allocations were never freed, and print the associated +script will find which allocations were never freed, and print the associated tracebacks along with the count of dangling allocations: .. code-block:: python diff --git a/docs/source/cpp/parquet.rst b/docs/source/cpp/parquet.rst index 3e06352f5dde3..96897d139b351 100644 --- a/docs/source/cpp/parquet.rst +++ b/docs/source/cpp/parquet.rst @@ -51,8 +51,8 @@ FileReader ---------- To read Parquet data into Arrow structures, use :class:`arrow::FileReader`. -To construct, it requires a :class:`::arrow::io::RandomAccessFile` instance -representing the input file. To read the whole file at once, +To construct, it requires a :class:`::arrow::io::RandomAccessFile` instance +representing the input file. To read the whole file at once, use :func:`arrow::FileReader::ReadTable`: .. literalinclude:: ../../../cpp/examples/arrow/parquet_read_write.cc @@ -67,7 +67,7 @@ Finer-grained options are available through the and :class:`ArrowReaderProperties` classes. For reading as a stream of batches, use the :func:`arrow::FileReader::GetRecordBatchReader` -method to retrieve a :class:`arrow::RecordBatchReader`. It will use the batch +method to retrieve a :class:`arrow::RecordBatchReader`. It will use the batch size set in :class:`ArrowReaderProperties`. .. literalinclude:: ../../../cpp/examples/arrow/parquet_read_write.cc @@ -106,8 +106,8 @@ If memory efficiency is more important than performance, then: #. Turn on ``enable_buffered_stream`` in :class:`parquet::ReaderProperties`. In addition, if you know certain columns contain many repeated values, you can -read them as :term:`dictionary encoded` columns. This is -enabled with the ``set_read_dictionary`` setting on :class:`ArrowReaderProperties`. +read them as :term:`dictionary encoded` columns. This is +enabled with the ``set_read_dictionary`` setting on :class:`ArrowReaderProperties`. 
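For illustration only, a minimal sketch of requesting dictionary decoding for a single column while constructing a reader might look like the following; the file path and the column index ``0`` are placeholders, and error handling is reduced to the usual status macros.

.. code-block:: cpp

   #include <memory>
   #include <string>

   #include "arrow/io/file.h"
   #include "arrow/result.h"
   #include "arrow/status.h"
   #include "parquet/arrow/reader.h"
   #include "parquet/properties.h"

   // Sketch: request dictionary decoding for column 0 while building a FileReader.
   arrow::Status OpenWithDictionaryColumn(
       const std::string& path, std::unique_ptr<parquet::arrow::FileReader>* out) {
     ARROW_ASSIGN_OR_RAISE(auto input, arrow::io::ReadableFile::Open(path));

     parquet::ArrowReaderProperties arrow_props;
     arrow_props.set_read_dictionary(/*column_index=*/0, /*read_dict=*/true);

     parquet::arrow::FileReaderBuilder builder;
     ARROW_RETURN_NOT_OK(builder.Open(std::move(input)));
     builder.properties(arrow_props);
     return builder.Build(out);
   }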
If the files were written with Arrow C++ and the ``store_schema`` was activated, then the original Arrow schema will be automatically read and will override this setting. @@ -174,7 +174,7 @@ The :func:`arrow::WriteTable` function writes an entire .. note:: - Column compression is off by default in C++. See :ref:`below ` + Column compression is off by default in C++. See :ref:`below ` for how to choose a compression codec in the writer properties. To write out data batch-by-batch, use :class:`arrow::FileWriter`. @@ -191,9 +191,9 @@ StreamWriter The :class:`StreamWriter` allows for Parquet files to be written using standard C++ output operators, similar to reading with the :class:`StreamReader` -class. This type-safe approach also ensures that rows are written without -omitting fields and allows for new row groups to be created automatically -(after certain volume of data) or explicitly by using the :type:`EndRowGroup` +class. This type-safe approach also ensures that rows are written without +omitting fields and allows for new row groups to be created automatically +(after certain volume of data) or explicitly by using the :type:`EndRowGroup` stream modifier. Exceptions are used to signal errors. A :class:`ParquetException` is @@ -266,20 +266,20 @@ group that takes precedent over the ``chunk_size`` passed in the write methods. You can set the version of Parquet to write with ``version``, which determines which logical types are available. In addition, you can set the data page version with ``data_page_version``. It's V1 by default; setting to V2 will allow more -optimal compression (skipping compressing pages where there isn't a space +optimal compression (skipping compressing pages where there isn't a space benefit), but not all readers support this data page version. -Compression is off by default, but to get the most out of Parquet, you should -also choose a compression codec. You can choose one for the whole file or +Compression is off by default, but to get the most out of Parquet, you should +also choose a compression codec. You can choose one for the whole file or choose one for individual columns. If you choose a mix, the file-level option -will apply to columns that don't have a specific compression codec. See +will apply to columns that don't have a specific compression codec. See :class:`::arrow::Compression` for options. -Column data encodings can likewise be applied at the file-level or at the -column level. By default, the writer will attempt to dictionary encode all +Column data encodings can likewise be applied at the file-level or at the +column level. By default, the writer will attempt to dictionary encode all supported columns, unless the dictionary grows too large. This behavior can be changed at file-level or at the column level with ``disable_dictionary()``. -When not using dictionary encoding, it will fallback to the encoding set for +When not using dictionary encoding, it will fallback to the encoding set for the column or the overall file; by default ``Encoding::PLAIN``, but this can be changed with ``encoding()``. @@ -559,7 +559,7 @@ Encryption Parquet C++ implements all features specified in the `encryption specification `__, -except for encryption of column index and bloom filter modules. +except for encryption of column index and bloom filter modules. 
More specifically, Parquet C++ supports: diff --git a/docs/source/cpp/tables.rst b/docs/source/cpp/tables.rst index b28a9fc1e13a5..d98a2acde6620 100644 --- a/docs/source/cpp/tables.rst +++ b/docs/source/cpp/tables.rst @@ -81,13 +81,13 @@ and computation functions, possibly incremental. :alt: A graphical representation of an Arrow Table and a Record Batch, with structure as described in text above. -Record batches can be sent between implementations, such as via +Record batches can be sent between implementations, such as via :ref:`IPC ` or -via the :doc:`C Data Interface <../format/CDataInterface>`. Tables and +via the :doc:`C Data Interface <../format/CDataInterface>`. Tables and chunked arrays, on the other hand, are concepts in the C++ implementation, not in the Arrow format itself, so they aren't directly portable. -However, a table can be converted to and built from a sequence of record +However, a table can be converted to and built from a sequence of record batches easily without needing to copy the underlying array buffers. A table can be streamed as an arbitrary number of record batches using a :class:`arrow::TableBatchReader`. Conversely, a logical sequence of diff --git a/docs/source/cpp/threading.rst b/docs/source/cpp/threading.rst index 24ad25b5a028a..4a1a65ffe012d 100644 --- a/docs/source/cpp/threading.rst +++ b/docs/source/cpp/threading.rst @@ -99,4 +99,4 @@ Arrow C++ uses :class:`arrow::Future` to communicate results between threads. T an :class:`arrow::Future` will be created when an operation needs to perform some kind of long running task that will block for some period of time. :class:`arrow::Future` objects are mainly meant for internal use and any method that returns an -:class:`arrow::Future` will usually have a synchronous variant as well. \ No newline at end of file +:class:`arrow::Future` will usually have a synchronous variant as well. diff --git a/docs/source/cpp/tutorials/compute_tutorial.rst b/docs/source/cpp/tutorials/compute_tutorial.rst index bcb87e6a8f992..a650865d75ce4 100644 --- a/docs/source/cpp/tutorials/compute_tutorial.rst +++ b/docs/source/cpp/tutorials/compute_tutorial.rst @@ -34,7 +34,7 @@ functionality to: 3. Search for a value in a column -Pre-requisites +Pre-requisites --------------- Before continuing, make sure you have: @@ -49,16 +49,16 @@ Setup Before running some computations, we need to fill in a couple gaps: 1. We need to include necessary headers. - + 2. ``A main()`` is needed to glue things together. 3. We need data to play with. - + Includes ^^^^^^^^ -Before writing C++ code, we need some includes. We'll get ``iostream`` for output, then import Arrow's -compute functionality: +Before writing C++ code, we need some includes. We'll get ``iostream`` for output, then import Arrow's +compute functionality: .. literalinclude:: ../../../../cpp/examples/tutorial_examples/compute_example.cc :language: cpp @@ -340,4 +340,4 @@ Refer to the below for a copy of the complete code: :start-after: (Doc section: Compute Example) :end-before: (Doc section: Compute Example) :linenos: - :lineno-match: \ No newline at end of file + :lineno-match: diff --git a/docs/source/cpp/tutorials/datasets_tutorial.rst b/docs/source/cpp/tutorials/datasets_tutorial.rst index 285fc24d8d599..f60e1e52170ae 100644 --- a/docs/source/cpp/tutorials/datasets_tutorial.rst +++ b/docs/source/cpp/tutorials/datasets_tutorial.rst @@ -33,7 +33,7 @@ file on disk. In this article, you will: 2. write out a partitioned dataset from a Table. 
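As a rough preview of the read side covered below, and assuming a local directory of Parquet files plus an Arrow build with dataset and Parquet support (the path and the helper function are placeholders), the flow amounts to discovering files, building a dataset, and scanning it into a table:

.. code-block:: cpp

   #include <memory>
   #include <string>

   #include "arrow/api.h"
   #include "arrow/dataset/api.h"
   #include "arrow/filesystem/api.h"

   namespace ds = arrow::dataset;
   namespace fs = arrow::fs;

   // Sketch: discover Parquet files under a directory and scan them into a Table.
   arrow::Result<std::shared_ptr<arrow::Table>> ReadWholeDataset(
       const std::string& root_path) {
     auto filesystem = std::make_shared<fs::LocalFileSystem>();

     fs::FileSelector selector;
     selector.base_dir = root_path;
     selector.recursive = true;

     ARROW_ASSIGN_OR_RAISE(
         auto factory,
         ds::FileSystemDatasetFactory::Make(filesystem, selector,
                                            std::make_shared<ds::ParquetFileFormat>(),
                                            ds::FileSystemFactoryOptions()));
     ARROW_ASSIGN_OR_RAISE(auto dataset, factory->Finish());

     ARROW_ASSIGN_OR_RAISE(auto scanner_builder, dataset->NewScan());
     ARROW_ASSIGN_OR_RAISE(auto scanner, scanner_builder->Finish());
     return scanner->ToTable();
   }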
-Pre-requisites +Pre-requisites --------------- Before continuing, make sure you have: @@ -50,7 +50,7 @@ Setup Before running some computations, we need to fill in a couple gaps: 1. We need to include necessary headers. - + 2. A ``main()`` is needed to glue things together. 3. We need data on disk to play with. @@ -58,8 +58,8 @@ Before running some computations, we need to fill in a couple gaps: Includes ^^^^^^^^ -Before writing C++ code, we need some includes. We'll get ``iostream`` for output, then import Arrow's -compute functionality for each file type we'll work with in this article: +Before writing C++ code, we need some includes. We'll get ``iostream`` for output, then import Arrow's +compute functionality for each file type we'll work with in this article: .. literalinclude:: ../../../../cpp/examples/tutorial_examples/dataset_example.cc :language: cpp @@ -206,7 +206,7 @@ Build Dataset using Factory ^^^^^^^^^^^^^^^^^^^^^^^^^^^ With a :class:`dataset::FileSystemDatasetFactory` set up, we can actually build our -:class:`dataset::Dataset` with :func:`dataset::FileSystemDatasetFactory::Finish`, just +:class:`dataset::Dataset` with :func:`dataset::FileSystemDatasetFactory::Finish`, just like with an :class:`ArrayBuilder` back in the basic tutorial: .. literalinclude:: ../../../../cpp/examples/tutorial_examples/dataset_example.cc @@ -228,14 +228,14 @@ dataset, and print those out, along with some small info: Move Dataset into Table ^^^^^^^^^^^^^^^^^^^^^^^ -One way we can do something with :class:`Datasets ` is getting -them into a :class:`Table`, where we can do anything we’ve learned we can do to -:class:`Tables
` to that :class:`Table`. +One way we can do something with :class:`Datasets ` is getting +them into a :class:`Table`, where we can do anything we’ve learned we can do to +:class:`Tables
` to that :class:`Table`. .. seealso:: :doc:`/cpp/streaming_execution` for execution that avoids manifesting the entire dataset in memory. -In order to move a :class:`Dataset’s ` contents into a :class:`Table`, -we need a :class:`dataset::Scanner`, which scans the data and outputs it to the :class:`Table`. +In order to move a :class:`Dataset’s ` contents into a :class:`Table`, +we need a :class:`dataset::Scanner`, which scans the data and outputs it to the :class:`Table`. First, we get a :class:`dataset::ScannerBuilder` from the :class:`dataset::Dataset`: .. literalinclude:: ../../../../cpp/examples/tutorial_examples/dataset_example.cc @@ -305,7 +305,7 @@ Create Scanner for Moving Table Data The process for writing a :class:`dataset::Dataset`, once a source of data is available, is similar to the reverse of reading it. Before, we used a :class:`dataset::Scanner` in order to scan into a :class:`Table` – now, we need one to read out of our -:class:`TableBatchReader`. To get that :class:`dataset::Scanner`, we’ll make a :class:`dataset::ScannerBuilder` +:class:`TableBatchReader`. To get that :class:`dataset::Scanner`, we’ll make a :class:`dataset::ScannerBuilder` based on our :class:`TableBatchReader`, then use that Builder to build a :class:`dataset::Scanner`: .. literalinclude:: ../../../../cpp/examples/tutorial_examples/dataset_example.cc @@ -343,7 +343,7 @@ Arrow, so we’ll write back out to that: :start-after: (Doc section: Write Format) :end-before: (Doc section: Write Format) -Configure FileSystemDatasetWriteOptions +Configure FileSystemDatasetWriteOptions ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ In order to write to disk, we need some configuration. We’ll do so via @@ -435,11 +435,11 @@ tutorials. With that, you’ve read and written partitioned datasets! This method, with some configuration, will work for any supported dataset format. For an example of such a dataset, the NYC Taxi dataset is a well-known -one, which you can find `here `_. +one, which you can find `here `_. Now you can get larger-than-memory data mapped for use! Which means that now we have to be able to process this data without -pulling it all into memory at once. For this, try Acero. +pulling it all into memory at once. For this, try Acero. .. seealso:: :doc:`/cpp/streaming_execution` for more information on Acero. @@ -450,4 +450,4 @@ Refer to the below for a copy of the complete code: :start-after: (Doc section: Dataset Example) :end-before: (Doc section: Dataset Example) :linenos: - :lineno-match: \ No newline at end of file + :lineno-match: diff --git a/docs/source/cpp/tutorials/io_tutorial.rst b/docs/source/cpp/tutorials/io_tutorial.rst index f981c94b83e32..309f10a350aa3 100644 --- a/docs/source/cpp/tutorials/io_tutorial.rst +++ b/docs/source/cpp/tutorials/io_tutorial.rst @@ -33,7 +33,7 @@ the start to end of an application. In this article, you will: 3. Read a Parquet file into a :class:`Table` and write it back out afterwards -Pre-requisites +Pre-requisites --------------- Before continuing, make sure you have: @@ -50,7 +50,7 @@ Setup Before writing out some file I/O, we need to fill in a couple gaps: 1. We need to include necessary headers. - + 2. A ``main()`` is needed to glue things together. 3. We need files to play with. @@ -58,8 +58,8 @@ Before writing out some file I/O, we need to fill in a couple gaps: Includes ^^^^^^^^ -Before writing C++ code, we need some includes. 
We'll get ``iostream`` for output, then import Arrow's -I/O functionality for each file type we'll work with in this article: +Before writing C++ code, we need some includes. We'll get ``iostream`` for output, then import Arrow's +I/O functionality for each file type we'll work with in this article: .. literalinclude:: ../../../../cpp/examples/tutorial_examples/file_access_example.cc :language: cpp @@ -156,8 +156,8 @@ Opening an Arrow file Reader ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ An :class:`io::ReadableFile` is too generic to offer all functionality to read an Arrow file. -We need to use it to get an :class:`ipc::RecordBatchFileReader` object. This object implements -all the logic needed to read an Arrow file with correct formatting. We get one through +We need to use it to get an :class:`ipc::RecordBatchFileReader` object. This object implements +all the logic needed to read an Arrow file with correct formatting. We get one through :func:`ipc::RecordBatchFileReader::Open`: .. literalinclude:: ../../../../cpp/examples/tutorial_examples/file_access_example.cc @@ -294,8 +294,8 @@ Write a CSV File from Table CSV writing to :class:`Table` looks exactly like IPC writing to :class:`RecordBatch`, except with our :class:`Table`, and using :func:`ipc::RecordBatchWriter::WriteTable` instead of -:func:`ipc::RecordBatchWriter::WriteRecordBatch`. Note that the same writer class is used -- -we're writing with :func:`ipc::RecordBatchWriter::WriteTable` because we have a :class:`Table`. We’ll target +:func:`ipc::RecordBatchWriter::WriteRecordBatch`. Note that the same writer class is used -- +we're writing with :func:`ipc::RecordBatchWriter::WriteTable` because we have a :class:`Table`. We’ll target a file, use our :class:`Table’s
` :class:`Schema`, and then write the :class:`Table`: .. literalinclude:: ../../../../cpp/examples/tutorial_examples/file_access_example.cc @@ -358,7 +358,7 @@ even though we used :func:`io::ReadableFile::Open`. Note that we pass our Reading a Parquet File to Table ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -With a prepared :class:`parquet::arrow::FileReader` in hand, we can read to a +With a prepared :class:`parquet::arrow::FileReader` in hand, we can read to a :class:`Table`, except we must pass the :class:`Table` by reference instead of outputting to it: .. literalinclude:: ../../../../cpp/examples/tutorial_examples/file_access_example.cc @@ -401,4 +401,4 @@ Refer to the below for a copy of the complete code: :start-after: (Doc section: File I/O) :end-before: (Doc section: File I/O) :linenos: - :lineno-match: \ No newline at end of file + :lineno-match: diff --git a/docs/source/developers/continuous_integration/index.rst b/docs/source/developers/continuous_integration/index.rst index f988b5ab69d50..cfca14e10e48c 100644 --- a/docs/source/developers/continuous_integration/index.rst +++ b/docs/source/developers/continuous_integration/index.rst @@ -27,4 +27,4 @@ Continuous Integration overview docker archery - crossbow \ No newline at end of file + crossbow diff --git a/docs/source/developers/cpp/building.rst b/docs/source/developers/cpp/building.rst index 5fab745679e93..b052b856c9bd5 100644 --- a/docs/source/developers/cpp/building.rst +++ b/docs/source/developers/cpp/building.rst @@ -67,7 +67,7 @@ On Alpine Linux: gcc \ ninja \ make - + On Fedora Linux: .. code-block:: shell @@ -99,7 +99,7 @@ On macOS, you can use `Homebrew `_: With `vcpkg `_: .. code-block:: shell - + git clone https://github.com/apache/arrow.git cd arrow vcpkg install \ @@ -312,7 +312,7 @@ depends on ``python`` being available). On some Linux distributions, running the test suite might require setting an explicit locale. If you see any locale-related errors, try setting the -environment variable (which requires the `locales` package or equivalent): +environment variable (which requires the ``locales`` package or equivalent): .. code-block:: @@ -362,7 +362,7 @@ boolean flags to ``cmake``. * ``-DARROW_GCS=ON``: Build Arrow with GCS support (requires the GCloud SDK for C++) * ``-DARROW_HDFS=ON``: Arrow integration with libhdfs for accessing the Hadoop Filesystem -* ``-DARROW_JEMALLOC=ON``: Build the Arrow jemalloc-based allocator, on by default +* ``-DARROW_JEMALLOC=ON``: Build the Arrow jemalloc-based allocator, on by default * ``-DARROW_JSON=ON``: JSON reader module * ``-DARROW_MIMALLOC=ON``: Build the Arrow mimalloc-based allocator * ``-DARROW_ORC=ON``: Arrow integration with Apache ORC @@ -375,7 +375,7 @@ boolean flags to ``cmake``. instead. 
* ``-DARROW_S3=ON``: Support for Amazon S3-compatible filesystems * ``-DARROW_SUBSTRAIT=ON``: Build with support for Substrait -* ``-DARROW_WITH_RE2=ON``: Build with support for regular expressions using the re2 +* ``-DARROW_WITH_RE2=ON``: Build with support for regular expressions using the re2 library, on by default and used when ``ARROW_COMPUTE`` or ``ARROW_GANDIVA`` is ``ON`` * ``-DARROW_WITH_UTF8PROC=ON``: Build with support for Unicode properties using the utf8proc library, on by default and used when ``ARROW_COMPUTE`` or ``ARROW_GANDIVA`` @@ -472,7 +472,7 @@ The build system supports a number of third-party dependencies * ``c-ares``: a dependency of gRPC * ``gflags``: for command line utilities (formerly Googleflags) * ``GLOG``: for logging - * ``google_cloud_cpp_storage``: for Google Cloud Storage support, requires + * ``google_cloud_cpp_storage``: for Google Cloud Storage support, requires system cURL and can use the ``BUNDLED`` method described below * ``gRPC``: for remote procedure calls * ``GTest``: Googletest, for testing @@ -627,9 +627,10 @@ outputs like: Deprecations and API Changes ---------------------------- -We use the compiler definition ``ARROW_NO_DEPRECATED_API`` to disable APIs that -have been deprecated. It is a good practice to compile third party applications -with this flag to proactively catch and account for API changes. +We use the marco ``ARROW_DEPRECATED`` which wraps C++ deprecated attribute for +APIs that have been deprecated. It is a good practice to compile third party +applications with ``-Werror=deprecated-declarations`` (for GCC/Clang or similar +flags of other compilers) to proactively catch and account for API changes. Modular Build Targets --------------------- diff --git a/docs/source/developers/cpp/windows.rst b/docs/source/developers/cpp/windows.rst index 251a45325fe0b..60ac949e81663 100644 --- a/docs/source/developers/cpp/windows.rst +++ b/docs/source/developers/cpp/windows.rst @@ -379,9 +379,9 @@ Downloading the Timezone Database ================================= To run some of the compute unit tests on Windows, the IANA timezone database -and the Windows timezone mapping need to be downloaded first. See +and the Windows timezone mapping need to be downloaded first. See :ref:`download-timezone-database` for download instructions. To set a non-default -path for the timezone database while running the unit tests, set the +path for the timezone database while running the unit tests, set the ``ARROW_TIMEZONE_DATABASE`` environment variable. Replicating Appveyor Builds diff --git a/docs/source/developers/documentation.rst b/docs/source/developers/documentation.rst index 8b1ea28c0f54b..a479065f6297e 100644 --- a/docs/source/developers/documentation.rst +++ b/docs/source/developers/documentation.rst @@ -259,7 +259,7 @@ Build the docs in the target directory: sphinx-build ./source/developers ./source/developers/_build -c ./source -D master_doc=temp_index This builds everything in the target directory to a folder inside of it -called ``_build`` using the config file in the `source` directory. +called ``_build`` using the config file in the ``source`` directory. 
Once you have verified the HTML documents, you can remove temporary index file: diff --git a/docs/source/developers/guide/architectural_overview.rst b/docs/source/developers/guide/architectural_overview.rst index 58e05c85f457e..085a814453c84 100644 --- a/docs/source/developers/guide/architectural_overview.rst +++ b/docs/source/developers/guide/architectural_overview.rst @@ -29,8 +29,8 @@ Architectural Overview ********************** -A general overview of Apache Arrow project can be found on the -`front page `_ and in the +A general overview of Apache Arrow project can be found on the +`front page `_ and in the `Apache Arrow Overview `_. You can also have a look at the `Frequently Asked Questions `_. diff --git a/docs/source/developers/guide/communication.rst b/docs/source/developers/guide/communication.rst index a8659f83ac04d..749c94f9419b2 100644 --- a/docs/source/developers/guide/communication.rst +++ b/docs/source/developers/guide/communication.rst @@ -27,7 +27,7 @@ .. _communication: ************* -Communication +Communication ************* **About the contributors** @@ -50,7 +50,7 @@ tags ([C++], [R], [Ruby] etc.) so it gets noticed by the right people. Where to get help 👋 ==================== -For any question you may have or problems you are facing you can write to +For any question you may have or problems you are facing you can write to user or development :ref:`mailing_list` or you can create an issue on :ref:`github`. Also use GitHub to search through the issues, report bugs and create feature requests or proposals. diff --git a/docs/source/developers/guide/documentation.rst b/docs/source/developers/guide/documentation.rst index 3bb3bebef5098..8f9d7311e765f 100644 --- a/docs/source/developers/guide/documentation.rst +++ b/docs/source/developers/guide/documentation.rst @@ -49,7 +49,7 @@ documentation itself, you can search for an issue in GitHub. Documentation improvements are also a great way to gain some experience with our submission and review process without -requiring a lot of local development environment setup. +requiring a lot of local development environment setup. .. note:: Many documentation-only changes can be made directly in the @@ -114,4 +114,3 @@ library. Source folder includes: **Cookbooks** have their own repository ``_ and can be separately cloned and built. - diff --git a/docs/source/developers/guide/index.rst b/docs/source/developers/guide/index.rst index 353c8332ff0b5..0ed27a0ddc54e 100644 --- a/docs/source/developers/guide/index.rst +++ b/docs/source/developers/guide/index.rst @@ -83,17 +83,17 @@ of adding a basic feature. the installation of third-party packages, depending on which build options and components you enable. The C++ build guide has suggestions for commonly encountered issues - you can find it - :ref:`here `. + :ref:`here `. Anytime you are stuck, feel free to reach out via appropriate :ref:`communication` channel. - See a short description about the building process of + See a short description about the building process of :ref:`PyArrow or the R package` or go straight to detailed instructions on how to build one of Arrow libraries in the `documentation `_ . - + #. **Run the tests** - + We should run the tests to check if everything is working correctly. For example, you can run the tests from a terminal for Python @@ -155,7 +155,7 @@ There are lots of ways to contribute to the project besides writing code! * Improving the **documentation** is a great way to start contributing! 
For more information visit :ref:`documentation` section of the guide. -* **Apache Arrow Cookbooks** are a collection of recipes for solving various problems +* **Apache Arrow Cookbooks** are a collection of recipes for solving various problems and completing different tasks using Apache Arrow. They are also a great way to start contributing. For more information visit `How to contribute to Apache Arrow Cookbook `_ diff --git a/docs/source/developers/guide/resources.rst b/docs/source/developers/guide/resources.rst index f350f469af403..b5905af65499b 100644 --- a/docs/source/developers/guide/resources.rst +++ b/docs/source/developers/guide/resources.rst @@ -78,7 +78,7 @@ Reproducible examples: - `Tidyverse: Make a reprex `_ - `Craft Minimal Bug Reports by Matthew Rocklin `_ -Recommended references +Recommended references ---------------------- - Slatkin, Brett, *Effective Python: 90 Specific Ways to Write Better Python*, Addison-Wesley Professional, 2019 diff --git a/docs/source/developers/guide/step_by_step/arrow_codebase.rst b/docs/source/developers/guide/step_by_step/arrow_codebase.rst index 0beece991b197..0c194ab3a3f70 100644 --- a/docs/source/developers/guide/step_by_step/arrow_codebase.rst +++ b/docs/source/developers/guide/step_by_step/arrow_codebase.rst @@ -99,8 +99,8 @@ can be called from a function in another language. After a function is defined C++ we must create the binding manually to use it in that implementation. .. note:: - There is much you can learn by checking **Pull Requests** - and **unit tests** for similar issues. + There is much you can learn by checking **Pull Requests** + and **unit tests** for similar issues. .. tab-set:: diff --git a/docs/source/developers/guide/step_by_step/finding_issues.rst b/docs/source/developers/guide/step_by_step/finding_issues.rst index 390c56a81c73f..a76b15e917e9a 100644 --- a/docs/source/developers/guide/step_by_step/finding_issues.rst +++ b/docs/source/developers/guide/step_by_step/finding_issues.rst @@ -65,7 +65,7 @@ person who triaged the ticket expected it to be. Don't hesitate to write that in the comments. .. note:: - + When you find a GitHub issue you would like to work on, please mention your interest in the comment section of that issue; that way we will know you are working on it. diff --git a/docs/source/developers/guide/step_by_step/set_up.rst b/docs/source/developers/guide/step_by_step/set_up.rst index 60b472637badb..9c808ceee7be6 100644 --- a/docs/source/developers/guide/step_by_step/set_up.rst +++ b/docs/source/developers/guide/step_by_step/set_up.rst @@ -60,7 +60,7 @@ a username and password each time you execute a git command. RStudio project and will create a ``.Rproj`` file in the root directory. For this reason it is *highly recommended* to clone the repository using the command line or a Git client. - + Get the source code =================== @@ -118,10 +118,10 @@ Should give you a result similar to this: .. 
code:: console - origin https://github.com//arrow.git (fetch) - origin https://github.com//arrow.git (push) - upstream https://github.com/apache/arrow (fetch) - upstream https://github.com/apache/arrow (push) + origin https://github.com//arrow.git (fetch) + origin https://github.com//arrow.git (push) + upstream https://github.com/apache/arrow (fetch) + upstream https://github.com/apache/arrow (push) If you did everything correctly, you should now have a copy of the code in the ``arrow`` directory and two remotes that refer to your own GitHub diff --git a/docs/source/developers/guide/step_by_step/styling.rst b/docs/source/developers/guide/step_by_step/styling.rst index bb428b0b6ab40..c155acb389512 100644 --- a/docs/source/developers/guide/step_by_step/styling.rst +++ b/docs/source/developers/guide/step_by_step/styling.rst @@ -59,4 +59,4 @@ check your code and will stop the commit process, described in the following section, if there are any errors. - `Pre-commit installation instructions `_ -- `Pre-commit hooks `_ \ No newline at end of file +- `Pre-commit hooks `_ diff --git a/docs/source/developers/guide/tutorials/index.rst b/docs/source/developers/guide/tutorials/index.rst index dcefab23230f9..5f44231afc9c2 100644 --- a/docs/source/developers/guide/tutorials/index.rst +++ b/docs/source/developers/guide/tutorials/index.rst @@ -25,4 +25,4 @@ Tutorials :maxdepth: 1 python_tutorial - r_tutorial \ No newline at end of file + r_tutorial diff --git a/docs/source/developers/guide/tutorials/python_tutorial.rst b/docs/source/developers/guide/tutorials/python_tutorial.rst index 7f004160b0e75..c12c4489aee95 100644 --- a/docs/source/developers/guide/tutorials/python_tutorial.rst +++ b/docs/source/developers/guide/tutorials/python_tutorial.rst @@ -137,7 +137,7 @@ function is defined in the ``compute.py`` file. After examining the ``compute.py`` file we can see that together with ``_compute.pyx`` the functions from C++ get wrapped into Python. -We will define the new feature at the end of the ``compute.py`` file. +We will define the new feature at the end of the ``compute.py`` file. Lets run some code in the Python console from ``arrow/python`` directory in order to learn more about ``pc.min_max``. @@ -147,10 +147,10 @@ directory in order to learn more about ``pc.min_max``. $ cd python $ python - Python 3.9.7 (default, Oct 22 2021, 13:24:00) + Python 3.9.7 (default, Oct 22 2021, 13:24:00) [Clang 13.0.0 (clang-1300.0.29.3)] on darwin Type "help", "copyright", "credits" or "license" for more information. - + We have entered into the Python console from the shell and we can do some research: @@ -278,7 +278,7 @@ options for the ``pc.min_max`` function we can finish the work. return pa.scalar([('min-', min_t), ('max+', max_t)], type=ty) .. TODO seealso - .. For more information about the Arrow codebase visit + .. For more information about the Arrow codebase visit .. :ref:``. (link to working on the Arrow codebase section) Adding a test @@ -303,24 +303,24 @@ a specific unit test, pass in the test name to the ``-k`` parameter. .. 
code:: console $ cd python - $ python -m pytest pyarrow/tests/test_compute.py -k test_tutorial_min_max + $ python -m pytest pyarrow/tests/test_compute.py -k test_tutorial_min_max ======================== test session starts ========================== platform darwin -- Python 3.9.7, pytest-6.2.5, py-1.10.0, pluggy-1.0.0 rootdir: /Users/alenkafrim/repos/arrow/python, configfile: setup.cfg plugins: hypothesis-6.24.1, lazy-fixture-0.6.3 - collected 204 items / 203 deselected / 1 selected + collected 204 items / 203 deselected / 1 selected pyarrow/tests/test_compute.py . [100%] ======================== 1 passed, 203 deselected in 0.16s ============ - - $ python -m pytest pyarrow/tests/test_compute.py + + $ python -m pytest pyarrow/tests/test_compute.py ======================== test session starts =========================== platform darwin -- Python 3.9.7, pytest-6.2.5, py-1.10.0, pluggy-1.0.0 rootdir: /Users/alenkafrim/repos/arrow/python, configfile: setup.cfg plugins: hypothesis-6.24.1, lazy-fixture-0.6.3 - collected 204 items + collected 204 items pyarrow/tests/test_compute.py ................................... [ 46%] ................................................. [100%] @@ -339,7 +339,7 @@ utility called `Archery ` to check if code is in line with PEP 8 style guide. .. code:: console - + $ archery lint --python --fix INFO:archery:Running Python formatter (autopep8) INFO:archery:Running Python linter (flake8) @@ -430,7 +430,7 @@ to the branch history): $ git commit -am "Adding a new compute feature for tutorial purposes" [ARROW-14977 170ef85be] Adding a new compute feature for tutorial purposes 2 files changed, 51 insertions(+) - + We can use ``git log`` to check the history of commits: @@ -448,12 +448,12 @@ We can use ``git log`` to check the history of commits: Date: Sun Dec 5 15:19:46 2021 +0900 ARROW-14981: [CI][Docs] Upload built documents - + We can use this in release process instead of building on release manager's local environment. - + Closes #11856 from kou/ci-docs-upload - + Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei ... @@ -478,10 +478,10 @@ called ``origin``. Writing objects: 100% (7/7), 1.19 KiB | 1.19 MiB/s, done. Total 7 (delta 6), reused 0 (delta 0), pack-reused 0 remote: Resolving deltas: 100% (6/6), completed with 6 local objects. - remote: + remote: remote: Create a pull request for 'ARROW-14977' on GitHub by visiting: remote: https://github.com/AlenkaF/arrow/pull/new/ARROW-14977 - remote: + remote: To https://github.com/AlenkaF/arrow.git * [new branch] ARROW-14977 -> ARROW-14977 @@ -490,7 +490,7 @@ to create a Pull Request. On the GitHub Arrow page (main or forked) we will see a yellow notice bar with a note that we made recent pushes to the branch ARROW-14977. That’s great, now we can make the Pull Request -by clicking on **Compare & pull request**. +by clicking on **Compare & pull request**. .. figure:: ../../images/python_tutorial_github_pr_notice.jpeg :scale: 50 % @@ -527,5 +527,5 @@ the code, comment, resolve conversations and so on. The Pull Request we made can be viewed `here `_. .. seealso:: - + For more information about Pull Request workflow see :ref:`pr_lifecycle`. 
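As a quick, self-contained illustration of the compute function explored in the tutorial above, the snippet below exercises ``pc.min_max`` on a small array. It is only an editorial sketch for orientation; the values and options shown are not part of the tutorial's new feature.

.. code-block:: python

    import pyarrow as pa
    import pyarrow.compute as pc

    arr = pa.array([3, 1, 7, None, 5])

    # min_max returns a StructScalar with "min" and "max" fields.
    result = pc.min_max(arr)
    print(result["min"].as_py(), result["max"].as_py())  # -> 1 7

    # Null handling is controlled through ScalarAggregateOptions.
    strict = pc.min_max(arr, options=pc.ScalarAggregateOptions(skip_nulls=False))
    print(strict["min"].as_py(), strict["max"].as_py())  # -> None None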
diff --git a/docs/source/developers/java/building.rst b/docs/source/developers/java/building.rst index c059ff676efb2..82053e901186c 100644 --- a/docs/source/developers/java/building.rst +++ b/docs/source/developers/java/building.rst @@ -350,7 +350,7 @@ Arrow repository, and update the following settings: * To enable debugging JNI-based modules like ``dataset``, activate specific profiles in the Maven tab under "Profiles". Ensure the profiles ``arrow-c-data``, ``arrow-jni``, ``generate-libs-cdata-all-os``, - ``generate-libs-jni-macos-linux``, and ``jdk11+`` are enabled, so that the + ``generate-libs-jni-macos-linux``, and ``jdk11+`` are enabled, so that the IDE can build them and enable debugging. You may not need to update all of these settings if you build/test with the diff --git a/docs/source/developers/java/development.rst b/docs/source/developers/java/development.rst index 17d47c324ce12..3f0ff6cdd0103 100644 --- a/docs/source/developers/java/development.rst +++ b/docs/source/developers/java/development.rst @@ -118,7 +118,7 @@ This checks the code style of all source code under the current directory or fro $ mvn checkstyle:check -Maven `pom.xml` style is enforced with Spotless using `Apache Maven pom.xml guidelines`_ +Maven ``pom.xml`` style is enforced with Spotless using `Apache Maven pom.xml guidelines`_ You can also just check the style without building the project. This checks the style of all pom.xml files under the current directory or from within an individual module. diff --git a/docs/source/developers/overview.rst b/docs/source/developers/overview.rst index c7bc4273313bc..5a18b1e4eb8db 100644 --- a/docs/source/developers/overview.rst +++ b/docs/source/developers/overview.rst @@ -75,7 +75,7 @@ checklist for using ``git``: locally, for example if additional commits have been made by a colleague. By using ``--force-with-lease`` instead of ``--force``, you ensure those commits are not overwritten and can fetch those changes if desired. - + .. dropdown:: Setting rebase to be default :animate: fade-in-slide-down :class-container: sd-shadow-none @@ -202,4 +202,3 @@ Implementations that do not intend to implement cross endian support: For other libraries, a discussion to gather consensus on the mailing-list should be had before submitting PRs. - diff --git a/docs/source/developers/release.rst b/docs/source/developers/release.rst index e7431ce0fb7b9..d903cc71bd5c4 100644 --- a/docs/source/developers/release.rst +++ b/docs/source/developers/release.rst @@ -80,10 +80,10 @@ Ensure local tags are removed, gpg-agent is set and JIRA tickets are correctly a # Delete the local tag for RC1 or later git tag -d apache-arrow- - + # Setup gpg agent for signing artifacts source dev/release/setup-gpg-agent.sh - + # Curate the release # The end of the generated report shows the JIRA tickets with wrong version number assigned. archery release curate @@ -106,7 +106,7 @@ If there is consensus and there is a Release Manager willing to take the effort the release a patch release can be created. Committers can tag issues that should be included on the next patch release using the -`backport-candidate` label. Is the responsability of the author or the committer to add the +``backport-candidate`` label. It is the responsibility of the author or the committer to add the label to the issue to help the Release Manager identify the issues that should be backported.
If a specific issue is identified as the reason to create a patch release the Release Manager @@ -117,7 +117,7 @@ Be sure to go through on the following checklist: #. Create milestone #. Create maintenance branch #. Include issue that was requested as requiring new patch release -#. Add new milestone to issues with `backport-candidate` label +#. Add new milestone to issues with ``backport-candidate`` label #. cherry-pick issues into maintenance branch Creating a Release Candidate @@ -180,7 +180,7 @@ Create the Release Candidate branch from the updated maintenance branch # Start from the updated maintenance branch. git checkout maint-X.Y.Z - + # The following script will create a branch for the Release Candidate, # place the necessary commits updating the version number and then create a git tag # on OSX use gnu-sed with homebrew: brew install gnu-sed (and export to $PATH) @@ -188,7 +188,7 @@ Create the Release Candidate branch from the updated maintenance branch # starts at 0 and increments every time the Release Candidate is burned # so for the first RC this would be: dev/release/01-prepare.sh 4.0.0 5.0.0 0 dev/release/01-prepare.sh - + # Push the release tag (for RC1 or later the --force flag is required) git push -u apache apache-arrow- # Push the release candidate branch in order to trigger verification jobs later @@ -201,23 +201,23 @@ Build source and binaries and submit them # Build the source release tarball and create Pull Request with verification tasks dev/release/02-source.sh - + # Submit binary tasks using crossbow, the command will output the crossbow build id dev/release/03-binary-submit.sh - + # Wait for the crossbow jobs to finish archery crossbow status - + # Download the produced binaries # This will download packages to a directory called packages/release--rc dev/release/04-binary-download.sh - + # Sign and upload the binaries # # On macOS the only way I could get this to work was running "echo "UPDATESTARTUPTTY" | gpg-connect-agent" before running this comment # otherwise I got errors referencing "ioctl" errors. dev/release/05-binary-upload.sh - + # Sign and upload the Java artifacts # # Note that you need to press the "Close" button manually by Web interface diff --git a/docs/source/developers/release_verification.rst b/docs/source/developers/release_verification.rst index 8c301b44a3c42..afd220db6010d 100644 --- a/docs/source/developers/release_verification.rst +++ b/docs/source/developers/release_verification.rst @@ -55,7 +55,7 @@ and test the result on their own platform in order to cast a +1 vote. 
# this will create and automatically clean up a temporary directory for the verification environment and will run the source verification TEST_DEFAULT=0 TEST_SOURCE=1 verify-release-candidate.sh $VERSION $RC_NUM - + # to verify only certain implementations use the TEST_DEFAULT=0 and TEST_* variables # here are a couple of examples, but see the source code for the available options TEST_DEFAULT=0 TEST_CPP=1 verify-release-candidate.sh $VERSION $RC_NUM # only C++ tests diff --git a/docs/source/developers/reviewing.rst b/docs/source/developers/reviewing.rst index b6e0c1f4023bd..1550d6aa7ce61 100644 --- a/docs/source/developers/reviewing.rst +++ b/docs/source/developers/reviewing.rst @@ -260,14 +260,14 @@ Social aspects Labelling ========= -While reviewing PRs, we should try to identify whether the corresponding issue +While reviewing PRs, we should try to identify whether the corresponding issue needs to be marked with one or both of the following issue labels: * **Critical Fix**: The change fixes either: (a) a security vulnerability; (b) a bug that causes incorrect or invalid data to be produced; or (c) a bug that causes a crash (while the API contract is upheld). This is intended to mark fixes to issues that may affect users without their - knowledge. For this reason, fixing bugs that cause errors don't count, since + knowledge. For this reason, fixing bugs that cause errors don't count, since those bugs are usually obvious. Bugs that cause crashes are considered critical because they are a possible vector of Denial-of-Service attacks. * **Breaking Change**: The change breaks backwards compatibility in a public API. @@ -275,7 +275,7 @@ needs to be marked with one or both of the following issue labels: compatibility, except for the few places where we do guarantee ABI compatibility (such as C Data Interface). Experimental APIs are *not* exempt from this; they are just more likely to be associated with this tag. - + Breaking changes and critical fixes are separate: breaking changes alter the API contract, while critical fixes make the implementation align with the existing API contract. For example, fixing a bug that caused a Parquet reader diff --git a/docs/source/format/CDataInterface/PyCapsuleInterface.rst b/docs/source/format/CDataInterface/PyCapsuleInterface.rst index 03095aa2e9356..67f77f53f012b 100644 --- a/docs/source/format/CDataInterface/PyCapsuleInterface.rst +++ b/docs/source/format/CDataInterface/PyCapsuleInterface.rst @@ -64,7 +64,7 @@ structures should be wrapped in capsules. Capsules avoid invalid access by attaching a name to the pointer and avoid memory leaks by attaching a destructor. Thus, they are much safer than passing pointers as integers. -`PyCapsule`_ allows for a ``name`` to be associated with the capsule, allowing +`PyCapsule`_ allows for a ``name`` to be associated with the capsule, allowing consumers to verify that the capsule contains the expected kind of data. To make sure Arrow structures are recognized, the following names must be used: @@ -133,8 +133,8 @@ Arrays and record batches (contiguous tables) can implement the method Export the object as a pair of ArrowSchema and ArrowArray structures. - :param requested_schema: A PyCapsule containing a C ArrowSchema representation - of a requested schema. Conversion to this schema is best-effort. See + :param requested_schema: A PyCapsule containing a C ArrowSchema representation + of a requested schema. Conversion to this schema is best-effort. See `Schema Requests`_. 
:type requested_schema: PyCapsule or None @@ -152,8 +152,8 @@ Tables / DataFrames and streams can implement the method ``__arrow_c_stream__``. Export the object as an ArrowArrayStream. - :param requested_schema: A PyCapsule containing a C ArrowSchema representation - of a requested schema. Conversion to this schema is best-effort. See + :param requested_schema: A PyCapsule containing a C ArrowSchema representation + of a requested schema. Conversion to this schema is best-effort. See `Schema Requests`_. :type requested_schema: PyCapsule or None @@ -192,7 +192,7 @@ schema transformations. Protocol Typehints ------------------ -The following typehints can be copied into your library to annotate that a +The following typehints can be copied into your library to annotate that a function accepts an object implementing one of these protocols. .. code-block:: python @@ -248,7 +248,7 @@ Below is the code to create a PyCapsule for an ``ArrowSchema``. The code for } free(schema); } - + PyObject* ExportArrowSchemaPyCapsule() { struct ArrowSchema* schema = (struct ArrowSchema*)malloc(sizeof(struct ArrowSchema)); @@ -270,9 +270,9 @@ Below is the code to create a PyCapsule for an ``ArrowSchema``. The code for ) if schema.release != NULL: schema.release(schema) - + free(schema) - + cdef object export_arrow_schema_py_capsule(): cdef ArrowSchema* schema = malloc(sizeof(ArrowSchema)) # It's recommended to immediately wrap the struct in a capsule, so @@ -305,7 +305,7 @@ code for ``ArrowArray`` and ``ArrowArrayStream`` is similar. .. code-block:: c #include - + // If the capsule is not an ArrowSchema, will return NULL and set an exception. struct ArrowSchema* GetArrowSchemaPyCapsule(PyObject* capsule) { return PyCapsule_GetPointer(capsule, "arrow_schema"); @@ -316,7 +316,7 @@ code for ``ArrowArray`` and ``ArrowArrayStream`` is similar. .. code-block:: cython cimport cpython - + cdef ArrowSchema* get_arrow_schema_py_capsule(object capsule) except NULL: return cpython.PyCapsule_GetPointer(capsule, 'arrow_schema') @@ -429,7 +429,7 @@ implementing the DataFrame Interchange Protocol. Comparison to ``__arrow_array__`` protocol ------------------------------------------ -The :ref:`arrow_array_protocol` protocol is a dunder method that +The :ref:`arrow_array_protocol` protocol is a dunder method that defines how PyArrow should import an object as an Arrow array. Unlike this protocol, it is specific to PyArrow and isn't used by other libraries. It is -also limited to arrays and does not support schemas, tabular structures, or streams. \ No newline at end of file +also limited to arrays and does not support schemas, tabular structures, or streams. diff --git a/docs/source/format/CanonicalExtensions.rst b/docs/source/format/CanonicalExtensions.rst index 1f055b7f8edb5..c258f889dc6ac 100644 --- a/docs/source/format/CanonicalExtensions.rst +++ b/docs/source/format/CanonicalExtensions.rst @@ -51,7 +51,7 @@ types: 3) Its serialization *must* be described in the proposal and should not require unduly implementation work or unusual software dependencies - (for example, a trivial custom text format or JSON would be acceptable). + (for example, a trivial custom text format or a JSON-based format would be acceptable). 4) Its expected semantics *should* be described as well and any potential ambiguities or pain points addressed or at least mentioned. @@ -77,7 +77,7 @@ Official List Fixed shape tensor ================== -* Extension name: `arrow.fixed_shape_tensor`. +* Extension name: ``arrow.fixed_shape_tensor``. 
* The storage type of the extension: ``FixedSizeList`` where: @@ -153,7 +153,7 @@ Fixed shape tensor Variable shape tensor ===================== -* Extension name: `arrow.variable_shape_tensor`. +* Extension name: ``arrow.variable_shape_tensor``. * The storage type of the extension is: ``StructArray`` where struct is composed of **data** and **shape** fields describing a single @@ -251,6 +251,38 @@ Variable shape tensor Values inside each **data** tensor element are stored in row-major/C-contiguous order according to the corresponding **shape**. +.. _json_extension: + +JSON +==== + +* Extension name: ``arrow.json``. + +* The storage type of this extension is ``String`` + or ``LargeString`` or ``StringView``. + Only UTF-8 encoded JSON as specified in `rfc8259`_ is supported. + +* Extension type parameters: + + This type does not have any parameters. + +* Description of the serialization: + + Metadata is either an empty string or a JSON string with an empty object. + In the future, additional fields may be added, but they are not required + to interpret the array. + +UUID +==== + +* Extension name: ``arrow.uuid``. + +* The storage type of the extension is ``FixedSizeBinary`` with a length of 16 bytes. + +.. note:: + A specific UUID version is not required or guaranteed. This extension represents + UUIDs as FixedSizeBinary(16) with big-endian notation and does not interpret the bytes in any way. + ========================= Community Extension Types ========================= @@ -268,3 +300,5 @@ GeoArrow Arrow extension types for representing vector geometries. It is well known within the Arrow geospatial subcommunity. The GeoArrow specification is not yet finalized. + +.. _rfc8259: https://datatracker.ietf.org/doc/html/rfc8259 diff --git a/docs/source/format/Columnar.rst b/docs/source/format/Columnar.rst index 0cfece2586294..7c853de7829be 100644 --- a/docs/source/format/Columnar.rst +++ b/docs/source/format/Columnar.rst @@ -312,7 +312,7 @@ Each value in this layout consists of 0 or more bytes. While primitive arrays have a single values buffer, variable-size binary have an **offsets** buffer and **data** buffer. -The offsets buffer contains `length + 1` signed integers (either +The offsets buffer contains ``length + 1`` signed integers (either 32-bit or 64-bit, depending on the logical type), which encode the start position of each slot in the data buffer. The length of the value in each slot is computed using the difference between the offset @@ -374,7 +374,7 @@ locations are indicated using a **views** buffer, which may point to one of potentially several **data** buffers or may contain the characters inline. -The views buffer contains `length` view structures with the following layout: +The views buffer contains ``length`` view structures with the following layout: :: @@ -394,7 +394,7 @@ should be interpreted. In the short string case the string's bytes are inlined — stored inside the view itself, in the twelve bytes which follow the length. Any remaining bytes -after the string itself are padded with `0`. +after the string itself are padded with ``0``. In the long string case, a buffer index indicates which data buffer stores the data bytes and an offset indicates where in that buffer the @@ -1108,6 +1108,8 @@ includes a serialized Flatbuffer type along with an optional message body. We define this message format before describing how to serialize each constituent IPC message type. +.. 
_ipc-message-format: + Encapsulated message format --------------------------- diff --git a/docs/source/format/DissociatedIPC.rst b/docs/source/format/DissociatedIPC.rst new file mode 100644 index 0000000000000..0b0861399cb2f --- /dev/null +++ b/docs/source/format/DissociatedIPC.rst @@ -0,0 +1,403 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +.. _dissociated-ipc: + +======================== +Dissociated IPC Protocol +======================== + +.. warning:: + + Experimental: The Dissociated IPC Protocol is experimental in its current + form. Based on feedback and usage the protocol definition may change until + it is fully standardized. + +Rationale +========= + +The :ref:`Arrow IPC format ` describes a protocol for transferring +Arrow data as a stream of record batches. This protocol expects a continuous +stream of bytes divided into discrete messages (using a length prefix and +continuation indicator). Each discrete message consists of two portions: + +* A `Flatbuffers`_ header message +* A series of bytes consisting of the flattened and packed body buffers (some + message types, like Schema messages, do not have this section) + - This is referred to as the *message body* in the IPC format spec. + +For most cases, the existing IPC format as it currently exists is sufficiently efficient: + +* Receiving data in the IPC format allows zero-copy utilization of the body + buffer bytes, no deserialization is required to form Arrow Arrays +* An IPC file format can be memory-mapped because it is location agnostic + and the bytes of the file are exactly what is expected in memory. + +However, there are use cases that aren't handled by this: + +* Constructing the IPC record batch message requires allocating a contiguous + chunk of bytes and copying all of the data buffers into it, packed together + back-to-back. This pessimizes the common case of wrapping existing, directly + consumable data into an IPC message. +* Even if Arrow data is located in a memory accessible across process boundaries + or transports (such as UCX), there is no standard way to specify that shared + location to consumers which could take advantage of it. +* Arrow data located on a non-CPU device (such as a GPU) cannot be sent using + Arrow IPC without having to copy the data back to the host device or copying + the Flatbuffers metadata bytes into device memory. + + * By the same token, receiving IPC messages into device memory would require + performing a copy of the Flatbuffers metadata back to the host CPU device. This + is due to the fact that the IPC stream interleaves data and metadata across a + single stream. + +This protocol attempts to solve these use cases in an efficient manner. 
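For reference, the encapsulated framing referred to above can be sketched in a few lines of Python. This is purely an editorial illustration, not part of the protocol: the real readers live in the Arrow libraries, ``read_exactly`` is an assumed blocking read helper, and ``read_body_length`` stands in for pulling ``bodyLength`` out of the Flatbuffers ``Message`` header.

.. code-block:: python

    import struct

    CONTINUATION = 0xFFFFFFFF

    def read_encapsulated_message(read_exactly, read_body_length):
        """Read one standard Arrow IPC message from a continuous byte stream."""
        (marker,) = struct.unpack("<I", read_exactly(4))
        assert marker == CONTINUATION
        (metadata_len,) = struct.unpack("<i", read_exactly(4))
        if metadata_len == 0:
            return None                              # end-of-stream marker
        metadata = read_exactly(metadata_len)        # Flatbuffers Message header (8-byte padded)
        body = read_exactly(read_body_length(metadata))  # packed body buffers, if any
        return metadata, body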
+ +Goals +----- + +* Define a generic protocol for passing Arrow IPC data, not tied to any particular + transport, that also allows for utilizing non-CPU device memory, shared memory, and + newer "high performance" transports such as `UCX`_ or `libfabric`_. + + * This allows for the data in the body to be kept on non-CPU devices (like GPUs) + without expensive device-to-host copies. + +* Allow for using :ref:`Flight RPC ` purely for control flow by separating + the stream of IPC metadata from IPC body bytes + +Definitions +----------- + +IPC Metadata + The Flatbuffers message bytes that encompass the header of an Arrow IPC message + +Tag + A little-endian ``uint64`` value used for flow control and used in determining + how to interpret the body of a message. Specific bits can be masked to allow + identifying messages by only a portion of the tag, leaving the rest of the bits + to be used for control flow or other message metadata. Some transports, such as + UCX, have built-in support for such tag values and will provide them in CPU + memory regardless of whether or not the body of the message may reside on a + non-CPU device. + +Sequence Number + A little-endian, 4-byte unsigned integer starting at 0 for a stream, indicating + the sequence order of messages. It is also used to identify specific messages to + tie the IPC metadata header to its corresponding body since the metadata and body + can be sent across separate pipes/streams/transports. + + If a sequence number reaches ``UINT32_MAX``, it should be allowed to roll over as + it is unlikely there would be enough unprocessed messages waiting to be processed + that would cause an overlap of sequence numbers. + + The sequence number serves two purposes: To identify corresponding metadata and + tagged body data messages and to ensure we do not rely on messages having to arrive + in order. A client should use the sequence number to correctly order messages as + they arrive for processing. + +The Protocol +============ + +A reference example implementation utilizing `libcudf`_ and `UCX`_ can be found in the +`arrow-experiments repo `_. + +Requirements +------------ + +A transport implementing this protocol **MUST** provide two pieces of functionality: + +* Message sending + + * Delimited messages (like gRPC) as opposed to non-delimited streams (like plain TCP + without further framing). + + * Alternatively, a framing mechanism like the :ref:`encapsulated message format ` + for the IPC protocol can be used while leaving out the body bytes. + +* Tagged message sending + + * Sending a message that has an attached little-endian, unsigned 64-bit integral tag + for control flow. A tag like this allows control flow to operate on a message whose body + is on a non-CPU device without requiring the message itself to get copied off of the device. + +URI Specification +----------------- + +When providing a URI to a consumer to contact for use with this protocol (such as via +the :ref:`Location URI for Flight `), the URI should specify a scheme +like *ucx:* or *fabric:*, that is easily identifiable. In addition, the URI should +encode the following URI query parameters: + +.. note:: + As this protocol matures, this document will get updated with commonly recognized + transport schemes that get used with it. + +* ``want_data`` - **REQUIRED** - uint64 integer value + + * This value should be used to tag an initial message to the server to initiate a + data transfer. 
The body of the initiating message should be an opaque binary identifier + of the data stream being requested (like the ``Ticket`` in the Flight RPC protocol) + +* ``free_data`` - **OPTIONAL** - uint64 integer value + + * If the server might send messages using offsets / addresses for remote memory accessing + or shared memory locations, the URI should include this parameter. This value is used to + tag messages sent from the client to the data server, containing specific offsets / addresses + which were provided that are no longer required by the client (i.e. any operations that + directly reference those memory locations, such as copying the remote data into local memory, + have been completed). + +* ``remote_handle`` - **OPTIONAL** - base64-encoded string + + * When working with shared memory or remote memory, this value indicates any required + handle or identifier that is necessary for accessing the memory. + + * Using UCX, this would be an *rkey* value + + * With CUDA IPC, this would be the value of the base GPU pointer or memory handle, + and subsequent addresses would be offsets from this base pointer. + +Handling of Backpressure +------------------------ + +*Currently* this proposal does not specify any way to manage the backpressure of +messages to throttle for memory and bandwidth reasons. For now, this will be +**transport-defined** rather than lock into something sub-optimal. + +As usage among different transports and libraries grows, common patterns will emerge +that will allow for a generic, but efficient, way to handle backpressure across +different use cases. + +.. note:: + While the protocol itself is transport agnostic, the current usage and examples + only have been tested using UCX and libfabric transports so far, but that's all. + + +Protocol Description +==================== + +There are two possibilities that can occur: + +1. The streams of metadata and body data are sent across separate connections + +.. mermaid:: ./DissociatedIPC/SequenceDiagramSeparate.mmd + + +2. The streams of metadata and body data are sent simultaneously across the + same connection + +.. mermaid:: ./DissociatedIPC/SequenceDiagramSame.mmd + +Server Sequence +--------------- + +There can be either a single server handling both the IPC Metadata stream and the +Body data streams, or separate servers for handling the IPC Metadata and the body +data. This allows for streaming of data across either a single transport pipe or +two pipes if desired. + +Metadata Stream Sequence +'''''''''''''''''''''''' + +The standing state of the server is waiting for a **tagged** message with a specific +```` tag value to initiate a transfer. This ```` value is defined +by the server and propagated to any clients via the URI they are provided. This protocol +does not prescribe any particular value so that it will not interfere with any other +existing protocols that rely on tag values. The body of that message will contain an +opaque, binary identifier to indicate a particular dataset / data stream to send. + +.. note:: + + For instance, the **ticket** that was passed with a *FlightInfo* message would be + the body of this message. Because it is opaque, it can be anything the server wants + to use. The URI and identifier do not need to be given to the client via Flight RPC, + but could come across from any transport or protocol desired. + +Upon receiving a ```` request, the server *should* respond by sending a stream +of messages consisting of the following: + +.. 
mermaid:: + + block-beta + columns 8 + + block:P["\n\n\n\nPrefix"]:5 + T["Message type\nByte 0"] + S["Sequence number\nBytes 1-4"] + end + H["Flatbuffer bytes\nRest of the message"]:3 + +* A 5-byte prefix + + - The first byte of the message indicates the type of message, currently there are only + two allowed message types (more types may get added in the future): + + 0) End of Stream + 1) Flatbuffers IPC Metadata Message + + - the next 4-bytes are a little-endian, unsigned 32-bit integer indicating the sequence number of + the message. The first message in the stream (**MUST** always be a schema message) **MUST** + have a sequence number of ``0``. Each subsequent message **MUST** increment the number by + ``1``. + +* The full Flatbuffers bytes of an Arrow IPC header + +As defined in the Arrow IPC format, each metadata message can represent a chunk of data or +dictionaries for use by the stream of data. + +After sending the last metadata message, the server **MUST** indicate the end of the stream +by sending a message consisting of **exactly** 5 bytes: + +* The first byte is ``0``, indicating an **End of Stream** message +* The last 4 bytes are the sequence number (4-byte, unsigned integer in little-endian byte order) + +Data Stream Sequence +'''''''''''''''''''' + +If a single server is handling both the data and metadata streams, then the data messages +**should** begin being sent to the client in parallel with the metadata messages. Otherwise, +as with the metadata sequence, the standing state of the server is to wait for a **tagged** +message with the ```` tag value, whose body indicates the dataset / data stream +to send to the client. + +For each IPC message in the stream of data, a **tagged** message **MUST** be sent on the data +stream if that message has a body (i.e. a Record Batch or Dictionary message). The +:term:`tag ` for each message should be structured as follows: + +.. mermaid:: + + block-beta + columns 8 + + S["Sequence number\nBytes 0-3"]:4 + U["Unused (Reserved)\nBytes 4-6"]:3 + T["Message type\nByte 7"]:1 + +* The *least significant* 4-bytes (bits 0 - 31) of the tag should be the unsigned 32-bit, little-endian sequence + number of the message. +* The *most significant* byte (bits 56 - 63) of the tag indicates the message body **type** as an 8-bit + unsigned integer. Currently only two message types are specified, but more can be added as + needed to expand the protocol: + + 0) The body contains the raw body buffer bytes as a packed buffer (i.e. the standard IPC + format body bytes) + 1) The body contains a series of unsigned, little-endian 64-bit integer pairs to represent + either shared or remote memory, schematically structured as + + * The first two integers (e.g. the first 16 bytes) represent the *total* size (in bytes) + of all buffers and the number of buffers in this message (and thus the number of following + pairs of ``uint64``) + + * Each subsequent pair of ``uint64`` values are an address / offset followed the length of + that particular buffer. + +* All unspecified bits (bits 32 - 55) of the tag are *reserved* for future use by potential updates + to this protocol. For now they **MUST** be 0. + +.. note:: + + Any shared/remote memory addresses that are sent across **MUST** be kept alive by the server + until a corresponding tagged ```` message is received. If the client disconnects + before sending any ```` messages, it can be assumed to be safe to clean up the memory + if desired by the server. 
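To make the framing above concrete, here is a minimal, non-normative Python sketch of how the 5-byte metadata prefix and the 64-bit body tag described in this section could be packed and unpacked. The helper names are illustrative only, and the actual transport send/receive calls are assumed to be provided by the underlying library (UCX, libfabric, etc.).

.. code-block:: python

    import struct

    # Metadata stream prefix: 1-byte message type + 4-byte little-endian sequence number.
    END_OF_STREAM = 0
    IPC_METADATA = 1

    def pack_metadata_prefix(msg_type: int, sequence_number: int) -> bytes:
        return struct.pack("<BI", msg_type, sequence_number)

    def unpack_metadata_message(msg: bytes):
        msg_type, sequence_number = struct.unpack_from("<BI", msg)
        return msg_type, sequence_number, msg[5:]  # remaining bytes: Flatbuffers header

    # Body tag: sequence number in bits 0-31, message body type in bits 56-63,
    # bits 32-55 reserved and left as zero.
    def pack_body_tag(sequence_number: int, body_type: int) -> int:
        return (body_type << 56) | (sequence_number & 0xFFFFFFFF)

    # Decode a type-1 body: (total_size, nbuf) followed by nbuf (address, length) pairs.
    def unpack_buffer_descriptors(body: bytes):
        total_size, nbuf = struct.unpack_from("<QQ", body, 0)
        pairs = [struct.unpack_from("<QQ", body, 16 + 16 * i) for i in range(nbuf)]
        return total_size, pairs

A client matching a body message to its metadata header would mask the received tag with ``0x00000000FFFFFFFF`` and compare the result to the expected sequence number, as described in the client sequence below.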
+ +After sending the last tagged IPC body message, the server should maintain the connection and wait +for tagged ```` messages. The structure of these ```` messages is simple: +one or more unsigned, little-endian 64-bit integers which indicate the addresses/offsets that can +be freed. + +Once there are no more outstanding addresses to be freed, the work for this stream is complete. + +Client Sequence +--------------- + +A client for this protocol needs to concurrently handle both the data and metadata streams of +messages which may either both come from the same server or different servers. Below is a flowchart +showing how a client might handle the metadata and data streams: + +.. mermaid:: ./DissociatedIPC/ClientFlowchart.mmd + +#. First the client sends a tagged message using the ```` value it was provided in the + URI as the tag, and the opaque ID as the body. + + * If the metadata and data servers are separate, then a ```` message needs to be sent + separately to each. + * In either scenario, the metadata and data streams can be processed concurrently and/or asynchronously + depending on the nature of the transports. + +#. For each **untagged** message the client receives in the metadata stream: + + * The first byte of the message indicates whether it is an *End of Stream* message (value ``0``) + or a metadata message (value ``1``). + * The next 4 bytes are the sequence number of the message, an unsigned 32-bit integer in + little-endian byte order. + * If it is **not** an *End of Stream* message, the remaining bytes are the IPC Flatbuffer bytes which + can be interpreted as normal. + + * If the message has a body (i.e. Record Batch or Dictionary message) then the client should retrieve + a tagged message from the Data Stream using the same sequence number. + + * If it **is** an *End of Stream* message, then it is safe to close the metadata connection if there are + no gaps in the sequence numbers received. + +#. When a metadata message that requires a body is received, the tag mask of ``0x00000000FFFFFFFF`` **should** + be used alongside the sequence number to match the message regardless of the higher bytes (e.g. we only + care about matching the lower 4 bytes to the sequence number) + + * Once received, the Most Significant Byte's value determines how the client processes the body data: + + * If the most significant byte is 0: Then the body of the message is the raw IPC packed body buffers + allowing it to easily be processed with the corresponding metadata header bytes. + + * If the most significant byte is 1: The body of the message will consist of a series of pairs of + unsigned, 64-bit integers in little-endian byte order. + + * The first two integers represent *1)* the total size of all the body buffers together to allow + for easy allocation if an intermediate buffer is needed and *2)* the number of buffers being sent (``nbuf``). + + * The rest of the message will be ``nbuf`` pairs of integers, one for each buffer. Each pair is + *1)* the address / offset of the buffer and *2)* the length of that buffer. Memory can then be retrieved + via shared or remote memory routines based on the underlying transport. These addresses / offsets **MUST** + be retained so they can be sent back in ```` messages later, indicating to the server that + the client no longer needs the shared memory. + +#. Once an *End of Stream* message is received, the client should process any remaining unprocessed + IPC metadata messages. + +#. 
After individual memory addresses / offsets are able to be freed by the remote server (in the case where + it has sent these rather than the full body bytes), the client should send corresponding ```` messages + to the server. + + * A single ```` message consists of an arbitrary number of unsigned 64-bit integer values, representing + the addresses / offsets which can be freed. The reason for it being an *arbitrary number* is to allow a client + to choose whether to send multiple messages to free multiple addresses or to coalesce multiple addresses into + fewer messages to be freed (thus making the protocol less "chatty" if desired) + +Continuing Development +====================== + +If you decide to try this protocol in your own environments and system, we'd love feedback and to learn about +your use case. As this is an **experimental** protocol currently, we need real-world usage in order to facilitate +improving it and finding the right generalizations to standardize on across transports. + +Please chime in using the Arrow Developers Mailing list: https://arrow.apache.org/community/#mailing-lists + +.. _Flatbuffers: http://github.com/google/flatbuffers +.. _UCX: https://openucx.org/ +.. _libfabric: https://ofiwg.github.io/libfabric/ +.. _libcudf: https://docs.rapids.ai/api diff --git a/docs/source/format/DissociatedIPC/ClientFlowchart.mmd b/docs/source/format/DissociatedIPC/ClientFlowchart.mmd new file mode 100644 index 0000000000000..652cabc1c7425 --- /dev/null +++ b/docs/source/format/DissociatedIPC/ClientFlowchart.mmd @@ -0,0 +1,37 @@ +%% Licensed to the Apache Software Foundation (ASF) under one +%% or more contributor license agreements. See the NOTICE file +%% distributed with this work for additional information +%% regarding copyright ownership. The ASF licenses this file +%% to you under the Apache License, Version 2.0 (the +%% "License"); you may not use this file except in compliance +%% with the License. You may obtain a copy of the License at + +%% http://www.apache.org/licenses/LICENSE-2.0 + +%% Unless required by applicable law or agreed to in writing, +%% software distributed under the License is distributed on an +%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +%% KIND, either express or implied. See the License for the +%% specific language governing permissions and limitations +%% under the License. + +graph LR +client((Client))-->c1{{Send #60;want_data#gt; Msg}} +subgraph meta [Meta Message] + direction LR + m1[/Msg Type #40;byte 0#41;
Seq Num #40;bytes 1-4#41;/]-- type 1 -->m2[[Process IPC Header]] + m2-- IPC has body -->m3[Get Corresponding
Tagged Msg] + m2-- Schema Msg -->m4[/Store Schema/] + m1-- type 0 -->e[Indicate End of Stream] +end +subgraph data [Data Stream] + direction LR + d1[Request Msg
for Seq Num]-->d2{Most Significant
Byte} + d2-- 0 -->d3[Construct from
Metadata and Body] + d2-- 1 -->d4[Get shared/remote
buffers] + d4 -->d5[Construct from
Metadata and buffers] + d3 & d5 -->e2[Output Batch] +end + +client -- recv untagged msg --> meta +client -- get tagged msg --> data diff --git a/docs/source/format/DissociatedIPC/SequenceDiagramSame.mmd b/docs/source/format/DissociatedIPC/SequenceDiagramSame.mmd new file mode 100644 index 0000000000000..adf26bdc32767 --- /dev/null +++ b/docs/source/format/DissociatedIPC/SequenceDiagramSame.mmd @@ -0,0 +1,43 @@ +%% Licensed to the Apache Software Foundation (ASF) under one +%% or more contributor license agreements. See the NOTICE file +%% distributed with this work for additional information +%% regarding copyright ownership. The ASF licenses this file +%% to you under the Apache License, Version 2.0 (the +%% "License"); you may not use this file except in compliance +%% with the License. You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, +%% software distributed under the License is distributed on an +%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +%% KIND, either express or implied. See the License for the +%% specific language governing permissions and limitations +%% under the License. + +sequenceDiagram + participant C as Client + participant S as Server + activate C + C-->>+S: TaggedMessage(server.want_data, bytes=ID_of_desired_data) + S-->>C: Message(bytes([1]) + le_bytes(sequence_number) + schema_metadata) + par + loop each chunk + S-->>C: Message(bytes([1]) + le_bytes(sequence_number) + batch_metadata) + end + S-->>C: Message(bytes([0]) + le_bytes(sequence_number)) + and + loop each chunk + alt + S-->>C: TaggedMessage((bytes[0] << 55) | le_bytes(sequence_number),
bytes=batch_data) + else + S-->>C: TaggedMessage((bytes[1] << 55) | le_bytes(sequence_number),
bytes=uint64_pairs) + end + end + end + + loop + C-->>S: TaggedMessage(server.free_data, bytes=uint64_list) + end + deactivate S + deactivate C diff --git a/docs/source/format/DissociatedIPC/SequenceDiagramSeparate.mmd b/docs/source/format/DissociatedIPC/SequenceDiagramSeparate.mmd new file mode 100644 index 0000000000000..11d2d9d6387eb --- /dev/null +++ b/docs/source/format/DissociatedIPC/SequenceDiagramSeparate.mmd @@ -0,0 +1,44 @@ +%% Licensed to the Apache Software Foundation (ASF) under one +%% or more contributor license agreements. See the NOTICE file +%% distributed with this work for additional information +%% regarding copyright ownership. The ASF licenses this file +%% to you under the Apache License, Version 2.0 (the +%% "License"); you may not use this file except in compliance +%% with the License. You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, +%% software distributed under the License is distributed on an +%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +%% KIND, either express or implied. See the License for the +%% specific language governing permissions and limitations +%% under the License. + +sequenceDiagram + participant D as Data Stream + participant C as Client + participant M as Metadata Stream + + activate C + C-->>+M: TaggedMessage(server.want_data, bytes=ID_of_desired_data) + C-->>+D: TaggedMessage(server.want_data, bytes=ID_of_desired_data) + M-->>C: Message(bytes([1]) + le_bytes(sequence_number) + schema_metadata) + loop each batch + par + M-->>C: Message(bytes([1]) + le_bytes(sequence_number) + batch_metadata) + and + alt + D-->>C: TaggedMessage((bytes[0] << 55) | le_bytes(sequence_number),
bytes=batch_data) + else + D-->>C: TaggedMessage((bytes[1] << 55) | le_bytes(sequence_number),
bytes=uint64_pairs) + end + end + end + M-->>C: Message(bytes([0]) + le_bytes(sequence_number)) + deactivate M + loop + C-->>D: TaggedMessage(server.free_data, bytes=uint64_list) + end + deactivate D + deactivate C diff --git a/docs/source/format/Flight.rst b/docs/source/format/Flight.rst index 7ee84952b4350..2c5487d857ea4 100644 --- a/docs/source/format/Flight.rst +++ b/docs/source/format/Flight.rst @@ -68,9 +68,8 @@ Downloading Data A client that wishes to download the data would: -.. figure:: ./Flight/DoGet.mmd.svg - - Retrieving data via ``DoGet``. +.. mermaid:: ./Flight/DoGet.mmd + :caption: Retrieving data via ``DoGet``. #. Construct or acquire a ``FlightDescriptor`` for the data set they are interested in. @@ -168,9 +167,8 @@ data. However, ``GetFlightInfo`` doesn't return until the query completes, so the client is blocked. In this situation, the client can use ``PollFlightInfo`` instead of ``GetFlightInfo``: -.. figure:: ./Flight/PollFlightInfo.mmd.svg - - Polling a long-running query by ``PollFlightInfo``. +.. mermaid:: ./Flight/PollFlightInfo.mmd + :caption: Polling a long-running query by ``PollFlightInfo``. #. Construct or acquire a ``FlightDescriptor``, as before. #. Call ``PollFlightInfo(FlightDescriptor)`` to get a ``PollInfo`` @@ -229,9 +227,8 @@ Uploading Data To upload data, a client would: -.. figure:: ./Flight/DoPut.mmd.svg - - Uploading data via ``DoPut``. +.. mermaid:: ./Flight/DoPut.mmd + :caption: Uploading data via ``DoPut``. #. Construct or acquire a ``FlightDescriptor``, as before. #. Call ``DoPut(FlightData)`` and upload a stream of Arrow record @@ -257,9 +254,8 @@ require being stateful if implemented using ``DoGet`` and ``DoPut``. Instead, ``DoExchange`` allows this to be implemented as a single call. A client would: -.. figure:: ./Flight/DoExchange.mmd.svg - - Complex data flow with ``DoExchange``. +.. mermaid:: ./Flight/DoExchange.mmd + :caption: Complex data flow with ``DoExchange``. #. Construct or acquire a ``FlightDescriptor``, as before. #. Call ``DoExchange(FlightData)``. @@ -314,6 +310,8 @@ well, in which case any `authentication method supported by gRPC .. _Mutual TLS (mTLS): https://grpc.io/docs/guides/auth/#supported-auth-mechanisms +.. _flight-location-uris: + Location URIs ============= diff --git a/docs/source/format/Flight/DoExchange.mmd b/docs/source/format/Flight/DoExchange.mmd index 14f1789aeaaa7..f7586bf35eb4f 100644 --- a/docs/source/format/Flight/DoExchange.mmd +++ b/docs/source/format/Flight/DoExchange.mmd @@ -15,9 +15,6 @@ %% specific language governing permissions and limitations %% under the License. -%% To generate the diagram, use mermaid-cli -%% Example: docker run --rm -v $(pwd)/FlightSql:/data minlag/mermaid-cli -i /data/CommandGetTables.mmd - sequenceDiagram autonumber diff --git a/docs/source/format/Flight/DoExchange.mmd.svg b/docs/source/format/Flight/DoExchange.mmd.svg deleted file mode 100644 index 204d63d77218d..0000000000000 --- a/docs/source/format/Flight/DoExchange.mmd.svg +++ /dev/null @@ -1 +0,0 @@ -ClientServerThe first FlightData includes a FlightDescriptorDoExchange(FlightData)1stream of FlightData2stream of FlightData3par[[Client sends data]][[Server sends data]]ClientServer \ No newline at end of file diff --git a/docs/source/format/Flight/DoGet.mmd b/docs/source/format/Flight/DoGet.mmd index c2e3cd034448c..cac59afb8219f 100644 --- a/docs/source/format/Flight/DoGet.mmd +++ b/docs/source/format/Flight/DoGet.mmd @@ -15,9 +15,6 @@ %% specific language governing permissions and limitations %% under the License. 
-%% To generate the diagram, use mermaid-cli -%% Example: docker run --rm -v $(pwd)/FlightSql:/data minlag/mermaid-cli -i /data/CommandGetTables.mmd - sequenceDiagram autonumber diff --git a/docs/source/format/Flight/DoGet.mmd.svg b/docs/source/format/Flight/DoGet.mmd.svg deleted file mode 100644 index 48a50d77ed33f..0000000000000 --- a/docs/source/format/Flight/DoGet.mmd.svg +++ /dev/null @@ -1 +0,0 @@ -ClientMetadata ServerData ServerGetFlightInfo(FlightDescriptor)1FlightInfo{endpoints: [FlightEndpoint{ticket: Ticket}, …]}2This may be parallelizedDoGet(Ticket)3stream of FlightData4loop[for each endpoint in FlightInfo.endpoints]ClientMetadata ServerData Server \ No newline at end of file diff --git a/docs/source/format/Flight/DoPut.mmd b/docs/source/format/Flight/DoPut.mmd index 5845edef1f466..876505da2d300 100644 --- a/docs/source/format/Flight/DoPut.mmd +++ b/docs/source/format/Flight/DoPut.mmd @@ -15,9 +15,6 @@ %% specific language governing permissions and limitations %% under the License. -%% To generate the diagram, use mermaid-cli -%% Example: docker run --rm -v $(pwd)/FlightSql:/data minlag/mermaid-cli -i /data/CommandGetTables.mmd - sequenceDiagram autonumber diff --git a/docs/source/format/Flight/DoPut.mmd.svg b/docs/source/format/Flight/DoPut.mmd.svg deleted file mode 100644 index 9e490e152bdb3..0000000000000 --- a/docs/source/format/Flight/DoPut.mmd.svg +++ /dev/null @@ -1 +0,0 @@ -ClientServerThe first FlightData includes a FlightDescriptorDoPut(FlightData)1stream of FlightData2PutResult{app_metadata}3ClientServer \ No newline at end of file diff --git a/docs/source/format/Flight/PollFlightInfo.mmd b/docs/source/format/Flight/PollFlightInfo.mmd index d062a3a216958..f91c077b655c0 100644 --- a/docs/source/format/Flight/PollFlightInfo.mmd +++ b/docs/source/format/Flight/PollFlightInfo.mmd @@ -15,9 +15,6 @@ %% specific language governing permissions and limitations %% under the License. -%% To generate the diagram, use mermaid-cli -%% Example: docker run --rm -v $(pwd):/data minlag/mermaid-cli -i /data/PollFlightInfo.mmd - sequenceDiagram autonumber diff --git a/docs/source/format/Flight/PollFlightInfo.mmd.svg b/docs/source/format/Flight/PollFlightInfo.mmd.svg deleted file mode 100644 index 1890361f88ce4..0000000000000 --- a/docs/source/format/Flight/PollFlightInfo.mmd.svg +++ /dev/null @@ -1 +0,0 @@ -ClientMetadata ServerData ServerThis may be parallelizedSome endpoints may be processed while pollingloop[for each endpoint in FlightInfo.endpoints]PollFlightInfo(FlightDescriptor)1PollInfo{descriptor: FlightDescriptor', ...}2PollFlightInfo(FlightDescriptor')3PollInfo{descriptor: FlightDescriptor'', ...}4PollFlightInfo(FlightDescriptor'')5PollInfo{descriptor: null, info: FlightInfo{endpoints: [FlightEndpoint{ticket: Ticket}, …]}6DoGet(Ticket)7stream of FlightData8ClientMetadata ServerData Server \ No newline at end of file diff --git a/docs/source/format/FlightSql.rst b/docs/source/format/FlightSql.rst index 1a43e4bdff306..b4b85e77a2e5f 100644 --- a/docs/source/format/FlightSql.rst +++ b/docs/source/format/FlightSql.rst @@ -32,9 +32,6 @@ with any database that supports the necessary endpoints. Flight SQL clients wrap the underlying Flight client to provide methods for the new RPC methods described here. -.. warning:: Flight SQL is **experimental** and changes to the - protocol may still be made. - RPC Methods =========== @@ -196,7 +193,7 @@ in the ``app_metadata`` field of the Flight RPC ``PutResult`` returned. 
When used with DoPut: load the stream of Arrow record batches into the specified target table and return the number of rows ingested - via a `DoPutUpdateResult` message. + via a ``DoPutUpdateResult`` message. Flight Server Session Management -------------------------------- @@ -242,21 +239,17 @@ Close and invalidate the current session context. Sequence Diagrams ================= -.. figure:: ./FlightSql/CommandGetTables.mmd.svg - - Listing available tables. - -.. figure:: ./FlightSql/CommandStatementQuery.mmd.svg - - Executing an ad-hoc query. - -.. figure:: ./FlightSql/CommandPreparedStatementQuery.mmd.svg +.. mermaid:: ./FlightSql/CommandGetTables.mmd + :caption: Listing available tables. - Creating a prepared statement, then executing it. +.. mermaid:: ./FlightSql/CommandStatementQuery.mmd + :caption: Executing an ad-hoc query. -.. figure:: ./FlightSql/CommandStatementIngest.mmd.svg +.. mermaid:: ./FlightSql/CommandPreparedStatementQuery.mmd + :caption: Creating a prepared statement, then executing it. - Executing a bulk ingestion. +.. mermaid:: ./FlightSql/CommandStatementIngest.mmd + :caption: Executing a bulk ingestion. External Resources ================== diff --git a/docs/source/format/FlightSql/CommandGetTables.mmd b/docs/source/format/FlightSql/CommandGetTables.mmd index f151411647f23..e6b18ed7dc08b 100644 --- a/docs/source/format/FlightSql/CommandGetTables.mmd +++ b/docs/source/format/FlightSql/CommandGetTables.mmd @@ -15,9 +15,6 @@ %% specific language governing permissions and limitations %% under the License. -%% To generate the diagram, use mermaid-cli -%% Example: docker run --rm -v $(pwd)/FlightSql:/data minlag/mermaid-cli -i /data/CommandGetTables.mmd - sequenceDiagram autonumber diff --git a/docs/source/format/FlightSql/CommandGetTables.mmd.svg b/docs/source/format/FlightSql/CommandGetTables.mmd.svg deleted file mode 100644 index 4e71c01982289..0000000000000 --- a/docs/source/format/FlightSql/CommandGetTables.mmd.svg +++ /dev/null @@ -1 +0,0 @@ -ClientServerGetFlightInfo(CommandGetTables)1FlightInfo{…Ticket…}2DoGet(Ticket)3stream of FlightData4ClientServer \ No newline at end of file diff --git a/docs/source/format/FlightSql/CommandPreparedStatementQuery.mmd b/docs/source/format/FlightSql/CommandPreparedStatementQuery.mmd index cbd1eb6014bca..ce18b91eaa33e 100644 --- a/docs/source/format/FlightSql/CommandPreparedStatementQuery.mmd +++ b/docs/source/format/FlightSql/CommandPreparedStatementQuery.mmd @@ -15,9 +15,6 @@ %% specific language governing permissions and limitations %% under the License. 
-%% To generate the diagram, use mermaid-cli -%% Example: docker run --rm -v $(pwd)/FlightSql:/data minlag/mermaid-cli -i /data/CommandPreparedStatementQuery.mmd - sequenceDiagram autonumber diff --git a/docs/source/format/FlightSql/CommandPreparedStatementQuery.mmd.svg b/docs/source/format/FlightSql/CommandPreparedStatementQuery.mmd.svg deleted file mode 100644 index cbf6a78e9a5ce..0000000000000 --- a/docs/source/format/FlightSql/CommandPreparedStatementQuery.mmd.svg +++ /dev/null @@ -1 +0,0 @@ -ServerClientServerClientoptional response with updated handleloop[for each endpoint in FlightInfo.endpoints]loop[for each invocation of the prepared statement]DoAction(ActionCreatePreparedStatementRequest)1ActionCreatePreparedStatementResult{handle}2DoPut(CommandPreparedStatementQuery)3stream of FlightData4DoPutPreparedStatementResult{handle}5GetFlightInfo(CommandPreparedStatementQuery)6FlightInfo{endpoints: [FlightEndpoint{…}, …]}7DoGet(endpoint.ticket)8stream of FlightData9DoAction(ActionClosePreparedStatementRequest)10ActionClosePreparedStatementRequest{}11 \ No newline at end of file diff --git a/docs/source/format/FlightSql/CommandStatementIngest.mmd b/docs/source/format/FlightSql/CommandStatementIngest.mmd index 781289d77b41a..0578f465d4dda 100644 --- a/docs/source/format/FlightSql/CommandStatementIngest.mmd +++ b/docs/source/format/FlightSql/CommandStatementIngest.mmd @@ -15,9 +15,6 @@ %% specific language governing permissions and limitations %% under the License. -%% To generate the diagram, use mermaid-cli -%% Example: docker run --rm -v $(pwd)/FlightSql:/data minlag/mermaid-cli -i /data/CommandGetTables.mmd - sequenceDiagram autonumber diff --git a/docs/source/format/FlightSql/CommandStatementIngest.mmd.svg b/docs/source/format/FlightSql/CommandStatementIngest.mmd.svg deleted file mode 100644 index e2aa72459afa5..0000000000000 --- a/docs/source/format/FlightSql/CommandStatementIngest.mmd.svg +++ /dev/null @@ -1 +0,0 @@ -ServerClientServerClientDoPut(CommandStatementIngest)1stream of FlightData2PutResult{DoPutUpdateResult{RecordCount: int64}}3 \ No newline at end of file diff --git a/docs/source/format/FlightSql/CommandStatementQuery.mmd b/docs/source/format/FlightSql/CommandStatementQuery.mmd index 7b67fecfb75c6..f26aa2f951fcf 100644 --- a/docs/source/format/FlightSql/CommandStatementQuery.mmd +++ b/docs/source/format/FlightSql/CommandStatementQuery.mmd @@ -15,9 +15,6 @@ %% specific language governing permissions and limitations %% under the License. -%% To generate the diagram, use mermaid-cli -%% Example: docker run --rm -v $(pwd)/FlightSql:/data minlag/mermaid-cli -i /data/CommandStatementQuery.mmd - sequenceDiagram autonumber diff --git a/docs/source/format/FlightSql/CommandStatementQuery.mmd.svg b/docs/source/format/FlightSql/CommandStatementQuery.mmd.svg deleted file mode 100644 index f5e8c79f137ff..0000000000000 --- a/docs/source/format/FlightSql/CommandStatementQuery.mmd.svg +++ /dev/null @@ -1 +0,0 @@ -ClientServerGetFlightInfo(CommandStatementQuery)1FlightInfo{endpoints: [FlightEndpoint{…}, …]}2DoGet(endpoint.ticket)3stream of FlightData4loop[for each endpoint in FlightInfo.endpoints]ClientServer \ No newline at end of file diff --git a/docs/source/format/Glossary.rst b/docs/source/format/Glossary.rst index 3f2f118a95d6d..11c19c5fa70e9 100644 --- a/docs/source/format/Glossary.rst +++ b/docs/source/format/Glossary.rst @@ -211,7 +211,7 @@ Glossary its bindings, and Go). .. 
image:: ../cpp/tables-versus-record-batches.svg - :alt: A graphical representation of an Arrow Table and a + :alt: A graphical representation of an Arrow Table and a Record Batch, with structure as described in text above. .. seealso:: :term:`chunked array`, :term:`record batch` diff --git a/docs/source/format/Integration.rst b/docs/source/format/Integration.rst index 1a9b1b97f07ee..436747989acf3 100644 --- a/docs/source/format/Integration.rst +++ b/docs/source/format/Integration.rst @@ -501,14 +501,14 @@ integration testing actually tests. There are two types of integration test cases: the ones populated on the fly by the data generator in the Archery utility, and *gold* files that exist -in the `arrow-testing ` +in the `arrow-testing `_ repository. Data Generator Tests ~~~~~~~~~~~~~~~~~~~~ This is the high-level description of the cases which are generated and -tested using the ``archery integration`` command (see ``get_generated_json_files`` +tested using the ``archery integration`` command (see ``get_generated_json_files`` in ``datagen.py``): * Primitive Types @@ -549,7 +549,7 @@ Gold File Integration Tests Pre-generated json and arrow IPC files (both file and stream format) exist in the `arrow-testing `__ repository in the ``data/arrow-ipc-stream/integration`` directory. These serve as -*gold* files that are assumed to be correct for use in testing. They are +*gold* files that are assumed to be correct for use in testing. They are referenced by ``runner.py`` in the code for the :ref:`Archery ` utility. Below are the test cases which are covered by them: @@ -563,7 +563,7 @@ utility. Below are the test cases which are covered by them: + intervals + maps + nested types (list, struct) - + primitives + + primitives + primitive with no batches + primitive with zero length batches diff --git a/docs/source/format/index.rst b/docs/source/format/index.rst index 856830d863243..44ea3e8e7e608 100644 --- a/docs/source/format/index.rst +++ b/docs/source/format/index.rst @@ -30,6 +30,7 @@ Specifications CDataInterface CStreamInterface CDeviceDataInterface + DissociatedIPC Flight FlightSql ADBC diff --git a/docs/source/java/algorithm.rst b/docs/source/java/algorithm.rst index 316fd38fa0990..d4838967d614f 100644 --- a/docs/source/java/algorithm.rst +++ b/docs/source/java/algorithm.rst @@ -20,12 +20,12 @@ Java Algorithms Arrow's Java library provides algorithms for some commonly-used functionalities. The algorithms are provided in the ``org.apache.arrow.algorithm`` -package of the ``algorithm`` module. +package of the ``algorithm`` module. Comparing Vector Elements ------------------------- -Comparing vector elements is the basic for many algorithms. Vector +Comparing vector elements is the basic for many algorithms. Vector elements can be compared in one of the two ways: 1. **Equality comparison**: there are two possible results for this type of comparisons: ``equal`` and ``unequal``. @@ -36,30 +36,30 @@ interface. and ``greater than``. This comparison is supported by the abstract class ``org.apache.arrow.algorithm.sort.VectorValueComparator``. We provide default implementations to compare vector elements. However, users can also define ways -for customized comparisons. +for customized comparisons. Vector Element Search --------------------- -A search algorithm tries to find a particular value in a vector. When successful, a vector index is +A search algorithm tries to find a particular value in a vector. When successful, a vector index is returned; otherwise, a ``-1`` is returned. 
The following search algorithms are provided: -1. **Linear search**: this algorithm simply traverses the vector from the beginning, until a match is +1. **Linear search**: this algorithm simply traverses the vector from the beginning, until a match is found, or the end of the vector is reached. So it takes ``O(n)`` time, where ``n`` is the number of elements in the vector. This algorithm is implemented in ``org.apache.arrow.algorithm.search.VectorSearcher#linearSearch``. -2. **Binary search**: this represents a more efficient search algorithm, as it runs in ``O(log(n))`` time. +2. **Binary search**: this represents a more efficient search algorithm, as it runs in ``O(log(n))`` time. However, it is only applicable to sorted vectors. To get a sorted vector, one can use one of our sorting algorithms, which will be discussed in the next section. This algorithm is implemented in ``org.apache.arrow.algorithm.search.VectorSearcher#binarySearch``. 3. **Parallel search**: when the vector is large, it takes a long time to traverse the elements to search -for a value. To make this process faster, one can split the vector into multiple partitions, and perform the +for a value. To make this process faster, one can split the vector into multiple partitions, and perform the search for each partition in parallel. This is supported by ``org.apache.arrow.algorithm.search.ParallelSearcher``. -4. **Range search**: for many scenarios, there can be multiple matching values in the vector. +4. **Range search**: for many scenarios, there can be multiple matching values in the vector. If the vector is sorted, the matching values reside in a contiguous region in the vector. The -range search algorithm tries to find the upper/lower bound of the region in ``O(log(n))`` time. +range search algorithm tries to find the upper/lower bound of the region in ``O(log(n))`` time. An implementation is provided in ``org.apache.arrow.algorithm.search.VectorRangeSearcher``. Vector Sorting @@ -72,19 +72,19 @@ classified into the following categories: 1. **In-place sorter**: an in-place sorter performs the sorting by manipulating the original vector, without creating any new vector. So it just returns the original vector after the sorting operations. Currently, we have ``org.apache.arrow.algorithm.sort.FixedWidthInPlaceVectorSorter`` for in-place -sorting in ``O(nlog(n))`` time. As the name suggests, it only supports fixed width vectors. +sorting in ``O(nlog(n))`` time. As the name suggests, it only supports fixed width vectors. 2. **Out-of-place sorter**: an out-of-place sorter does not mutate the original vector. Instead, it copies vector elements to a new vector in sorted order, and returns the new vector. -We have ``org.apache.arrow.algorithm.sort.FixedWidthInPlaceVectorSorter.FixedWidthOutOfPlaceVectorSorter`` +We have ``org.apache.arrow.algorithm.sort.FixedWidthInPlaceVectorSorter.FixedWidthOutOfPlaceVectorSorter`` and ``org.apache.arrow.algorithm.sort.FixedWidthInPlaceVectorSorter.VariableWidthOutOfPlaceVectorSorter`` -for fixed width and variable width vectors, respectively. Both algorithms run in ``O(nlog(n))`` time. +for fixed width and variable width vectors, respectively. Both algorithms run in ``O(nlog(n))`` time. 3. **Index sorter**: this sorter does not actually sort the vector. Instead, it returns an integer vector, which correspond to indices of vector elements in sorted order. With the index vector, one can -easily construct a sorted vector. 
In addition, some other tasks can be easily achieved, like finding the ``k``th -smallest value in the vector. Index sorting is supported by ``org.apache.arrow.algorithm.sort.IndexSorter``, -which runs in ``O(nlog(n))`` time. It is applicable to vectors of any type. +easily construct a sorted vector. In addition, some other tasks can be easily achieved, like finding the ``k`` th +smallest value in the vector. Index sorting is supported by ``org.apache.arrow.algorithm.sort.IndexSorter``, +which runs in ``O(nlog(n))`` time. It is applicable to vectors of any type. Other Algorithms ---------------- diff --git a/docs/source/java/flight.rst b/docs/source/java/flight.rst index e009998be4f4e..6d26583aeefa6 100644 --- a/docs/source/java/flight.rst +++ b/docs/source/java/flight.rst @@ -184,7 +184,7 @@ Handshake-based authentication can be enabled by implementing ``ServerAuthHandler``. Authentication consists of two parts: on initial client connection, the server and client authentication implementations can perform any negotiation needed. The client authentication -handler then provides a token that will be attached to future calls. +handler then provides a token that will be attached to future calls. The client send data to be validated through ``ClientAuthHandler.authenticate`` The server validate data received through ``ServerAuthHandler.authenticate``. diff --git a/docs/source/java/flight_sql_jdbc_driver.rst b/docs/source/java/flight_sql_jdbc_driver.rst index 0ace2185983a9..f95c2ac755d97 100644 --- a/docs/source/java/flight_sql_jdbc_driver.rst +++ b/docs/source/java/flight_sql_jdbc_driver.rst @@ -162,15 +162,15 @@ the Flight SQL service as gRPC headers. For example, the following URI :: This will connect without authentication or encryption, to a Flight SQL service running on ``localhost`` on port 12345. Each request will -also include a `database=mydb` gRPC header. +also include a ``database=mydb`` gRPC header. Connection parameters may also be supplied using the Properties object when using the JDBC Driver Manager to connect. When supplying using the Properties object, values should *not* be URI-encoded. Parameters specified by the URI supercede parameters supplied by the -Properties object. When calling the `user/password overload of -DriverManager#getConnection() +Properties object. When calling the `user/password overload of +DriverManager#getConnection() `_, the username and password supplied on the URI supercede the username and password arguments to the function call. diff --git a/docs/source/java/install.rst b/docs/source/java/install.rst index a551edc36c477..dc6a55c87fcd6 100644 --- a/docs/source/java/install.rst +++ b/docs/source/java/install.rst @@ -63,7 +63,7 @@ Modifying the command above for Flight: Otherwise, you may see errors like ``java.lang.IllegalAccessError: superclass access check failed: class org.apache.arrow.flight.ArrowMessage$ArrowBufRetainingCompositeByteBuf (in module org.apache.arrow.flight.core) cannot access class io.netty.buffer.CompositeByteBuf (in unnamed module ...) because module -org.apache.arrow.flight.core does not read unnamed module ... +org.apache.arrow.flight.core does not read unnamed module ...`` Finally, if you are using arrow-dataset, you'll also need to report that JDK internals need to be exposed. 
Modifying the command above for arrow-memory: diff --git a/docs/source/java/ipc.rst b/docs/source/java/ipc.rst index 01341ff2cc391..f5939179177d5 100644 --- a/docs/source/java/ipc.rst +++ b/docs/source/java/ipc.rst @@ -81,7 +81,7 @@ Here we used an in-memory stream, but this could have been a socket or some othe writer.end(); Note that, since the :class:`VectorSchemaRoot` in the writer is a container that can hold batches, batches flow through -:class:`VectorSchemaRoot` as part of a pipeline, so we need to populate data before `writeBatch`, so that later batches +:class:`VectorSchemaRoot` as part of a pipeline, so we need to populate data before ``writeBatch``, so that later batches could overwrite previous ones. Now the :class:`ByteArrayOutputStream` contains the complete stream which contains 5 record batches. diff --git a/docs/source/java/memory.rst b/docs/source/java/memory.rst index 036befa148692..8014a27444ac9 100644 --- a/docs/source/java/memory.rst +++ b/docs/source/java/memory.rst @@ -20,7 +20,7 @@ Memory Management ================= The memory modules contain all the functionality that Arrow uses to allocate and deallocate memory. This document is divided in two parts: -The first part, *Memory Basics*, provides a high-level introduction. The following section, *Arrow Memory In-Depth*, fills in the details. +The first part, *Memory Basics*, provides a high-level introduction. The following section, *Arrow Memory In-Depth*, fills in the details. .. contents:: @@ -39,7 +39,7 @@ Getting Started Arrow's memory management is built around the needs of the columnar format and using off-heap memory. Arrow Java has its own independent implementation. It does not wrap the C++ implementation, although the framework is flexible enough -to be used with memory allocated in C++ that is used by Java code. +to be used with memory allocated in C++ that is used by Java code. Arrow provides multiple modules: the core interfaces, and implementations of the interfaces. Users need the core interfaces, and exactly one of the implementations. @@ -67,9 +67,9 @@ Why Arrow Uses Direct Memory BufferAllocator --------------- -The `BufferAllocator`_ is primarily an arena or nursery used for accounting of buffers (ArrowBuf instances). -As the name suggests, it can allocate new buffers associated with itself, but it can also -handle the accounting for buffers allocated elsewhere. For example, it handles the Java-side accounting for +The `BufferAllocator`_ is primarily an arena or nursery used for accounting of buffers (ArrowBuf instances). +As the name suggests, it can allocate new buffers associated with itself, but it can also +handle the accounting for buffers allocated elsewhere. For example, it handles the Java-side accounting for memory allocated in C++ and shared with Java using the C-Data Interface. In the code below it performs an allocation: .. code-block:: Java @@ -100,21 +100,21 @@ memory from a child allocator, those allocations are also reflected in all paren effectively sets the program-wide memory limit, and serves as the master bookkeeper for all memory allocations. Child allocators are not strictly required, but can help better organize code. For instance, a lower memory limit can -be set for a particular section of code. The child allocator can be closed when that section completes, -at which point it checks that that section didn't leak any memory. +be set for a particular section of code. 
The child allocator can be closed when that section completes, +at which point it checks that that section didn't leak any memory. Child allocators can also be named, which makes it easier to tell where an ArrowBuf came from during debugging. Reference counting ------------------ -Because direct memory is expensive to allocate and deallocate, allocators may share direct buffers. To managed shared buffers -deterministically, we use manual reference counting instead of the garbage collector. +Because direct memory is expensive to allocate and deallocate, allocators may share direct buffers. To managed shared buffers +deterministically, we use manual reference counting instead of the garbage collector. This simply means that each buffer has a counter keeping track of the number of references to the buffer, and the user is responsible for properly incrementing/decrementing the counter as the buffer is used. In Arrow, each ArrowBuf has an associated `ReferenceManager`_ that tracks the reference count. You can retrieve -it with ArrowBuf.getReferenceManager(). The reference count is updated using `ReferenceManager.release`_ to decrement the count, -and `ReferenceManager.retain`_ to increment it. +it with ArrowBuf.getReferenceManager(). The reference count is updated using `ReferenceManager.release`_ to decrement the count, +and `ReferenceManager.retain`_ to increment it. Of course, this is tedious and error-prone, so instead of directly working with buffers, we typically use higher-level APIs like ValueVector. Such classes generally implement Closeable/AutoCloseable and will automatically @@ -289,7 +289,7 @@ Finally, enabling the ``TRACE`` logging level will automatically provide this st | at (#8:1) Sometimes, explicitly passing allocators around is difficult. For example, it -can be hard to pass around extra state, like an allocator, through layers of +can be hard to pass around extra state, like an allocator, through layers of existing application or framework code. A global or singleton allocator instance can be useful here, though it should not be your first choice. @@ -370,7 +370,7 @@ Arrow’s memory model is based on the following basic concepts: leaks. - The same physical memory can be shared by multiple allocators and the allocator must provide an accounting paradigm for this purpose. - + Reserving Memory ---------------- @@ -384,17 +384,17 @@ Arrow provides two different ways to reserve memory: - ``AllocationReservation`` via BufferAllocator.newReservation(): Allows a short-term preallocation strategy so that a particular subsystem can ensure future memory is available to support a - particular request. - + particular request. + Reference Counting Details -------------------------- -Typically, the ReferenceManager implementation used is an instance of `BufferLedger`_. -A BufferLedger is a ReferenceManager that also maintains the relationship between an ``AllocationManager``, +Typically, the ReferenceManager implementation used is an instance of `BufferLedger`_. +A BufferLedger is a ReferenceManager that also maintains the relationship between an ``AllocationManager``, a ``BufferAllocator`` and one or more individual ``ArrowBuf``\ s -All ArrowBufs (direct or sliced) related to a single BufferLedger/BufferAllocator combination -share the same reference count and either all will be valid or all will be invalid. +All ArrowBufs (direct or sliced) related to a single BufferLedger/BufferAllocator combination +share the same reference count and either all will be valid or all will be invalid. 
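The allocator and reference-counting behaviour described above can be illustrated with a minimal sketch (not part of the patch; the allocator name, limits, and buffer size below are arbitrary illustrative values). A child allocator is opened with its own limit, a buffer is allocated from it, its count is adjusted through the ``ReferenceManager``, and closing the child allocator verifies that nothing leaked:

.. code-block:: Java

    import org.apache.arrow.memory.ArrowBuf;
    import org.apache.arrow.memory.BufferAllocator;
    import org.apache.arrow.memory.RootAllocator;

    // Root allocator: program-wide accounting and memory limit (16 MiB here, arbitrary).
    try (BufferAllocator root = new RootAllocator(16 * 1024 * 1024);
         // Child allocator with its own smaller limit; closing it checks for leaks.
         BufferAllocator child = root.newChildAllocator("example-child", 0, 1024 * 1024)) {
      ArrowBuf buf = child.buffer(256);                              // reference count starts at 1
      buf.getReferenceManager().retain();                            // second reference, count = 2
      System.out.println(buf.getReferenceManager().getRefCount());   // prints 2
      buf.getReferenceManager().release();                           // drop one reference, count = 1
      buf.close();                                                    // drop the last reference; memory is returned
    } // closing the child allocator fails if any buffer it accounts for is still outstanding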
For simplicity of accounting, we treat that memory as being used by one of the BufferAllocators associated with the memory. When that allocator releases its claim on that memory, the memory ownership is then moved to @@ -411,7 +411,7 @@ There are several Allocator types in Arrow Java: - ``ChildAllocator`` - A child allocator that derives from the root allocator Many BufferAllocators can reference the same piece of physical memory at the same -time. It is the AllocationManager’s responsibility to ensure that in this situation, +time. It is the AllocationManager’s responsibility to ensure that in this situation, all memory is accurately accounted for from the Root’s perspective and also to ensure that the memory is correctly released once all BufferAllocators have stopped using that memory. diff --git a/docs/source/java/overview.rst b/docs/source/java/overview.rst index 9d9cbad8a26c1..7780ee32ec9bc 100644 --- a/docs/source/java/overview.rst +++ b/docs/source/java/overview.rst @@ -54,10 +54,10 @@ but some modules are JNI bindings to the C++ library. - (Experimental) A library for converting JDBC data to Arrow data. - Native * - flight-core - - (Experimental) An RPC mechanism for transferring ValueVectors. + - An RPC mechanism for transferring ValueVectors. - Native * - flight-sql - - (Experimental) Contains utility classes to expose Flight SQL semantics for clients and servers over Arrow Flight. + - Contains utility classes to expose Flight SQL semantics for clients and servers over Arrow Flight. - Native * - flight-integration-tests - Integration tests for Flight RPC. diff --git a/docs/source/java/quickstartguide.rst b/docs/source/java/quickstartguide.rst index e358681c57830..1f3ec861d3f46 100644 --- a/docs/source/java/quickstartguide.rst +++ b/docs/source/java/quickstartguide.rst @@ -195,10 +195,10 @@ Example: Create a dataset of names (strings) and ages (32-bit signed integers). .. code-block:: shell VectorSchemaRoot created: - age name - 10 Dave - 20 Peter - 30 Mary + age name + 10 Dave + 20 Peter + 30 Mary Interprocess Communication (IPC) @@ -306,11 +306,11 @@ Example: Read the dataset from the previous example from an Arrow IPC file (rand Record batches in file: 1 VectorSchemaRoot read: - age name - 10 Dave - 20 Peter - 30 Mary + age name + 10 Dave + 20 Peter + 30 Mary More examples available at `Arrow Java Cookbook`_. -.. _`Arrow Java Cookbook`: https://arrow.apache.org/cookbook/java \ No newline at end of file +.. _`Arrow Java Cookbook`: https://arrow.apache.org/cookbook/java diff --git a/docs/source/java/substrait.rst b/docs/source/java/substrait.rst index c5857dcc23f75..fa20dbd61dbfb 100644 --- a/docs/source/java/substrait.rst +++ b/docs/source/java/substrait.rst @@ -100,9 +100,9 @@ Here is an example of a Java program that queries a Parquet file using Java Subs .. code-block:: text // Results example: - FieldPath(0) FieldPath(1) FieldPath(2) FieldPath(3) - 0 ALGERIA 0 haggle. carefully final deposits detect slyly agai - 1 ARGENTINA 1 al foxes promise slyly according to the regular accounts. bold requests alon + FieldPath(0) FieldPath(1) FieldPath(2) FieldPath(3) + 0 ALGERIA 0 haggle. carefully final deposits detect slyly agai + 1 ARGENTINA 1 al foxes promise slyly according to the regular accounts. bold requests alon Executing Projections and Filters Using Extended Expressions ============================================================ @@ -189,13 +189,13 @@ This Java program: .. code-block:: text - column-1 column-2 - 13 ROMANIA - ular asymptotes are about the furious multipliers. 
express dependencies nag above the ironically ironic account - 14 SAUDI ARABIA - ts. silent requests haggle. closely express packages sleep across the blithely - 12 VIETNAM - hely enticingly express accounts. even, final - 13 RUSSIA - requests against the platelets use never according to the quickly regular pint - 13 UNITED KINGDOM - eans boost carefully special requests. accounts are. carefull - 11 UNITED STATES - y final packages. slow foxes cajole quickly. quickly silent platelets breach ironic accounts. unusual pinto be + column-1 column-2 + 13 ROMANIA - ular asymptotes are about the furious multipliers. express dependencies nag above the ironically ironic account + 14 SAUDI ARABIA - ts. silent requests haggle. closely express packages sleep across the blithely + 12 VIETNAM - hely enticingly express accounts. even, final + 13 RUSSIA - requests against the platelets use never according to the quickly regular pint + 13 UNITED KINGDOM - eans boost carefully special requests. accounts are. carefull + 11 UNITED STATES - y final packages. slow foxes cajole quickly. quickly silent platelets breach ironic accounts. unusual pinto be .. _`Substrait`: https://substrait.io/ .. _`Substrait Java`: https://github.com/substrait-io/substrait-java diff --git a/docs/source/java/table.rst b/docs/source/java/table.rst index 603910f51694f..5aa95e153cea0 100644 --- a/docs/source/java/table.rst +++ b/docs/source/java/table.rst @@ -75,7 +75,7 @@ Tables are created from a ``VectorSchemaRoot`` as shown below. The memory buffer Table t = new Table(someVectorSchemaRoot); -If you now update the vectors held by the ``VectorSchemaRoot`` (using some version of `ValueVector#setSafe()`), it would reflect those changes, but the values in table *t* are unchanged. +If you now update the vectors held by the ``VectorSchemaRoot`` (using some version of ``ValueVector#setSafe()``), it would reflect those changes, but the values in table *t* are unchanged. Creating a Table from FieldVectors ********************************** @@ -243,7 +243,7 @@ It is important to recognize that rows are NOT reified as objects, but rather op Getting a row ************* -Calling `immutableRow()` on any table instance returns a new ``Row`` instance. +Calling ``immutableRow()`` on any table instance returns a new ``Row`` instance. .. code-block:: Java @@ -262,7 +262,7 @@ Since rows are iterable, you can traverse a table using a standard while loop: // do something useful here } -``Table`` implements `Iterable` so you can access rows directly from a table in an enhanced *for* loop: +``Table`` implements ``Iterable`` so you can access rows directly from a table in an enhanced *for* loop: .. code-block:: Java @@ -272,7 +272,7 @@ Since rows are iterable, you can traverse a table using a standard while loop: ... } -Finally, while rows are usually iterated in the order of the underlying data vectors, but they are also positionable using the `Row#setPosition()` method, so you can skip to a specific row. Row numbers are 0-based. +Finally, while rows are usually iterated in the order of the underlying data vectors, but they are also positionable using the ``Row#setPosition()`` method, so you can skip to a specific row. Row numbers are 0-based. .. code-block:: Java @@ -281,7 +281,7 @@ Finally, while rows are usually iterated in the order of the underlying data vec Any changes to position are applied to all the columns in the table. -Note that you must call `next()`, or `setPosition()` before accessing values via a row. 
Failure to do so results in a runtime exception. +Note that you must call ``next()``, or ``setPosition()`` before accessing values via a row. Failure to do so results in a runtime exception. Read operations using rows ************************** @@ -304,7 +304,7 @@ You can also get value using a nullable ``ValueHolder``. For example: This can be used to retrieve values without creating a new Object for each. -In addition to getting values, you can check if a value is null using `isNull()`. This is important if the vector contains any nulls, as asking for a value from a vector can cause NullPointerExceptions in some cases. +In addition to getting values, you can check if a value is null using ``isNull()``. This is important if the vector contains any nulls, as asking for a value from a vector can cause NullPointerExceptions in some cases. .. code-block:: Java @@ -352,13 +352,13 @@ Working with the C-Data interface The ability to work with native code is required for many Arrow features. This section describes how tables can be be exported for use with native code -Exporting works by converting the data to a ``VectorSchemaRoot`` instance and using the existing facilities to transfer the data. You could do it yourself, but that isn't ideal because conversion to a vector schema root breaks the immutability guarantees. Using the `exportTable()` methods in the `Data`_ class avoids this concern. +Exporting works by converting the data to a ``VectorSchemaRoot`` instance and using the existing facilities to transfer the data. You could do it yourself, but that isn't ideal because conversion to a vector schema root breaks the immutability guarantees. Using the ``exportTable()`` methods in the `Data`_ class avoids this concern. .. code-block:: Java Data.exportTable(bufferAllocator, table, dictionaryProvider, outArrowArray); -If the table contains dictionary-encoded vectors and was constructed with a ``DictionaryProvider``, the provider argument to `exportTable()` can be omitted and the table's provider attribute will be used: +If the table contains dictionary-encoded vectors and was constructed with a ``DictionaryProvider``, the provider argument to ``exportTable()`` can be omitted and the table's provider attribute will be used: .. code-block:: Java diff --git a/docs/source/java/vector.rst b/docs/source/java/vector.rst index abbbd1a236d6d..1c3e123cf50fb 100644 --- a/docs/source/java/vector.rst +++ b/docs/source/java/vector.rst @@ -226,7 +226,7 @@ A :class:`ListVector` is a vector that holds a list of values for each index. Wo For example, the code below shows how to build a :class:`ListVector` of int's using the writer :class:`UnionListWriter`. We build a vector from 0 to 9 and each index contains a list with values [[0, 0, 0, 0, 0], [0, 1, 2, 3, 4], [0, 2, 4, 6, 8], …, [0, 9, 18, 27, 36]]. List values can be added in any order so writing a list such as [3, 1, 2] would be just as valid. .. code-block:: Java - + try (BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE); ListVector listVector = ListVector.empty("vector", allocator)) { UnionListWriter writer = listVector.getWriter(); @@ -240,7 +240,7 @@ For example, the code below shows how to build a :class:`ListVector` of int's us writer.endList(); } listVector.setValueCount(10); - } + } :class:`ListVector` values can be accessed either through the get API or through the reader class :class:`UnionListReader`. To read all the values, first enumerate through the indexes, and then enumerate through the inner list values. 
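As a companion to the writer example above, the following sketch (an illustrative fragment, not part of the patch; the row count and values are arbitrary and it assumes the same imports as the writer example) reads the lists back with :class:`UnionListReader`: the outer loop positions the reader on each index, and the inner loop iterates over that index's list values.

.. code-block:: Java

    try (BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE);
         ListVector listVector = ListVector.empty("vector", allocator)) {
      // Write three rows: [0], [0, 1], [0, 1, 2]
      UnionListWriter writer = listVector.getWriter();
      for (int row = 0; row < 3; row++) {
        writer.setPosition(row);
        writer.startList();
        for (int value = 0; value <= row; value++) {
          writer.writeInt(value);
        }
        writer.endList();
      }
      listVector.setValueCount(3);

      // Read the values back: enumerate the indexes, then the inner list values.
      UnionListReader reader = listVector.getReader();
      for (int row = 0; row < listVector.getValueCount(); row++) {
        reader.setPosition(row);
        while (reader.next()) {
          System.out.print(reader.reader().readInteger() + " ");
        }
        System.out.println();
      }
    }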
diff --git a/docs/source/python/api/arrays.rst b/docs/source/python/api/arrays.rst index e6f6c3dbbd3d1..aefed00b3d2e0 100644 --- a/docs/source/python/api/arrays.rst +++ b/docs/source/python/api/arrays.rst @@ -63,8 +63,8 @@ may expose data type-specific methods or properties. FixedSizeBinaryArray LargeBinaryArray LargeStringArray - BinaryViewArray, - StringViewArray, + BinaryViewArray + StringViewArray Time32Array Time64Array Date32Array diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst index 928c607d139ce..5423eebfbab40 100644 --- a/docs/source/python/api/compute.rst +++ b/docs/source/python/api/compute.rst @@ -52,10 +52,10 @@ Aggregations Cumulative Functions -------------------- -Cumulative functions are vector functions that perform a running accumulation on -their input using a given binary associative operation with an identity element -(a monoid) and output an array containing the corresponding intermediate running -values. The input is expected to be of numeric type. By default these functions +Cumulative functions are vector functions that perform a running accumulation on +their input using a given binary associative operation with an identity element +(a monoid) and output an array containing the corresponding intermediate running +values. The input is expected to be of numeric type. By default these functions do not detect overflow. They are also available in an overflow-checking variant, suffixed ``_checked``, which throws an ``ArrowInvalid`` exception when overflow is detected. @@ -173,7 +173,7 @@ variants which detect domain errors where appropriate. Comparisons ----------- -These functions expect two inputs of the same type. If one of the inputs is `null` +These functions expect two inputs of the same type. If one of the inputs is ``null`` they return ``null``. .. autosummary:: @@ -540,7 +540,6 @@ Compute Options AssumeTimezoneOptions CastOptions CountOptions - CountOptions CumulativeSumOptions DayOfWeekOptions DictionaryEncodeOptions @@ -566,7 +565,6 @@ Compute Options RoundToMultipleOptions RunEndEncodeOptions ScalarAggregateOptions - ScalarAggregateOptions SelectKOptions SetLookupOptions SliceOptions @@ -578,7 +576,6 @@ Compute Options StructFieldOptions TakeOptions TDigestOptions - TDigestOptions TrimOptions VarianceOptions WeekOptions diff --git a/docs/source/python/api/substrait.rst b/docs/source/python/api/substrait.rst index 66e88fcd279ae..1556be9dbd011 100644 --- a/docs/source/python/api/substrait.rst +++ b/docs/source/python/api/substrait.rst @@ -50,4 +50,4 @@ Utility .. autosummary:: :toctree: ../generated/ - get_supported_functions \ No newline at end of file + get_supported_functions diff --git a/docs/source/python/compute.rst b/docs/source/python/compute.rst index c02059a4f8faa..ce3dfabb0e689 100644 --- a/docs/source/python/compute.rst +++ b/docs/source/python/compute.rst @@ -23,7 +23,7 @@ Compute Functions ================= Arrow supports logical compute operations over inputs of possibly -varying types. +varying types. The standard compute operations are provided by the :mod:`pyarrow.compute` module and can be used directly:: @@ -91,7 +91,7 @@ Grouped Aggregations ==================== PyArrow supports grouped aggregations over :class:`pyarrow.Table` through the -:meth:`pyarrow.Table.group_by` method. +:meth:`pyarrow.Table.group_by` method. 
The method will return a grouping declaration to which the hash aggregation functions can be applied:: @@ -300,7 +300,7 @@ Filtering by Expressions :class:`.Table` and :class:`.Dataset` can both be filtered using a boolean :class:`.Expression`. -The expression can be built starting from a +The expression can be built starting from a :func:`pyarrow.compute.field`. Comparisons and transformations can then be applied to one or more fields to build the filter expression you care about. @@ -325,7 +325,7 @@ in column ``"nums"`` by the ``bit_wise_and`` operation equals ``0``. Only the numbers where the last bit was ``0`` will return a ``0`` as the result of ``num & 1`` and as all numbers where the last bit is ``0`` are multiples of ``2`` we will be filtering for the even numbers only. - + Once we have our filter, we can provide it to the :meth:`.Table.filter` method to filter our table only for the matching rows: @@ -392,7 +392,7 @@ User-Defined Functions PyArrow allows defining and registering custom compute functions. These functions can then be called from Python as well as C++ (and potentially any other implementation wrapping Arrow C++, such as the R ``arrow`` package) -using their registered function name. +using their registered function name. UDF support is limited to scalar functions. A scalar function is a function which executes elementwise operations on arrays or scalars. In general, the output of a @@ -441,7 +441,7 @@ output type need to be defined. Using :func:`pyarrow.compute.register_scalar_fun function_docs, input_types, output_type) - + The implementation of a user-defined function always takes a first *context* parameter (named ``ctx`` in the example above) which is an instance of @@ -497,9 +497,9 @@ the GCD of one column with the scalar value 30. We will be re-using the category: [["A","B","C","D"]] Note that ``ds.field('')._call(...)`` returns a :func:`pyarrow.compute.Expression`. -The arguments passed to this function call are expressions, not scalar values +The arguments passed to this function call are expressions, not scalar values (notice the difference between :func:`pyarrow.scalar` and :func:`pyarrow.compute.scalar`, -the latter produces an expression). +the latter produces an expression). This expression is evaluated when the projection operator executes it. Projection Expressions diff --git a/docs/source/python/data.rst b/docs/source/python/data.rst index 9156157fcd0c2..f17475138c9a4 100644 --- a/docs/source/python/data.rst +++ b/docs/source/python/data.rst @@ -76,7 +76,7 @@ We use the name **logical type** because the **physical** storage may be the same for one or more types. For example, ``int64``, ``float64``, and ``timestamp[ms]`` all occupy 64 bits per value. -These objects are `metadata`; they are used for describing the data in arrays, +These objects are ``metadata``; they are used for describing the data in arrays, schemas, and record batches. In Python, they can be used in functions where the input data (e.g. Python objects) may be coerced to more than one Arrow type. @@ -99,7 +99,7 @@ types' children. For example, we can define a list of int32 values with: t6 = pa.list_(t1) t6 -A `struct` is a collection of named fields: +A ``struct`` is a collection of named fields: .. 
ipython:: python diff --git a/docs/source/python/dataset.rst b/docs/source/python/dataset.rst index daab36f9a7be9..00469fd57becf 100644 --- a/docs/source/python/dataset.rst +++ b/docs/source/python/dataset.rst @@ -575,28 +575,28 @@ Partitioning performance considerations Partitioning datasets has two aspects that affect performance: it increases the number of files and it creates a directory structure around the files. Both of these have benefits -as well as costs. Depending on the configuration and the size of your dataset, the costs -can outweigh the benefits. +as well as costs. Depending on the configuration and the size of your dataset, the costs +can outweigh the benefits. -Because partitions split up the dataset into multiple files, partitioned datasets can be -read and written with parallelism. However, each additional file adds a little overhead in -processing for filesystem interaction. It also increases the overall dataset size since +Because partitions split up the dataset into multiple files, partitioned datasets can be +read and written with parallelism. However, each additional file adds a little overhead in +processing for filesystem interaction. It also increases the overall dataset size since each file has some shared metadata. For example, each parquet file contains the schema and -group-level statistics. The number of partitions is a floor for the number of files. If -you partition a dataset by date with a year of data, you will have at least 365 files. If -you further partition by another dimension with 1,000 unique values, you will have up to +group-level statistics. The number of partitions is a floor for the number of files. If +you partition a dataset by date with a year of data, you will have at least 365 files. If +you further partition by another dimension with 1,000 unique values, you will have up to 365,000 files. This fine of partitioning often leads to small files that mostly consist of metadata. -Partitioned datasets create nested folder structures, and those allow us to prune which +Partitioned datasets create nested folder structures, and those allow us to prune which files are loaded in a scan. However, this adds overhead to discovering files in the dataset, as we'll need to recursively "list directory" to find the data files. Too fine partitions can cause problems here: Partitioning a dataset by date for a years worth -of data will require 365 list calls to find all the files; adding another column with +of data will require 365 list calls to find all the files; adding another column with cardinality 1,000 will make that 365,365 calls. The most optimal partitioning layout will depend on your data, access patterns, and which -systems will be reading the data. Most systems, including Arrow, should work across a +systems will be reading the data. Most systems, including Arrow, should work across a range of file sizes and partitioning layouts, but there are extremes you should avoid. These guidelines can help avoid some known worst cases: @@ -611,35 +611,35 @@ of file size. Arrow's file writer provides sensible defaults for group sizing in Configuring files open during a write ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -When writing data to the disk, there are a few parameters that can be +When writing data to the disk, there are a few parameters that can be important to optimize the writes, such as the number of rows per file and the maximum number of open files allowed during the write. 
Set the maximum number of files opened with the ``max_open_files`` parameter of :meth:`write_dataset`. -If ``max_open_files`` is set greater than 0 then this will limit the maximum +If ``max_open_files`` is set greater than 0 then this will limit the maximum number of files that can be left open. This only applies to writing partitioned datasets, where rows are dispatched to the appropriate file depending on their partition values. If an attempt is made to open too many files then the least recently used file will be closed. If this setting is set too low you may end up fragmenting your data into many small files. -If your process is concurrently using other file handlers, either with a -dataset scanner or otherwise, you may hit a system file handler limit. For +If your process is concurrently using other file handlers, either with a +dataset scanner or otherwise, you may hit a system file handler limit. For example, if you are scanning a dataset with 300 files and writing out to 900 files, the total of 1200 files may be over a system limit. (On Linux, this might be a "Too Many Open Files" error.) You can either reduce this ``max_open_files`` setting or increase the file handler limit on your system. The default value is 900 which allows some number of files -to be open by the scanner before hitting the default Linux limit of 1024. +to be open by the scanner before hitting the default Linux limit of 1024. -Another important configuration used in :meth:`write_dataset` is ``max_rows_per_file``. +Another important configuration used in :meth:`write_dataset` is ``max_rows_per_file``. Set the maximum number of rows written in each file with the ``max_rows_per_files`` parameter of :meth:`write_dataset`. -If ``max_rows_per_file`` is set greater than 0 then this will limit how many +If ``max_rows_per_file`` is set greater than 0 then this will limit how many rows are placed in any single file. Otherwise there will be no limit and one file will be created in each output directory unless files need to be closed to respect ``max_open_files``. This setting is the primary way to control file size. @@ -653,22 +653,22 @@ Configuring rows per group during a write The volume of data written to the disk per each group can be configured. This configuration includes a lower and an upper bound. -The minimum number of rows required to form a row group is +The minimum number of rows required to form a row group is defined with the ``min_rows_per_group`` parameter of :meth:`write_dataset`. .. note:: - If ``min_rows_per_group`` is set greater than 0 then this will cause the - dataset writer to batch incoming data and only write the row groups to the - disk when sufficient rows have accumulated. The final row group size may be - less than this value if other options such as ``max_open_files`` or + If ``min_rows_per_group`` is set greater than 0 then this will cause the + dataset writer to batch incoming data and only write the row groups to the + disk when sufficient rows have accumulated. The final row group size may be + less than this value if other options such as ``max_open_files`` or ``max_rows_per_file`` force smaller row group sizes. The maximum number of rows allowed per group is defined with the ``max_rows_per_group`` parameter of :meth:`write_dataset`. -If ``max_rows_per_group`` is set greater than 0 then the dataset writer may split -up large incoming batches into multiple row groups. 
If this value is set then -``min_rows_per_group`` should also be set or else you may end up with very small +If ``max_rows_per_group`` is set greater than 0 then the dataset writer may split +up large incoming batches into multiple row groups. If this value is set then +``min_rows_per_group`` should also be set or else you may end up with very small row groups (e.g. if the incoming row group size is just barely larger than this value). Row groups are built into the Parquet and IPC/Feather formats but don't affect JSON or CSV. @@ -719,7 +719,7 @@ Customizing & inspecting written files By default the dataset API will create files named "part-i.format" where "i" is a integer generated during the write and "format" is the file format specified in the write_dataset call. For simple datasets it may be possible to know which files will be created but for -larger or partitioned datasets it is not so easy. The ``file_visitor`` keyword can be used +larger or partitioned datasets it is not so easy. The ``file_visitor`` keyword can be used to supply a visitor that will be called as each file is created: .. ipython:: python diff --git a/docs/source/python/dlpack.rst b/docs/source/python/dlpack.rst index f612ebabde5c9..024c2800e1107 100644 --- a/docs/source/python/dlpack.rst +++ b/docs/source/python/dlpack.rst @@ -90,4 +90,4 @@ Convert a PyArrow CPU array to PyTorch tensor: >>> import torch >>> torch.from_dlpack(array) - tensor([2, 0, 2, 4]) + tensor([2, 0, 2, 4]) diff --git a/docs/source/python/extending_types.rst b/docs/source/python/extending_types.rst index 8df0ef0b1fe99..83fce84f47c08 100644 --- a/docs/source/python/extending_types.rst +++ b/docs/source/python/extending_types.rst @@ -101,7 +101,7 @@ define the ``__arrow_array__`` method to return an Arrow array:: import pyarrow return pyarrow.array(..., type=type) -The ``__arrow_array__`` method takes an optional `type` keyword which is passed +The ``__arrow_array__`` method takes an optional ``type`` keyword which is passed through from :func:`pyarrow.array`. The method is allowed to return either a :class:`~pyarrow.Array` or a :class:`~pyarrow.ChunkedArray`. diff --git a/docs/source/python/filesystems.rst b/docs/source/python/filesystems.rst index 5309250351d8e..23d10aaaad720 100644 --- a/docs/source/python/filesystems.rst +++ b/docs/source/python/filesystems.rst @@ -182,7 +182,7 @@ Example how you can read contents from a S3 bucket:: Note that it is important to configure :class:`S3FileSystem` with the correct -region for the bucket being used. If `region` is not set, the AWS SDK will +region for the bucket being used. If ``region`` is not set, the AWS SDK will choose a value, defaulting to 'us-east-1' if the SDK version is <1.8. Otherwise it will try to use a variety of heuristics (environment variables, configuration profile, EC2 metadata server) to resolve the region. @@ -233,7 +233,7 @@ generate a credentials file in the default location:: To connect to a public bucket without using any credentials, you must pass ``anonymous=True`` to :class:`GcsFileSystem`. Otherwise, the filesystem -will report ``Couldn't resolve host name`` since there are different host +will report ``Couldn't resolve host name`` since there are different host names for authenticated and public access. Example showing how you can read contents from a GCS bucket:: @@ -277,7 +277,7 @@ load time, since the library may not be in your LD_LIBRARY_PATH), and relies on some environment variables. * ``HADOOP_HOME``: the root of your installed Hadoop distribution. 
Often has - `lib/native/libhdfs.so`. + ``lib/native/libhdfs.so``. * ``JAVA_HOME``: the location of your Java SDK installation. @@ -314,7 +314,7 @@ For example:: # using this to read a partitioned dataset import pyarrow.dataset as ds ds.dataset("data/", filesystem=fs) - + Similarly for Azure Blob Storage:: import adlfs diff --git a/docs/source/python/getstarted.rst b/docs/source/python/getstarted.rst index d38fcadab288f..42e415c40b835 100644 --- a/docs/source/python/getstarted.rst +++ b/docs/source/python/getstarted.rst @@ -37,7 +37,7 @@ in tabular data. Arrow also provides support for various formats to get those tabular data in and out of disk and networks. Most commonly used formats are -Parquet (:ref:`parquet`) and the IPC format (:ref:`ipc`). +Parquet (:ref:`parquet`) and the IPC format (:ref:`ipc`). Creating Arrays and Tables -------------------------- @@ -63,7 +63,7 @@ in tabular data when attached to a column name birthdays_table = pa.table([days, months, years], names=["days", "months", "years"]) - + birthdays_table See :ref:`data` for more details. @@ -75,7 +75,7 @@ Once you have tabular data, Arrow provides out of the box the features to save and restore that data for common formats like Parquet: -.. ipython:: python +.. ipython:: python import pyarrow.parquet as pq @@ -92,14 +92,14 @@ data will be as quick as possible reloaded_birthdays Saving and loading back data in arrow is usually done through -:ref:`Parquet `, :ref:`IPC format ` (:ref:`feather`), +:ref:`Parquet `, :ref:`IPC format ` (:ref:`feather`), :ref:`CSV ` or :ref:`Line-Delimited JSON ` formats. Performing Computations ----------------------- Arrow ships with a bunch of compute functions that can be applied -to its arrays and tables, so through the compute functions +to its arrays and tables, so through the compute functions it's possible to apply transformations to the data .. ipython:: python @@ -122,7 +122,7 @@ smaller chunks import pyarrow.dataset as ds - ds.write_dataset(birthdays_table, "savedir", format="parquet", + ds.write_dataset(birthdays_table, "savedir", format="parquet", partitioning=ds.partitioning( pa.schema([birthdays_table.schema.field("years")]) )) @@ -151,8 +151,8 @@ how to project them, etc., refer to :ref:`dataset` documentation. Continuing from here -------------------- -For digging further into Arrow, you might want to read the -:doc:`PyArrow Documentation <./index>` itself or the +For digging further into Arrow, you might want to read the +:doc:`PyArrow Documentation <./index>` itself or the `Arrow Python Cookbook `_ diff --git a/docs/source/python/getting_involved.rst b/docs/source/python/getting_involved.rst index 7b3bcf2ac527a..9fda3c7c78488 100644 --- a/docs/source/python/getting_involved.rst +++ b/docs/source/python/getting_involved.rst @@ -54,7 +54,7 @@ used as foundations to build easier to use entities. exposed to the user are declared. In some cases, those files might directly import the entities from inner implementation if they want to expose it as is without modification. -* The ``lib.pyx`` file is where the majority of the core C++ libarrow +* The ``lib.pyx`` file is where the majority of the core C++ libarrow capabilities are exposed to Python. Most of the implementation of this module relies on included ``*.pxi`` files where the specific pieces are built. While being exposed to Python as ``pyarrow.lib`` its content @@ -73,4 +73,4 @@ used as foundations to build easier to use entities. 
PyArrow is also based on PyArrow C++, dedicated pieces of code that live in ``python/pyarrow/src/arrow/python`` directory and provide the low level code for capabilities like converting to and from numpy or pandas and the classes - that allow to use Python objects and callbacks in C++. \ No newline at end of file + that allow to use Python objects and callbacks in C++. diff --git a/docs/source/python/install.rst b/docs/source/python/install.rst index 4b966e6d2653d..12555c93067f9 100644 --- a/docs/source/python/install.rst +++ b/docs/source/python/install.rst @@ -83,7 +83,7 @@ While Arrow uses the OS-provided timezone database on Linux and macOS, it requir user-provided database on Windows. To download and extract the text version of the IANA timezone database follow the instructions in the C++ :ref:`download-timezone-database` or use pyarrow utility function -`pyarrow.util.download_tzdata_on_windows()` that does the same. +``pyarrow.util.download_tzdata_on_windows()`` that does the same. By default, the timezone database will be detected at ``%USERPROFILE%\Downloads\tzdata``. If the database has been downloaded in a different location, you will need to set diff --git a/docs/source/python/integration/extending.rst b/docs/source/python/integration/extending.rst index b380fea7e902c..d4d099bcf43c8 100644 --- a/docs/source/python/integration/extending.rst +++ b/docs/source/python/integration/extending.rst @@ -474,7 +474,7 @@ Toolchain Compatibility (Linux) The Python wheels for Linux are built using the `PyPA manylinux images `_ which use -the CentOS `devtoolset-9`. In addition to the other notes +the CentOS ``devtoolset-9``. In addition to the other notes above, if you are compiling C++ using these shared libraries, you will need to make sure you use a compatible toolchain as well or you might see a segfault during runtime. diff --git a/docs/source/python/integration/python_r.rst b/docs/source/python/integration/python_r.rst index 20627c3782d3c..ec5dfc366fdf9 100644 --- a/docs/source/python/integration/python_r.rst +++ b/docs/source/python/integration/python_r.rst @@ -29,7 +29,7 @@ marshaling and unmarshaling data. The article takes for granted that you have a ``Python`` environment with ``pyarrow`` correctly installed and an ``R`` environment with - ``arrow`` library correctly installed. + ``arrow`` library correctly installed. See `Python Install Instructions `_ and `R Install instructions `_ for further details. @@ -52,7 +52,7 @@ We could save such a function in a ``addthree.R`` file so that we can make it available for reuse. Once the ``addthree.R`` file is created we can invoke any of its functions -from Python using the +from Python using the `rpy2 `_ library which enables a R runtime within the Python interpreter. @@ -91,12 +91,12 @@ to access the ``R`` function and print the expected result: .. code-block:: bash - $ python addthree.py + $ python addthree.py 6 If instead of passing around basic data types we want to pass around Arrow Arrays, we can do so relying on the -`rpy2-arrow `_ +`rpy2-arrow `_ module which implements ``rpy2`` support for Arrow types. ``rpy2-arrow`` can be installed through ``pip``: @@ -189,7 +189,7 @@ Invoking the ``addthree.R`` script will print the outcome of adding .. code-block:: bash - $ R --silent -f addthree.R + $ R --silent -f addthree.R Array [ @@ -219,7 +219,7 @@ necessary to import an Arrow Array in R from the C Data interface. That work will be done by the ``addthree_cdata`` function which invokes the ``addthree`` function once the Array is imported. 
-Our ``addthree.R`` will thus have both the ``addthree_cdata`` and the +Our ``addthree.R`` will thus have both the ``addthree_cdata`` and the ``addthree`` functions: .. code-block:: R @@ -261,7 +261,7 @@ Our ``addthree.py`` will thus become: # Import the pyarrow module that provides access to the C Data interface from pyarrow.cffi import ffi as arrow_c - # Allocate structures where we will export the Array data + # Allocate structures where we will export the Array data # and the Array schema. They will be released when we exit the with block. with arrow_c.new("struct ArrowArray*") as c_array, \ arrow_c.new("struct ArrowSchema*") as c_schema: @@ -274,7 +274,7 @@ Our ``addthree.py`` will thus become: array.type._export_to_c(c_schema_ptr) # Invoke the R addthree_cdata function passing the references - # to the array and schema C Data structures. + # to the array and schema C Data structures. # Those references are passed as strings as R doesn't have # native support for 64bit integers, so the integers are # converted to their string representation for R to convert it back. @@ -289,19 +289,19 @@ Our ``addthree.py`` will thus become: # Once the returned array is exported to a C Data infrastructure # we can import it back into pyarrow using Array._import_from_c py_array = pyarrow.Array._import_from_c(c_array_ptr, c_schema_ptr) - + print("RESULT", py_array) Running the newly changed ``addthree.py`` will now print the Array resulting -from adding ``3`` to all the elements of the original +from adding ``3`` to all the elements of the original ``pyarrow.array((1, 2, 3))`` array: .. code-block:: bash - $ python addthree.py + $ python addthree.py R[write to console]: Attaching package: ‘arrow’ RESULT [ 4, 5, 6 - ] \ No newline at end of file + ] diff --git a/docs/source/python/ipc.rst b/docs/source/python/ipc.rst index 27cd14a68853d..f55e8f8bc5dc3 100644 --- a/docs/source/python/ipc.rst +++ b/docs/source/python/ipc.rst @@ -76,12 +76,12 @@ this one can be created with :func:`~pyarrow.ipc.new_stream`: .. ipython:: python sink = pa.BufferOutputStream() - + with pa.ipc.new_stream(sink, batch.schema) as writer: for i in range(5): writer.write_batch(batch) -Here we used an in-memory Arrow buffer stream (``sink``), +Here we used an in-memory Arrow buffer stream (``sink``), but this could have been a socket or some other IO sink. When creating the ``StreamWriter``, we pass the schema, since the schema @@ -102,7 +102,7 @@ convenience function ``pyarrow.ipc.open_stream``: with pa.ipc.open_stream(buf) as reader: schema = reader.schema batches = [b for b in reader] - + schema len(batches) @@ -126,7 +126,7 @@ The :class:`~pyarrow.RecordBatchFileWriter` has the same API as .. ipython:: python sink = pa.BufferOutputStream() - + with pa.ipc.new_file(sink, batch.schema) as writer: for i in range(10): writer.write_batch(batch) @@ -164,7 +164,7 @@ DataFrame output: with pa.ipc.open_file(buf) as reader: df = reader.read_pandas() - + df[:5] Efficiently Writing and Reading Arrow Data diff --git a/docs/source/python/json.rst b/docs/source/python/json.rst index 99ecbc19a1230..eff6135d895a7 100644 --- a/docs/source/python/json.rst +++ b/docs/source/python/json.rst @@ -21,7 +21,7 @@ Reading JSON files ================== -Arrow supports reading columnar data from line-delimited JSON files. +Arrow supports reading columnar data from line-delimited JSON files. In this context, a JSON file consists of multiple JSON objects, one per line, representing individual data rows. 
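As a minimal sketch (editorial illustration, not part of the patched documentation): such a line-delimited file can be loaded with ``pyarrow.json.read_json``, which infers the column names and types from the JSON objects and returns a ``pyarrow.Table``. The file name ``rows.jsonl`` below is only a placeholder.

.. code-block:: python

   from pyarrow import json

   # Each line of rows.jsonl is one JSON object, i.e. one row of the table;
   # the schema (column names and types) is inferred from the objects' keys and values.
   table = json.read_json("rows.jsonl")
   print(table.schema)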
For example, this file represents two rows of data with four columns "a", "b", "c", "d": diff --git a/docs/source/python/memory.rst b/docs/source/python/memory.rst index 23474b923718d..7b49d48ab20fa 100644 --- a/docs/source/python/memory.rst +++ b/docs/source/python/memory.rst @@ -46,7 +46,7 @@ parent-child relationships. There are many implementations of ``arrow::Buffer``, but they all provide a standard interface: a data pointer and length. This is similar to Python's -built-in `buffer protocol` and ``memoryview`` objects. +built-in ``buffer protocol`` and ``memoryview`` objects. A :class:`Buffer` can be created from any Python object implementing the buffer protocol by calling the :func:`py_buffer` function. Let's consider diff --git a/docs/source/python/orc.rst b/docs/source/python/orc.rst index bfa68fc34d895..76c293d742010 100644 --- a/docs/source/python/orc.rst +++ b/docs/source/python/orc.rst @@ -112,7 +112,7 @@ control various settings when writing an ORC file. * ``file_version``, the ORC format version to use. ``'0.11'`` ensures compatibility with older readers, while ``'0.12'`` is the newer one. -* ``stripe_size``, to control the approximate size of data within a column +* ``stripe_size``, to control the approximate size of data within a column stripe. This currently defaults to 64MB. See the :func:`~pyarrow.orc.write_table()` docstring for more details. diff --git a/docs/source/python/parquet.rst b/docs/source/python/parquet.rst index d4717897660b6..029ed4f1a3e15 100644 --- a/docs/source/python/parquet.rst +++ b/docs/source/python/parquet.rst @@ -32,7 +32,7 @@ performance data IO. Apache Arrow is an ideal in-memory transport layer for data that is being read or written with Parquet files. We have been concurrently developing the `C++ -implementation of +implementation of Apache Parquet `_, which includes a native, multithreaded C++ adapter to and from in-memory Arrow data. PyArrow includes Python bindings to this code, which thus enables reading diff --git a/docs/source/python/timestamps.rst b/docs/source/python/timestamps.rst index 64a2a354dddef..80a1b7280cbfa 100644 --- a/docs/source/python/timestamps.rst +++ b/docs/source/python/timestamps.rst @@ -24,7 +24,7 @@ Arrow/Pandas Timestamps Arrow timestamps are stored as a 64-bit integer with column metadata to associate a time unit (e.g. milliseconds, microseconds, or nanoseconds), and an -optional time zone. Pandas (`Timestamp`) uses a 64-bit integer representing +optional time zone. Pandas (``Timestamp``) uses a 64-bit integer representing nanoseconds and an optional time zone. Python/Pandas timestamp types without a associated time zone are referred to as "Time Zone Naive". Python/Pandas timestamp types with an associated time zone are @@ -51,8 +51,8 @@ This implies a few things when round-tripping timestamps: #. Timezone information is lost (all timestamps that result from converting from spark to arrow/pandas are "time zone naive"). #. Timestamps are truncated to microseconds. -#. The session time zone might have unintuitive impacts on - translation of timestamp values. +#. The session time zone might have unintuitive impacts on + translation of timestamp values. Spark to Pandas (through Apache Arrow) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -62,8 +62,8 @@ The following cases assume the Spark configuration :: - >>> pdf = pd.DataFrame({'naive': [datetime(2019, 1, 1, 0)], - ... 'aware': [Timestamp(year=2019, month=1, day=1, + >>> pdf = pd.DataFrame({'naive': [datetime(2019, 1, 1, 0)], + ... 
'aware': [Timestamp(year=2019, month=1, day=1, ... nanosecond=500, tz=timezone(timedelta(hours=-8)))]}) >>> pdf naive aware @@ -77,7 +77,7 @@ The following cases assume the Spark configuration +-------------------+-------------------+ |2019-01-01 00:00:00|2019-01-01 08:00:00| +-------------------+-------------------+ - + Note that conversion of the aware timestamp is shifted to reflect the time assuming UTC (it represents the same instant in time). For naive timestamps, Spark treats them as being in the system local @@ -129,7 +129,7 @@ session time zone is still PST: |2019-01-01 00:00:00|2019-01-01 00:00:00| +-------------------+-------------------+ - + >>> pst_df.toPandas() naive aware 0 2019-01-01 2019-01-01 @@ -141,7 +141,7 @@ session time zone is still PST: aware 1 non-null datetime64[ns] dtypes: datetime64[ns](2) memory usage: 96.0 bytes - + Notice that, in addition to being a "time zone naive" timestamp, the 'aware' value will now differ when converting to an epoch offset. Spark does the conversion by first converting to the session time zone (or system local time zone if @@ -158,9 +158,9 @@ time: >>> (pst_df.toPandas()['aware'][0].timestamp()-pdf['aware'][0].timestamp())/3600 -8.0 -The same type of conversion happens with the data frame converted while -the session time zone was UTC. In this case both naive and aware -represent different instants in time (the naive instant is due to +The same type of conversion happens with the data frame converted while +the session time zone was UTC. In this case both naive and aware +represent different instants in time (the naive instant is due to the change in session time zone between creating data frames): :: @@ -179,9 +179,9 @@ the change in session time zone between creating data frames): Note that the surprising shift for aware doesn't happen when the session time zone is UTC (but the timestamps still become "time zone naive"): - + :: - + >>> spark.conf.set("spark.sql.session.timeZone", "UTC") >>> pst_df.show() +-------------------+-------------------+ @@ -189,7 +189,7 @@ still become "time zone naive"): +-------------------+-------------------+ |2019-01-01 08:00:00|2019-01-01 08:00:00| +-------------------+-------------------+ - + >>> pst_df.toPandas()['aware'][0] Timestamp('2019-01-01 08:00:00') >>> pdf['aware'][0] diff --git a/format/FlightSql.proto b/format/FlightSql.proto index bf3fcb6c3d229..6fca141d692a7 100644 --- a/format/FlightSql.proto +++ b/format/FlightSql.proto @@ -43,7 +43,6 @@ package arrow.flight.protocol.sql; * where there is one row per requested piece of metadata information. */ message CommandGetSqlInfo { - option (experimental) = true; /* * Values are modelled after ODBC's SQLGetInfo() function. This information is intended to provide @@ -1131,7 +1130,6 @@ enum Searchable { * The returned data should be ordered by data_type and then by type_name. */ message CommandGetXdbcTypeInfo { - option (experimental) = true; /* * Specifies the data type to search for the info. @@ -1153,7 +1151,6 @@ message CommandGetXdbcTypeInfo { * The returned data should be ordered by catalog_name. */ message CommandGetCatalogs { - option (experimental) = true; } /* @@ -1171,7 +1168,6 @@ message CommandGetCatalogs { * The returned data should be ordered by catalog_name, then db_schema_name. */ message CommandGetDbSchemas { - option (experimental) = true; /* * Specifies the Catalog to search for the tables. 
@@ -1219,7 +1215,6 @@ message CommandGetDbSchemas { * The returned data should be ordered by catalog_name, db_schema_name, table_name, then table_type, followed by table_schema if requested. */ message CommandGetTables { - option (experimental) = true; /* * Specifies the Catalog to search for the tables. @@ -1272,7 +1267,6 @@ message CommandGetTables { * The returned data should be ordered by table_type. */ message CommandGetTableTypes { - option (experimental) = true; } /* @@ -1293,7 +1287,6 @@ message CommandGetTableTypes { * The returned data should be ordered by catalog_name, db_schema_name, table_name, key_name, then key_sequence. */ message CommandGetPrimaryKeys { - option (experimental) = true; /* * Specifies the catalog to search for the table. @@ -1348,7 +1341,6 @@ enum UpdateDeleteRules { * update_rule and delete_rule returns a byte that is equivalent to actions declared on UpdateDeleteRules enum. */ message CommandGetExportedKeys { - option (experimental) = true; /* * Specifies the catalog to search for the foreign key table. @@ -1399,7 +1391,6 @@ message CommandGetExportedKeys { * - 4 = SET DEFAULT */ message CommandGetImportedKeys { - option (experimental) = true; /* * Specifies the catalog to search for the primary key table. @@ -1452,7 +1443,6 @@ message CommandGetImportedKeys { * - 4 = SET DEFAULT */ message CommandGetCrossReference { - option (experimental) = true; /** * The catalog name where the parent table is. @@ -1499,7 +1489,6 @@ message CommandGetCrossReference { * Request message for the "CreatePreparedStatement" action on a Flight SQL enabled backend. */ message ActionCreatePreparedStatementRequest { - option (experimental) = true; // The valid SQL string to create a prepared statement for. string query = 1; @@ -1512,7 +1501,6 @@ message ActionCreatePreparedStatementRequest { * An embedded message describing a Substrait plan to execute. */ message SubstraitPlan { - option (experimental) = true; // The serialized substrait.Plan to create a prepared statement for. // XXX(ARROW-16902): this is bytes instead of an embedded message @@ -1529,7 +1517,6 @@ message SubstraitPlan { * Request message for the "CreatePreparedSubstraitPlan" action on a Flight SQL enabled backend. */ message ActionCreatePreparedSubstraitPlanRequest { - option (experimental) = true; // The serialized substrait.Plan to create a prepared statement for. SubstraitPlan plan = 1; @@ -1548,7 +1535,6 @@ message ActionCreatePreparedSubstraitPlanRequest { * The result should be wrapped in a google.protobuf.Any message. */ message ActionCreatePreparedStatementResult { - option (experimental) = true; // Opaque handle for the prepared statement on the server. bytes prepared_statement_handle = 1; @@ -1570,7 +1556,6 @@ message ActionCreatePreparedStatementResult { * Closes server resources associated with the prepared statement handle. */ message ActionClosePreparedStatementRequest { - option (experimental) = true; // Opaque handle for the prepared statement on the server. bytes prepared_statement_handle = 1; @@ -1581,7 +1566,6 @@ message ActionClosePreparedStatementRequest { * Begins a transaction. */ message ActionBeginTransactionRequest { - option (experimental) = true; } /* @@ -1592,7 +1576,6 @@ message ActionBeginTransactionRequest { * FLIGHT_SQL_TRANSACTION_SUPPORT_SAVEPOINT. */ message ActionBeginSavepointRequest { - option (experimental) = true; // The transaction to which a savepoint belongs. 
bytes transaction_id = 1; @@ -1610,7 +1593,6 @@ message ActionBeginSavepointRequest { * The result should be wrapped in a google.protobuf.Any message. */ message ActionBeginTransactionResult { - option (experimental) = true; // Opaque handle for the transaction on the server. bytes transaction_id = 1; @@ -1626,7 +1608,6 @@ message ActionBeginTransactionResult { * The result should be wrapped in a google.protobuf.Any message. */ message ActionBeginSavepointResult { - option (experimental) = true; // Opaque handle for the savepoint on the server. bytes savepoint_id = 1; @@ -1641,7 +1622,6 @@ message ActionBeginSavepointResult { * invalidated, as are all associated savepoints. */ message ActionEndTransactionRequest { - option (experimental) = true; enum EndTransaction { END_TRANSACTION_UNSPECIFIED = 0; @@ -1667,7 +1647,6 @@ message ActionEndTransactionRequest { * savepoints created after the current savepoint. */ message ActionEndSavepointRequest { - option (experimental) = true; enum EndSavepoint { END_SAVEPOINT_UNSPECIFIED = 0; @@ -1702,7 +1681,6 @@ message ActionEndSavepointRequest { * - GetFlightInfo: execute the query. */ message CommandStatementQuery { - option (experimental) = true; // The SQL syntax. string query = 1; @@ -1729,7 +1707,6 @@ message CommandStatementQuery { * - DoPut: execute the query. */ message CommandStatementSubstraitPlan { - option (experimental) = true; // A serialized substrait.Plan SubstraitPlan plan = 1; @@ -1742,7 +1719,6 @@ message CommandStatementSubstraitPlan { * This should be used only once and treated as an opaque value, that is, clients should not attempt to parse this. */ message TicketStatementQuery { - option (experimental) = true; // Unique identifier for the instance of the statement to execute. bytes statement_handle = 1; @@ -1770,7 +1746,6 @@ message TicketStatementQuery { * - GetFlightInfo: execute the prepared statement instance. */ message CommandPreparedStatementQuery { - option (experimental) = true; // Opaque handle for the prepared statement on the server. bytes prepared_statement_handle = 1; @@ -1781,7 +1756,6 @@ message CommandPreparedStatementQuery { * for the RPC call DoPut to cause the server to execute the included SQL update. */ message CommandStatementUpdate { - option (experimental) = true; // The SQL syntax. string query = 1; @@ -1795,7 +1769,6 @@ message CommandStatementUpdate { * prepared statement handle as an update. */ message CommandPreparedStatementUpdate { - option (experimental) = true; // Opaque handle for the prepared statement on the server. bytes prepared_statement_handle = 1; @@ -1807,7 +1780,6 @@ message CommandPreparedStatementUpdate { * FlightData into the target destination. */ message CommandStatementIngest { - option (experimental) = true; // Options for table definition behavior message TableDefinitionOptions { @@ -1866,7 +1838,6 @@ message CommandStatementIngest { * in the request, containing results from the update. */ message DoPutUpdateResult { - option (experimental) = true; // The number of records updated. A return value of -1 represents // an unknown updated record count. @@ -1880,7 +1851,6 @@ message DoPutUpdateResult { * can continue as though the fields in this message were not provided or set to sensible default values. */ message DoPutPreparedStatementResult { - option (experimental) = true; // Represents a (potentially updated) opaque handle for the prepared statement on the server. 
// Because the handle could potentially be updated, any previous handles for this prepared @@ -1912,7 +1882,6 @@ message DoPutPreparedStatementResult { */ message ActionCancelQueryRequest { option deprecated = true; - option (experimental) = true; // The result of the GetFlightInfo RPC that initiated the query. // XXX(ARROW-16902): this must be a serialized FlightInfo, but is @@ -1931,7 +1900,6 @@ message ActionCancelQueryRequest { */ message ActionCancelQueryResult { option deprecated = true; - option (experimental) = true; enum CancelResult { // The cancellation status is unknown. Servers should avoid using diff --git a/go/arrow/cdata/cdata.go b/go/arrow/cdata/cdata.go index b86898277bf47..00d1f351eaf11 100644 --- a/go/arrow/cdata/cdata.go +++ b/go/arrow/cdata/cdata.go @@ -448,6 +448,7 @@ func (imp *cimporter) doImportArr(src *CArrowArray) error { defer func() { if imp.alloc.bufCount == 0 { C.ArrowArrayRelease(imp.arr) + C.free(unsafe.Pointer(imp.arr)) } }() diff --git a/go/arrow/csv/common.go b/go/arrow/csv/common.go index 4455c8b782167..06fed69a77fe5 100644 --- a/go/arrow/csv/common.go +++ b/go/arrow/csv/common.go @@ -239,21 +239,31 @@ func WithStringsReplacer(replacer *strings.Replacer) Option { func validate(schema *arrow.Schema) { for i, f := range schema.Fields() { - switch ft := f.Type.(type) { - case *arrow.BooleanType: - case *arrow.Int8Type, *arrow.Int16Type, *arrow.Int32Type, *arrow.Int64Type: - case *arrow.Uint8Type, *arrow.Uint16Type, *arrow.Uint32Type, *arrow.Uint64Type: - case *arrow.Float16Type, *arrow.Float32Type, *arrow.Float64Type: - case *arrow.StringType, *arrow.LargeStringType: - case *arrow.TimestampType: - case *arrow.Date32Type, *arrow.Date64Type: - case *arrow.Decimal128Type, *arrow.Decimal256Type: - case *arrow.ListType, *arrow.LargeListType, *arrow.FixedSizeListType: - case *arrow.BinaryType, *arrow.LargeBinaryType, *arrow.FixedSizeBinaryType: - case arrow.ExtensionType: - case *arrow.NullType: - default: - panic(fmt.Errorf("arrow/csv: field %d (%s) has invalid data type %T", i, f.Name, ft)) + if !typeSupported(f.Type) { + panic(fmt.Errorf("arrow/csv: field %d (%s) has invalid data type %T", i, f.Name, f.Type)) } } } + +func typeSupported(dt arrow.DataType) bool { + switch dt := dt.(type) { + case *arrow.BooleanType: + case *arrow.Int8Type, *arrow.Int16Type, *arrow.Int32Type, *arrow.Int64Type: + case *arrow.Uint8Type, *arrow.Uint16Type, *arrow.Uint32Type, *arrow.Uint64Type: + case *arrow.Float16Type, *arrow.Float32Type, *arrow.Float64Type: + case *arrow.StringType, *arrow.LargeStringType: + case *arrow.TimestampType: + case *arrow.Date32Type, *arrow.Date64Type: + case *arrow.Decimal128Type, *arrow.Decimal256Type: + case *arrow.MapType: + return false + case arrow.ListLikeType: + return typeSupported(dt.Elem()) + case *arrow.BinaryType, *arrow.LargeBinaryType, *arrow.FixedSizeBinaryType: + case arrow.ExtensionType: + case *arrow.NullType: + default: + return false + } + return true +} diff --git a/go/arrow/csv/reader.go b/go/arrow/csv/reader.go index 18f1083e6a9dc..46591a9a5adee 100644 --- a/go/arrow/csv/reader.go +++ b/go/arrow/csv/reader.go @@ -474,6 +474,10 @@ func (r *Reader) initFieldConverter(bldr array.Builder) func(string) { return func(str string) { r.parseDate32(bldr, str) } + case *arrow.Date64Type: + return func(str string) { + r.parseDate64(bldr, str) + } case *arrow.Time32Type: return func(str string) { r.parseTime32(bldr, str, dt.Unit) @@ -486,17 +490,13 @@ func (r *Reader) initFieldConverter(bldr array.Builder) func(string) { return func(str 
string) { r.parseDecimal256(bldr, str, dt.Precision, dt.Scale) } - case *arrow.ListType: - return func(s string) { - r.parseList(bldr, s) - } - case *arrow.LargeListType: + case *arrow.FixedSizeListType: return func(s string) { - r.parseLargeList(bldr, s) + r.parseFixedSizeList(bldr.(*array.FixedSizeListBuilder), s, int(dt.Len())) } - case *arrow.FixedSizeListType: + case arrow.ListLikeType: return func(s string) { - r.parseFixedSizeList(bldr, s, int(dt.Len())) + r.parseListLike(bldr.(array.ListLikeBuilder), s) } case *arrow.BinaryType: return func(s string) { @@ -740,81 +740,67 @@ func (r *Reader) parseDate32(field array.Builder, str string) { field.(*array.Date32Builder).Append(arrow.Date32FromTime(tm)) } -func (r *Reader) parseTime32(field array.Builder, str string, unit arrow.TimeUnit) { +func (r *Reader) parseDate64(field array.Builder, str string) { if r.isNull(str) { field.AppendNull() return } - val, err := arrow.Time32FromString(str, unit) + tm, err := time.Parse("2006-01-02", str) if err != nil && r.err == nil { r.err = err field.AppendNull() return } - field.(*array.Time32Builder).Append(val) + field.(*array.Date64Builder).Append(arrow.Date64FromTime(tm)) } -func (r *Reader) parseDecimal128(field array.Builder, str string, prec, scale int32) { +func (r *Reader) parseTime32(field array.Builder, str string, unit arrow.TimeUnit) { if r.isNull(str) { field.AppendNull() return } - val, err := decimal128.FromString(str, prec, scale) + val, err := arrow.Time32FromString(str, unit) if err != nil && r.err == nil { r.err = err field.AppendNull() return } - field.(*array.Decimal128Builder).Append(val) + field.(*array.Time32Builder).Append(val) } -func (r *Reader) parseDecimal256(field array.Builder, str string, prec, scale int32) { +func (r *Reader) parseDecimal128(field array.Builder, str string, prec, scale int32) { if r.isNull(str) { field.AppendNull() return } - val, err := decimal256.FromString(str, prec, scale) + val, err := decimal128.FromString(str, prec, scale) if err != nil && r.err == nil { r.err = err field.AppendNull() return } - field.(*array.Decimal256Builder).Append(val) + field.(*array.Decimal128Builder).Append(val) } -func (r *Reader) parseList(field array.Builder, str string) { +func (r *Reader) parseDecimal256(field array.Builder, str string, prec, scale int32) { if r.isNull(str) { field.AppendNull() return } - if !(strings.HasPrefix(str, "{") && strings.HasSuffix(str, "}")) { - r.err = errors.New("invalid list format. 
should start with '{' and end with '}'") - return - } - str = strings.Trim(str, "{}") - listBldr := field.(*array.ListBuilder) - listBldr.Append(true) - if len(str) == 0 { - // we don't want to create the csv reader if we already know the - // string is empty - return - } - valueBldr := listBldr.ValueBuilder() - reader := csv.NewReader(strings.NewReader(str)) - items, err := reader.Read() - if err != nil { + + val, err := decimal256.FromString(str, prec, scale) + if err != nil && r.err == nil { r.err = err + field.AppendNull() return } - for _, str := range items { - r.initFieldConverter(valueBldr)(str) - } + field.(*array.Decimal256Builder).Append(val) } -func (r *Reader) parseLargeList(field array.Builder, str string) { +func (r *Reader) parseListLike(field array.ListLikeBuilder, str string) { if r.isNull(str) { field.AppendNull() return @@ -824,14 +810,13 @@ func (r *Reader) parseLargeList(field array.Builder, str string) { return } str = strings.Trim(str, "{}") - largeListBldr := field.(*array.LargeListBuilder) - largeListBldr.Append(true) + field.Append(true) if len(str) == 0 { // we don't want to create the csv reader if we already know the // string is empty return } - valueBldr := largeListBldr.ValueBuilder() + valueBldr := field.ValueBuilder() reader := csv.NewReader(strings.NewReader(str)) items, err := reader.Read() if err != nil { @@ -843,7 +828,7 @@ func (r *Reader) parseLargeList(field array.Builder, str string) { } } -func (r *Reader) parseFixedSizeList(field array.Builder, str string, n int) { +func (r *Reader) parseFixedSizeList(field *array.FixedSizeListBuilder, str string, n int) { if r.isNull(str) { field.AppendNull() return @@ -853,14 +838,13 @@ func (r *Reader) parseFixedSizeList(field array.Builder, str string, n int) { return } str = strings.Trim(str, "{}") - fixedSizeListBldr := field.(*array.FixedSizeListBuilder) - fixedSizeListBldr.Append(true) + field.Append(true) if len(str) == 0 { // we don't want to create the csv reader if we already know the // string is empty return } - valueBldr := fixedSizeListBldr.ValueBuilder() + valueBldr := field.ValueBuilder() reader := csv.NewReader(strings.NewReader(str)) items, err := reader.Read() if err != nil { diff --git a/go/arrow/csv/reader_test.go b/go/arrow/csv/reader_test.go index b6654dd1984ea..65453db015a7e 100644 --- a/go/arrow/csv/reader_test.go +++ b/go/arrow/csv/reader_test.go @@ -357,6 +357,8 @@ func testCSVReader(t *testing.T, filepath string, withHeader bool, stringsCanBeN {Name: "large_binary", Type: arrow.BinaryTypes.LargeBinary}, {Name: "fixed_size_binary", Type: &arrow.FixedSizeBinaryType{ByteWidth: 3}}, {Name: "uuid", Type: types.NewUUIDType()}, + {Name: "date32", Type: arrow.PrimitiveTypes.Date32}, + {Name: "date64", Type: arrow.PrimitiveTypes.Date64}, }, nil, ) @@ -420,6 +422,8 @@ rec[0]["binary"]: ["\x00\x01\x02"] rec[0]["large_binary"]: ["\x00\x01\x02"] rec[0]["fixed_size_binary"]: ["\x00\x01\x02"] rec[0]["uuid"]: ["00000000-0000-0000-0000-000000000001"] +rec[0]["date32"]: [19121] +rec[0]["date64"]: [1652054400000] rec[1]["bool"]: [false] rec[1]["i8"]: [-2] rec[1]["i16"]: [-2] @@ -442,6 +446,8 @@ rec[1]["binary"]: [(null)] rec[1]["large_binary"]: [(null)] rec[1]["fixed_size_binary"]: [(null)] rec[1]["uuid"]: ["00000000-0000-0000-0000-000000000002"] +rec[1]["date32"]: [19121] +rec[1]["date64"]: [1652054400000] rec[2]["bool"]: [(null)] rec[2]["i8"]: [(null)] rec[2]["i16"]: [(null)] @@ -464,6 +470,8 @@ rec[2]["binary"]: [(null)] rec[2]["large_binary"]: [(null)] rec[2]["fixed_size_binary"]: [(null)] 
rec[2]["uuid"]: [(null)] +rec[2]["date32"]: [(null)] +rec[2]["date64"]: [(null)] `, str1Value, str1Value, str2Value, str2Value) got, want := out.String(), want require.Equal(t, want, got) diff --git a/go/arrow/csv/testdata/header.csv b/go/arrow/csv/testdata/header.csv index 50be4f5e4daca..68ae18a499dee 100644 --- a/go/arrow/csv/testdata/header.csv +++ b/go/arrow/csv/testdata/header.csv @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. # -bool;i8;i16;i32;i64;u8;u16;u32;u64;f16;f32;f64;str;large_str;ts;list(i64);large_list(i64);fixed_size_list(i64);binary;large_binary;fixed_size_binary;uuid -true;-1;-1;-1;-1;1;1;1;1;1.1;1.1;1.1;str-1;str-1;2022-05-09T00:01:01;{1,2,3};{1,2,3};{1,2,3};AAEC;AAEC;AAEC;00000000-0000-0000-0000-000000000001 -false;-2;-2;-2;-2;2;2;2;2;2.2;2.2;2.2;;;2022-05-09T23:59:59;{};{};{4,5,6};;;;00000000-0000-0000-0000-000000000002 -null;NULL;null;N/A;;null;null;null;null;null;null;null;null;null;null;null;null;null;null;null;null;null \ No newline at end of file +bool;i8;i16;i32;i64;u8;u16;u32;u64;f16;f32;f64;str;large_str;ts;list(i64);large_list(i64);fixed_size_list(i64);binary;large_binary;fixed_size_binary;uuid;date32;date64 +true;-1;-1;-1;-1;1;1;1;1;1.1;1.1;1.1;str-1;str-1;2022-05-09T00:01:01;{1,2,3};{1,2,3};{1,2,3};AAEC;AAEC;AAEC;00000000-0000-0000-0000-000000000001;2022-05-09;2022-05-09 +false;-2;-2;-2;-2;2;2;2;2;2.2;2.2;2.2;;;2022-05-09T23:59:59;{};{};{4,5,6};;;;00000000-0000-0000-0000-000000000002;2022-05-09;2022-05-09 +null;NULL;null;N/A;;null;null;null;null;null;null;null;null;null;null;null;null;null;null;null;null;null;null;null \ No newline at end of file diff --git a/go/arrow/csv/testdata/types.csv b/go/arrow/csv/testdata/types.csv index d32941f4b214d..91c0cf3b252b3 100644 --- a/go/arrow/csv/testdata/types.csv +++ b/go/arrow/csv/testdata/types.csv @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. 
# -## supported types: bool;int8;int16;int32;int64;uint8;uint16;uint32;uint64;float16;float32;float64;string;large_string;timestamp;list(i64);large_list(i64);fixed_size_list(i64);binary;large_binary;fixed_size_binary;uuid -true;-1;-1;-1;-1;1;1;1;1;1.1;1.1;1.1;str-1;str-1;2022-05-09T00:01:01;{1,2,3};{1,2,3};{1,2,3};AAEC;AAEC;AAEC;00000000-0000-0000-0000-000000000001 -false;-2;-2;-2;-2;2;2;2;2;2.2;2.2;2.2;;;2022-05-09T23:59:59;{};{};{4,5,6};;;;00000000-0000-0000-0000-000000000002 -null;NULL;null;N/A;;null;null;null;null;null;null;null;null;null;null;null;null;null;null;null;null;null \ No newline at end of file +## supported types: bool;int8;int16;int32;int64;uint8;uint16;uint32;uint64;float16;float32;float64;string;large_string;timestamp;list(i64);large_list(i64);fixed_size_list(i64);binary;large_binary;fixed_size_binary;uuid;date32;date64 +true;-1;-1;-1;-1;1;1;1;1;1.1;1.1;1.1;str-1;str-1;2022-05-09T00:01:01;{1,2,3};{1,2,3};{1,2,3};AAEC;AAEC;AAEC;00000000-0000-0000-0000-000000000001;2022-05-09;2022-05-09 +false;-2;-2;-2;-2;2;2;2;2;2.2;2.2;2.2;;;2022-05-09T23:59:59;{};{};{4,5,6};;;;00000000-0000-0000-0000-000000000002;2022-05-09;2022-05-09 +null;NULL;null;N/A;;null;null;null;null;null;null;null;null;null;null;null;null;null;null;null;null;null;null;null \ No newline at end of file diff --git a/go/arrow/csv/transformer.go b/go/arrow/csv/transformer.go index 90c26ac981078..237437c0441e1 100644 --- a/go/arrow/csv/transformer.go +++ b/go/arrow/csv/transformer.go @@ -29,7 +29,7 @@ import ( "github.com/apache/arrow/go/v17/arrow/array" ) -func (w *Writer) transformColToStringArr(typ arrow.DataType, col arrow.Array, stringsReplacer func(string)string) []string { +func (w *Writer) transformColToStringArr(typ arrow.DataType, col arrow.Array, stringsReplacer func(string) string) []string { res := make([]string, col.Len()) switch typ.(type) { case *arrow.BooleanType: @@ -215,62 +215,25 @@ func (w *Writer) transformColToStringArr(typ arrow.DataType, col arrow.Array, st res[i] = w.nullValue } } - case *arrow.ListType: - arr := col.(*array.List) - listVals, offsets := arr.ListValues(), arr.Offsets() - for i := 0; i < arr.Len(); i++ { - if arr.IsValid(i) { - list := array.NewSlice(listVals, int64(offsets[i]), int64(offsets[i+1])) - var b bytes.Buffer - b.Write([]byte{'{'}) - writer := csv.NewWriter(&b) - writer.Write(w.transformColToStringArr(list.DataType(), list, stringsReplacer)) - writer.Flush() - b.Truncate(b.Len() - 1) - b.Write([]byte{'}'}) - res[i] = b.String() - list.Release() - } else { - res[i] = w.nullValue - } - } - case *arrow.LargeListType: - arr := col.(*array.LargeList) - listVals, offsets := arr.ListValues(), arr.Offsets() - for i := 0; i < arr.Len(); i++ { - if arr.IsValid(i) { - list := array.NewSlice(listVals, int64(offsets[i]), int64(offsets[i+1])) - var b bytes.Buffer - b.Write([]byte{'{'}) - writer := csv.NewWriter(&b) - writer.Write(w.transformColToStringArr(list.DataType(), list, stringsReplacer)) - writer.Flush() - b.Truncate(b.Len() - 1) - b.Write([]byte{'}'}) - res[i] = b.String() - list.Release() - } else { - res[i] = w.nullValue - } - } - case *arrow.FixedSizeListType: - arr := col.(*array.FixedSizeList) + case arrow.ListLikeType: + arr := col.(array.ListLike) listVals := arr.ListValues() for i := 0; i < arr.Len(); i++ { - if arr.IsValid(i) { - list := array.NewSlice(listVals, int64((arr.Len()-1)*i), int64((arr.Len()-1)*(i+1))) - var b bytes.Buffer - b.Write([]byte{'{'}) - writer := csv.NewWriter(&b) - writer.Write(w.transformColToStringArr(list.DataType(), list, 
stringsReplacer)) - writer.Flush() - b.Truncate(b.Len() - 1) - b.Write([]byte{'}'}) - res[i] = b.String() - list.Release() - } else { + if arr.IsNull(i) { res[i] = w.nullValue + continue } + start, end := arr.ValueOffsets(i) + list := array.NewSlice(listVals, start, end) + var b bytes.Buffer + b.Write([]byte{'{'}) + writer := csv.NewWriter(&b) + writer.Write(w.transformColToStringArr(list.DataType(), list, stringsReplacer)) + writer.Flush() + b.Truncate(b.Len() - 1) + b.Write([]byte{'}'}) + res[i] = b.String() + list.Release() } case *arrow.BinaryType: arr := col.(*array.Binary) diff --git a/go/arrow/flight/flightsql/client.go b/go/arrow/flight/flightsql/client.go index e594191c35fdf..c6794820dc172 100644 --- a/go/arrow/flight/flightsql/client.go +++ b/go/arrow/flight/flightsql/client.go @@ -1119,24 +1119,10 @@ func (p *PreparedStatement) Execute(ctx context.Context, opts ...grpc.CallOption return nil, err } - if p.hasBindParameters() { - pstream, err := p.client.Client.DoPut(ctx, opts...) - if err != nil { - return nil, err - } - wr, err := p.writeBindParameters(pstream, desc) - if err != nil { - return nil, err - } - if err = wr.Close(); err != nil { - return nil, err - } - pstream.CloseSend() - if err = p.captureDoPutPreparedStatementHandle(pstream); err != nil { - return nil, err - } + desc, err = p.bindParameters(ctx, desc, opts...) + if err != nil { + return nil, err } - return p.client.getFlightInfo(ctx, desc, opts...) } @@ -1156,23 +1142,9 @@ func (p *PreparedStatement) ExecutePut(ctx context.Context, opts ...grpc.CallOpt return err } - if p.hasBindParameters() { - pstream, err := p.client.Client.DoPut(ctx, opts...) - if err != nil { - return err - } - - wr, err := p.writeBindParameters(pstream, desc) - if err != nil { - return err - } - if err = wr.Close(); err != nil { - return err - } - pstream.CloseSend() - if err = p.captureDoPutPreparedStatementHandle(pstream); err != nil { - return err - } + _, err = p.bindParameters(ctx, desc, opts...) + if err != nil { + return err } return nil @@ -1200,23 +1172,9 @@ func (p *PreparedStatement) ExecutePoll(ctx context.Context, retryDescriptor *fl } if retryDescriptor == nil { - if p.hasBindParameters() { - pstream, err := p.client.Client.DoPut(ctx, opts...) - if err != nil { - return nil, err - } - - wr, err := p.writeBindParameters(pstream, desc) - if err != nil { - return nil, err - } - if err = wr.Close(); err != nil { - return nil, err - } - pstream.CloseSend() - if err = p.captureDoPutPreparedStatementHandle(pstream); err != nil { - return nil, err - } + desc, err = p.bindParameters(ctx, desc, opts...) + if err != nil { + return nil, err } } return p.client.Client.PollFlightInfo(ctx, desc, opts...) @@ -1248,7 +1206,7 @@ func (p *PreparedStatement) ExecuteUpdate(ctx context.Context, opts ...grpc.Call return } if p.hasBindParameters() { - wr, err = p.writeBindParameters(pstream, desc) + wr, err = p.writeBindParametersToStream(pstream, desc) if err != nil { return } @@ -1283,7 +1241,36 @@ func (p *PreparedStatement) hasBindParameters() bool { return (p.paramBinding != nil && p.paramBinding.NumRows() > 0) || (p.streamBinding != nil) } -func (p *PreparedStatement) writeBindParameters(pstream pb.FlightService_DoPutClient, desc *pb.FlightDescriptor) (*flight.Writer, error) { +func (p *PreparedStatement) bindParameters(ctx context.Context, desc *pb.FlightDescriptor, opts ...grpc.CallOption) (*flight.FlightDescriptor, error) { + if p.hasBindParameters() { + pstream, err := p.client.Client.DoPut(ctx, opts...) 
+ if err != nil { + return nil, err + } + wr, err := p.writeBindParametersToStream(pstream, desc) + if err != nil { + return nil, err + } + if err = wr.Close(); err != nil { + return nil, err + } + pstream.CloseSend() + if err = p.captureDoPutPreparedStatementHandle(pstream); err != nil { + return nil, err + } + + cmd := pb.CommandPreparedStatementQuery{PreparedStatementHandle: p.handle} + desc, err = descForCommand(&cmd) + if err != nil { + return nil, err + } + return desc, nil + } + return desc, nil +} + +// XXX: this does not capture the updated handle. Prefer bindParameters. +func (p *PreparedStatement) writeBindParametersToStream(pstream pb.FlightService_DoPutClient, desc *pb.FlightDescriptor) (*flight.Writer, error) { if p.paramBinding != nil { wr := flight.NewRecordWriter(pstream, ipc.WithSchema(p.paramBinding.Schema())) wr.SetFlightDescriptor(desc) diff --git a/go/arrow/flight/flightsql/client_test.go b/go/arrow/flight/flightsql/client_test.go index 727fe02aa7063..33da79167c4ae 100644 --- a/go/arrow/flight/flightsql/client_test.go +++ b/go/arrow/flight/flightsql/client_test.go @@ -448,9 +448,9 @@ func (s *FlightSqlClientSuite) TestPreparedStatementExecuteParamBinding() { expectedDesc := getDesc(&pb.CommandPreparedStatementQuery{PreparedStatementHandle: []byte(handle)}) // mocked DoPut result - doPutPreparedStatementResult := &pb.DoPutPreparedStatementResult{PreparedStatementHandle: []byte(updatedHandle)} + doPutPreparedStatementResult := &pb.DoPutPreparedStatementResult{PreparedStatementHandle: []byte(updatedHandle)} resdata, _ := proto.Marshal(doPutPreparedStatementResult) - putResult := &pb.PutResult{ AppMetadata: resdata } + putResult := &pb.PutResult{AppMetadata: resdata} // mocked client stream for DoPut mockedPut := &mockDoPutClient{} @@ -461,7 +461,7 @@ func (s *FlightSqlClientSuite) TestPreparedStatementExecuteParamBinding() { mockedPut.On("CloseSend").Return(nil) mockedPut.On("Recv").Return(putResult, nil) - infoCmd := &pb.CommandPreparedStatementQuery{PreparedStatementHandle: []byte(handle)} + infoCmd := &pb.CommandPreparedStatementQuery{PreparedStatementHandle: []byte(updatedHandle)} desc := getDesc(infoCmd) s.mockClient.On("GetFlightInfo", desc.Type, desc.Cmd, s.callOpts).Return(&emptyFlightInfo, nil) @@ -525,9 +525,9 @@ func (s *FlightSqlClientSuite) TestPreparedStatementExecuteReaderBinding() { expectedDesc := getDesc(&pb.CommandPreparedStatementQuery{PreparedStatementHandle: []byte(query)}) // mocked DoPut result - doPutPreparedStatementResult := &pb.DoPutPreparedStatementResult{PreparedStatementHandle: []byte(query)} + doPutPreparedStatementResult := &pb.DoPutPreparedStatementResult{PreparedStatementHandle: []byte(query)} resdata, _ := proto.Marshal(doPutPreparedStatementResult) - putResult := &pb.PutResult{ AppMetadata: resdata } + putResult := &pb.PutResult{AppMetadata: resdata} // mocked client stream for DoPut mockedPut := &mockDoPutClient{} diff --git a/go/go.mod b/go/go.mod index 972940ee3c299..8fdfea3dbe5eb 100644 --- a/go/go.mod +++ b/go/go.mod @@ -21,7 +21,7 @@ go 1.21 require ( github.com/JohnCGriffin/overflow v0.0.0-20211019200055-46fa312c352c github.com/andybalholm/brotli v1.1.0 - github.com/apache/thrift v0.19.0 + github.com/apache/thrift v0.20.0 github.com/docopt/docopt-go v0.0.0-20180111231733-ee0de3bc6815 github.com/goccy/go-json v0.10.2 github.com/golang/snappy v0.0.4 @@ -36,18 +36,18 @@ require ( github.com/zeebo/xxh3 v1.0.2 golang.org/x/exp v0.0.0-20240222234643-814bf88cf225 
golang.org/x/sync v0.7.0 - golang.org/x/sys v0.19.0 - golang.org/x/tools v0.20.0 + golang.org/x/sys v0.20.0 + golang.org/x/tools v0.21.0 golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028 gonum.org/v1/gonum v0.15.0 google.golang.org/grpc v1.63.2 - google.golang.org/protobuf v1.33.0 + google.golang.org/protobuf v1.34.1 modernc.org/sqlite v1.29.6 ) require ( github.com/google/uuid v1.6.0 - github.com/hamba/avro/v2 v2.20.1 + github.com/hamba/avro/v2 v2.21.1 github.com/substrait-io/substrait-go v0.4.2 github.com/tidwall/sjson v1.2.5 ) @@ -75,8 +75,8 @@ require ( github.com/tidwall/match v1.1.1 // indirect github.com/tidwall/pretty v1.2.0 // indirect golang.org/x/mod v0.17.0 // indirect - golang.org/x/net v0.24.0 // indirect - golang.org/x/text v0.14.0 // indirect + golang.org/x/net v0.25.0 // indirect + golang.org/x/text v0.15.0 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20240227224415-6ceb2ff114de // indirect gopkg.in/yaml.v3 v3.0.1 // indirect modernc.org/gc/v3 v3.0.0-20240107210532-573471604cb6 // indirect diff --git a/go/go.sum b/go/go.sum index 0a45cb751f77e..c2db1a72ccf2d 100644 --- a/go/go.sum +++ b/go/go.sum @@ -8,8 +8,8 @@ github.com/alecthomas/repr v0.2.0 h1:HAzS41CIzNW5syS8Mf9UwXhNH1J9aix/BvDRf1Ml2Yk github.com/alecthomas/repr v0.2.0/go.mod h1:Fr0507jx4eOXV7AlPV6AVZLYrLIuIeSOWtW57eE/O/4= github.com/andybalholm/brotli v1.1.0 h1:eLKJA0d02Lf0mVpIDgYnqXcUn0GqVmEFny3VuID1U3M= github.com/andybalholm/brotli v1.1.0/go.mod h1:sms7XGricyQI9K10gOSf56VKKWS4oLer58Q+mhRPtnY= -github.com/apache/thrift v0.19.0 h1:sOqkWPzMj7w6XaYbJQG7m4sGqVolaW/0D28Ln7yPzMk= -github.com/apache/thrift v0.19.0/go.mod h1:SUALL216IiaOw2Oy+5Vs9lboJ/t9g40C+G07Dc0QC1I= +github.com/apache/thrift v0.20.0 h1:631+KvYbsBZxmuJjYwhezVsrfc/TbqtZV4QcxOX1fOI= +github.com/apache/thrift v0.20.0/go.mod h1:hOk1BQqcp2OLzGsyVXdfMk7YFlMxK3aoEVhjD06QhB8= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= @@ -43,8 +43,8 @@ github.com/google/pprof v0.0.0-20221118152302-e6195bd50e26 h1:Xim43kblpZXfIBQsbu github.com/google/pprof v0.0.0-20221118152302-e6195bd50e26/go.mod h1:dDKJzRmX4S37WGHujM7tX//fmj1uioxKzKxz3lo4HJo= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/hamba/avro/v2 v2.20.1 h1:3WByQiVn7wT7d27WQq6pvBRC00FVOrniP6u67FLA/2E= -github.com/hamba/avro/v2 v2.20.1/go.mod h1:xHiKXbISpb3Ovc809XdzWow+XGTn+Oyf/F9aZbTLAig= +github.com/hamba/avro/v2 v2.21.1 h1:400/jTdLWQ3ib58y83VXlTJKijRouYQszY1SO0cMGt4= +github.com/hamba/avro/v2 v2.21.1/go.mod h1:ouJ4PkiAEP49u0lAtQyd5Gv04MehKj+7lXwD3zpLpY0= github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= github.com/hexops/gotextdiff v1.0.3 h1:gitA9+qJrrTCsiCl7+kh75nPqQt1cx4ZkudSTLoUqJM= @@ -111,25 +111,25 @@ github.com/zeebo/assert v1.3.0 h1:g7C04CbJuIDKNPFHmsk4hwZDO5O+kntRxzaUoNXj+IQ= github.com/zeebo/assert v1.3.0/go.mod h1:Pq9JiuJQpG8JLJdtkwrJESF0Foym2/D9XMU5ciN/wJ0= 
github.com/zeebo/xxh3 v1.0.2 h1:xZmwmqxHZA8AI603jOQ0tMqmBr9lPeFwGg6d+xy9DC0= github.com/zeebo/xxh3 v1.0.2/go.mod h1:5NWz9Sef7zIDm2JHfFlcQvNekmcEl9ekUZQQKCYaDcA= -golang.org/x/crypto v0.22.0 h1:g1v0xeRhjcugydODzvb3mEM9SQ0HGp9s/nh3COQ/C30= -golang.org/x/crypto v0.22.0/go.mod h1:vr6Su+7cTlO45qkww3VDJlzDn0ctJvRgYbC2NvXHt+M= +golang.org/x/crypto v0.23.0 h1:dIJU/v2J8Mdglj/8rJ6UUOM3Zc9zLZxVZwwxMooUSAI= +golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8= golang.org/x/exp v0.0.0-20240222234643-814bf88cf225 h1:LfspQV/FYTatPTr/3HzIcmiUFH7PGP+OQ6mgDYo3yuQ= golang.org/x/exp v0.0.0-20240222234643-814bf88cf225/go.mod h1:CxmFvTBINI24O/j8iY7H1xHzx2i4OsyguNBmN/uPtqc= golang.org/x/mod v0.17.0 h1:zY54UmvipHiNd+pm+m0x9KhZ9hl1/7QNMyxXbc6ICqA= golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= -golang.org/x/net v0.24.0 h1:1PcaxkF854Fu3+lvBIx5SYn9wRlBzzcnHZSiaFFAb0w= -golang.org/x/net v0.24.0/go.mod h1:2Q7sJY5mzlzWjKtYUEXSlBWCdyaioyXzRB2RtU8KVE8= +golang.org/x/net v0.25.0 h1:d/OCCoBEUq33pjydKrGQhw7IlUPI2Oylr+8qLx49kac= +golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= golang.org/x/sync v0.7.0 h1:YsImfSBoP9QPYL0xyKJPq0gcaJdG3rInoqxTWbfQu9M= golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.19.0 h1:q5f1RH2jigJ1MoAWp2KTp3gm5zAGFUTarQZ5U386+4o= -golang.org/x/sys v0.19.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ= -golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= -golang.org/x/tools v0.20.0 h1:hz/CVckiOxybQvFw6h7b/q80NTr9IUQb4s1IIzW7KNY= -golang.org/x/tools v0.20.0/go.mod h1:WvitBU7JJf6A4jOdg4S1tviW9bhUxkgeCui/0JHctQg= +golang.org/x/sys v0.20.0 h1:Od9JTbYCk261bKm4M/mw7AklTlFYIa0bIp9BgSm1S8Y= +golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/text v0.15.0 h1:h1V/4gjBv8v9cjcR6+AR5+/cIYK5N/WAgiv4xlsEtAk= +golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/tools v0.21.0 h1:qc0xYgIbsSDt9EyWz05J5wfa7LOVW0YTLOXrqdLAWIw= +golang.org/x/tools v0.21.0/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk= golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028 h1:+cNy6SZtPcJQH3LJVLOSmiC7MMxXNOb3PU/VUEz+EhU= golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028/go.mod h1:NDW/Ps6MPRej6fsCIbMTohpP40sJ/P/vI1MoTEGwX90= gonum.org/v1/gonum v0.15.0 h1:2lYxjRbTYyxkJxlhC+LvJIx3SsANPdRybu1tGj9/OrQ= @@ -138,8 +138,8 @@ google.golang.org/genproto/googleapis/rpc v0.0.0-20240227224415-6ceb2ff114de h1: google.golang.org/genproto/googleapis/rpc v0.0.0-20240227224415-6ceb2ff114de/go.mod h1:H4O17MA/PE9BsGx3w+a+W2VOLLD1Qf7oJneAoU6WktY= google.golang.org/grpc v1.63.2 h1:MUeiw1B2maTVZthpU5xvASfTh3LDbxHd6IJ6QQVU+xM= google.golang.org/grpc v1.63.2/go.mod h1:WAX/8DgncnokcFUldAxq7GeB5DXHDbMF+lLvDomNkRA= -google.golang.org/protobuf v1.33.0 h1:uNO2rsAINq/JlFpSdYEKIZ0uKD/R9cpdv0T+yoGwGmI= -google.golang.org/protobuf v1.33.0/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos= +google.golang.org/protobuf v1.34.1 h1:9ddQBjfCyZPOHPUiPxpYESBLc+T8P3E+Vo4IbKZgFWg= +google.golang.org/protobuf v1.34.1/go.mod 
h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= diff --git a/go/parquet/internal/encoding/types.go b/go/parquet/internal/encoding/types.go index 51f48c797488f..147c1746c515a 100644 --- a/go/parquet/internal/encoding/types.go +++ b/go/parquet/internal/encoding/types.go @@ -185,7 +185,7 @@ func (b *PooledBufferWriter) Reserve(nbytes int) { b.buf = bufferPool.Get().(*memory.Buffer) } - newCap := utils.Max(b.buf.Cap()+b.offset, 256) + newCap := utils.Max(b.buf.Cap(), 256) for newCap < b.pos+nbytes { newCap = bitutil.NextPowerOf2(newCap) } diff --git a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/binder/ColumnBinderArrowTypeVisitor.java b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/binder/ColumnBinderArrowTypeVisitor.java index 7d50676688e0f..7420a8c23dd48 100644 --- a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/binder/ColumnBinderArrowTypeVisitor.java +++ b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/binder/ColumnBinderArrowTypeVisitor.java @@ -256,4 +256,9 @@ public ColumnBinder visit(ArrowType.Interval type) { public ColumnBinder visit(ArrowType.Duration type) { throw new UnsupportedOperationException("No column binder implemented for type " + type); } + + @Override + public ColumnBinder visit(ArrowType.ListView type) { + throw new UnsupportedOperationException("No column binder implemented for type " + type); + } } diff --git a/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java b/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java index bc6139cc84c54..99873dadad242 100644 --- a/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java +++ b/java/c/src/main/java/org/apache/arrow/c/BufferImportTypeVisitor.java @@ -53,6 +53,7 @@ import org.apache.arrow.vector.complex.UnionVector; import org.apache.arrow.vector.ipc.message.ArrowFieldNode; import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.ArrowType.ListView; import org.apache.arrow.vector.util.DataSizeRoundingUtil; /** @@ -328,4 +329,9 @@ public List visit(ArrowType.Interval type) { public List visit(ArrowType.Duration type) { return Arrays.asList(maybeImportBitmap(type), importFixedBytes(type, 1, DurationVector.TYPE_WIDTH)); } + + @Override + public List visit(ListView type) { + throw new UnsupportedOperationException("Importing buffers for view type: " + type + " not supported"); + } } diff --git a/java/flight/flight-core/pom.xml b/java/flight/flight-core/pom.xml index 163b4c24031b1..4c1002ae75f04 100644 --- a/java/flight/flight-core/pom.xml +++ b/java/flight/flight-core/pom.xml @@ -20,7 +20,7 @@ flight-core jar Arrow Flight Core - (Experimental)An RPC mechanism for transferring ValueVectors. + An RPC mechanism for transferring ValueVectors. 
1 diff --git a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/CallStatus.java b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/CallStatus.java index 991d0ed6a043b..8fc2002207e24 100644 --- a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/CallStatus.java +++ b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/CallStatus.java @@ -49,6 +49,7 @@ public class CallStatus { public static final CallStatus UNAUTHORIZED = FlightStatusCode.UNAUTHORIZED.toStatus(); public static final CallStatus UNIMPLEMENTED = FlightStatusCode.UNIMPLEMENTED.toStatus(); public static final CallStatus UNAVAILABLE = FlightStatusCode.UNAVAILABLE.toStatus(); + public static final CallStatus RESOURCE_EXHAUSTED = FlightStatusCode.RESOURCE_EXHAUSTED.toStatus(); /** * Create a new status. diff --git a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightStatusCode.java b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightStatusCode.java index 3d96877ba02de..09a2c7afda106 100644 --- a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightStatusCode.java +++ b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/FlightStatusCode.java @@ -71,6 +71,11 @@ public enum FlightStatusCode { * should send this code only if it has not done any work. */ UNAVAILABLE, + /** + * Some resource has been exhausted, perhaps a per-user quota, or perhaps the entire file system is out of space. + * (see: https://grpc.github.io/grpc/core/md_doc_statuscodes.html) + */ + RESOURCE_EXHAUSTED ; /** diff --git a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/ServerSessionMiddleware.java b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/ServerSessionMiddleware.java index 7091caa5e98bc..af22cd8aade22 100644 --- a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/ServerSessionMiddleware.java +++ b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/ServerSessionMiddleware.java @@ -26,8 +26,6 @@ /** * Middleware for handling Flight SQL Sessions including session cookie handling. - * - * Currently experimental. 
*/ public class ServerSessionMiddleware implements FlightServerMiddleware { Factory factory; diff --git a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/grpc/StatusUtils.java b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/grpc/StatusUtils.java index 7f0dcf2da3f0d..a2d9a85aaa442 100644 --- a/java/flight/flight-core/src/main/java/org/apache/arrow/flight/grpc/StatusUtils.java +++ b/java/flight/flight-core/src/main/java/org/apache/arrow/flight/grpc/StatusUtils.java @@ -74,6 +74,8 @@ public static Status.Code toGrpcStatusCode(FlightStatusCode code) { return Code.UNIMPLEMENTED; case UNAVAILABLE: return Code.UNAVAILABLE; + case RESOURCE_EXHAUSTED: + return Code.RESOURCE_EXHAUSTED; default: return Code.UNKNOWN; } @@ -101,7 +103,7 @@ public static FlightStatusCode fromGrpcStatusCode(Status.Code code) { case PERMISSION_DENIED: return FlightStatusCode.UNAUTHORIZED; case RESOURCE_EXHAUSTED: - return FlightStatusCode.INVALID_ARGUMENT; + return FlightStatusCode.RESOURCE_EXHAUSTED; case FAILED_PRECONDITION: return FlightStatusCode.INVALID_ARGUMENT; case ABORTED: diff --git a/java/flight/flight-core/src/test/java/org/apache/arrow/flight/grpc/TestStatusUtils.java b/java/flight/flight-core/src/test/java/org/apache/arrow/flight/grpc/TestStatusUtils.java index 9912a26ea340a..730ea30a2f598 100644 --- a/java/flight/flight-core/src/test/java/org/apache/arrow/flight/grpc/TestStatusUtils.java +++ b/java/flight/flight-core/src/test/java/org/apache/arrow/flight/grpc/TestStatusUtils.java @@ -48,4 +48,26 @@ public void testParseTrailers() { Assertions.assertTrue(callStatus.metadata().containsKey("content-type")); Assertions.assertEquals("text/html", callStatus.metadata().get("content-type")); } + + @Test + public void testGrpcResourceExhaustedTranslatedToFlightStatus() { + Status status = Status.RESOURCE_EXHAUSTED; + + CallStatus callStatus = StatusUtils.fromGrpcStatus(status); + Assertions.assertEquals(FlightStatusCode.RESOURCE_EXHAUSTED, callStatus.code()); + + FlightStatusCode flightStatusCode = StatusUtils.fromGrpcStatusCode(status.getCode()); + Assertions.assertEquals(FlightStatusCode.RESOURCE_EXHAUSTED, flightStatusCode); + } + + @Test + public void testFlightResourceExhaustedTranslatedToGrpcStatus() { + CallStatus callStatus = CallStatus.RESOURCE_EXHAUSTED; + + Status.Code grpcStatusCode = StatusUtils.toGrpcStatusCode(callStatus.code()); + Assertions.assertEquals(Status.RESOURCE_EXHAUSTED.getCode(), grpcStatusCode); + + Status grpcStatus = StatusUtils.toGrpcStatus(callStatus); + Assertions.assertEquals(Status.RESOURCE_EXHAUSTED.getCode(), grpcStatus.getCode()); + } } diff --git a/java/flight/flight-sql-jdbc-core/pom.xml b/java/flight/flight-sql-jdbc-core/pom.xml index 2e0de90fcf8bc..ef3f2469b73dd 100644 --- a/java/flight/flight-sql-jdbc-core/pom.xml +++ b/java/flight/flight-sql-jdbc-core/pom.xml @@ -126,8 +126,8 @@ org.bouncycastle - bcpkix-jdk15on - 1.70 + bcpkix-jdk18on + 1.78.1 diff --git a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/AvaticaParameterBinder.java b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/AvaticaParameterBinder.java index fd9127c226910..70a58ff440ed4 100644 --- a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/AvaticaParameterBinder.java +++ b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/AvaticaParameterBinder.java @@ -254,6 +254,11 @@ public Boolean visit(ArrowType.Interval type) { public Boolean 
visit(ArrowType.Duration type) { return new DurationAvaticaParameterConverter(type).bindParameter(vector, typedValue, index); } + + @Override + public Boolean visit(ArrowType.ListView type) { + throw new UnsupportedOperationException("Binding is not yet supported for type " + type); + } } } diff --git a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/ConvertUtils.java b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/ConvertUtils.java index 93b5faaef32c7..6ec33fafcfa46 100644 --- a/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/ConvertUtils.java +++ b/java/flight/flight-sql-jdbc-core/src/main/java/org/apache/arrow/driver/jdbc/utils/ConvertUtils.java @@ -274,6 +274,11 @@ public AvaticaParameter visit(ArrowType.Interval type) { public AvaticaParameter visit(ArrowType.Duration type) { return new DurationAvaticaParameterConverter(type).createParameter(field); } + + @Override + public AvaticaParameter visit(ArrowType.ListView type) { + throw new UnsupportedOperationException("AvaticaParameter not yet supported for type " + type); + } } } diff --git a/java/flight/flight-sql/pom.xml b/java/flight/flight-sql/pom.xml index cf466ab1720cf..f5926d6e68485 100644 --- a/java/flight/flight-sql/pom.xml +++ b/java/flight/flight-sql/pom.xml @@ -20,7 +20,7 @@ flight-sql jar Arrow Flight SQL - (Experimental)Contains utility classes to expose Flight SQL semantics for clients and servers over Arrow Flight + Contains utility classes to expose Flight SQL semantics for clients and servers over Arrow Flight 1 diff --git a/java/performance/pom.xml b/java/performance/pom.xml index c819e6393d78f..e9023ece080a3 100644 --- a/java/performance/pom.xml +++ b/java/performance/pom.xml @@ -40,61 +40,61 @@ org.openjdk.jmh jmh-core ${jmh.version} - test - - - org.openjdk.jmh - jmh-generator-annprocess - ${jmh.version} - provided org.apache.arrow arrow-vector ${arrow.vector.classifier} - test org.apache.arrow arrow-memory-core - test org.apache.arrow arrow-memory-netty - test + runtime org.apache.avro avro ${dep.avro.version} - test org.apache.arrow arrow-avro - test com.h2database h2 2.2.224 - test + runtime org.apache.arrow arrow-jdbc - test org.apache.arrow arrow-algorithm - test + + org.apache.maven.plugins + maven-compiler-plugin + + + + org.openjdk.jmh + jmh-generator-annprocess + ${jmh.version} + + + + org.apache.maven.plugins maven-shade-plugin diff --git a/java/performance/src/test/java/org/apache/arrow/adapter/AvroAdapterBenchmarks.java b/java/performance/src/main/java/org/apache/arrow/adapter/AvroAdapterBenchmarks.java similarity index 100% rename from java/performance/src/test/java/org/apache/arrow/adapter/AvroAdapterBenchmarks.java rename to java/performance/src/main/java/org/apache/arrow/adapter/AvroAdapterBenchmarks.java diff --git a/java/performance/src/test/java/org/apache/arrow/adapter/jdbc/JdbcAdapterBenchmarks.java b/java/performance/src/main/java/org/apache/arrow/adapter/jdbc/JdbcAdapterBenchmarks.java similarity index 99% rename from java/performance/src/test/java/org/apache/arrow/adapter/jdbc/JdbcAdapterBenchmarks.java rename to java/performance/src/main/java/org/apache/arrow/adapter/jdbc/JdbcAdapterBenchmarks.java index fd3940b4c872c..f6dab83b7cd0c 100644 --- a/java/performance/src/test/java/org/apache/arrow/adapter/jdbc/JdbcAdapterBenchmarks.java +++ b/java/performance/src/main/java/org/apache/arrow/adapter/jdbc/JdbcAdapterBenchmarks.java @@ -54,6 +54,7 @@ * Benchmarks for Jdbc adapter. 
*/ public class JdbcAdapterBenchmarks { + // checkstyle:off: MissingJavadocMethod private static final int VALUE_COUNT = 3000; @@ -355,5 +356,6 @@ public static void main(String[] args) throws RunnerException { new Runner(opt).run(); } + // checkstyle:on: MissingJavadocMethod } diff --git a/java/performance/src/test/java/org/apache/arrow/algorithm/search/ParallelSearcherBenchmarks.java b/java/performance/src/main/java/org/apache/arrow/algorithm/search/ParallelSearcherBenchmarks.java similarity index 97% rename from java/performance/src/test/java/org/apache/arrow/algorithm/search/ParallelSearcherBenchmarks.java rename to java/performance/src/main/java/org/apache/arrow/algorithm/search/ParallelSearcherBenchmarks.java index 1c3af77e73a05..c9fc5cc4bef9c 100644 --- a/java/performance/src/test/java/org/apache/arrow/algorithm/search/ParallelSearcherBenchmarks.java +++ b/java/performance/src/main/java/org/apache/arrow/algorithm/search/ParallelSearcherBenchmarks.java @@ -43,6 +43,7 @@ * Benchmarks for {@link ParallelSearcher}. */ public class ParallelSearcherBenchmarks { + // checkstyle:off: MissingJavadocMethod private static final int VECTOR_LENGTH = 1024 * 1024; @@ -112,4 +113,5 @@ public static void main(String[] args) throws RunnerException { new Runner(opt).run(); } + // checkstyle:on: MissingJavadocMethod } diff --git a/java/performance/src/test/java/org/apache/arrow/memory/AllocatorBenchmarks.java b/java/performance/src/main/java/org/apache/arrow/memory/AllocatorBenchmarks.java similarity index 100% rename from java/performance/src/test/java/org/apache/arrow/memory/AllocatorBenchmarks.java rename to java/performance/src/main/java/org/apache/arrow/memory/AllocatorBenchmarks.java diff --git a/java/performance/src/test/java/org/apache/arrow/memory/ArrowBufBenchmarks.java b/java/performance/src/main/java/org/apache/arrow/memory/ArrowBufBenchmarks.java similarity index 100% rename from java/performance/src/test/java/org/apache/arrow/memory/ArrowBufBenchmarks.java rename to java/performance/src/main/java/org/apache/arrow/memory/ArrowBufBenchmarks.java diff --git a/java/performance/src/test/java/org/apache/arrow/memory/util/ArrowBufPointerBenchmarks.java b/java/performance/src/main/java/org/apache/arrow/memory/util/ArrowBufPointerBenchmarks.java similarity index 100% rename from java/performance/src/test/java/org/apache/arrow/memory/util/ArrowBufPointerBenchmarks.java rename to java/performance/src/main/java/org/apache/arrow/memory/util/ArrowBufPointerBenchmarks.java diff --git a/java/performance/src/test/java/org/apache/arrow/memory/util/ByteFunctionHelpersBenchmarks.java b/java/performance/src/main/java/org/apache/arrow/memory/util/ByteFunctionHelpersBenchmarks.java similarity index 98% rename from java/performance/src/test/java/org/apache/arrow/memory/util/ByteFunctionHelpersBenchmarks.java rename to java/performance/src/main/java/org/apache/arrow/memory/util/ByteFunctionHelpersBenchmarks.java index 4d0dfcb5da80d..f1dc2d79eff83 100644 --- a/java/performance/src/test/java/org/apache/arrow/memory/util/ByteFunctionHelpersBenchmarks.java +++ b/java/performance/src/main/java/org/apache/arrow/memory/util/ByteFunctionHelpersBenchmarks.java @@ -48,7 +48,7 @@ public class ByteFunctionHelpersBenchmarks { */ @State(Scope.Benchmark) public static class ArrowEqualState { - + // checkstyle:off: MissingJavadocMethod private static final int BUFFER_CAPACITY = 7; private BufferAllocator allocator; @@ -135,4 +135,5 @@ public static void main(String[] args) throws RunnerException { new Runner(opt).run(); } + // 
checkstyle:on: MissingJavadocMethod } diff --git a/java/performance/src/test/java/org/apache/arrow/vector/BaseValueVectorBenchmarks.java b/java/performance/src/main/java/org/apache/arrow/vector/BaseValueVectorBenchmarks.java similarity index 100% rename from java/performance/src/test/java/org/apache/arrow/vector/BaseValueVectorBenchmarks.java rename to java/performance/src/main/java/org/apache/arrow/vector/BaseValueVectorBenchmarks.java diff --git a/java/performance/src/test/java/org/apache/arrow/vector/BitVectorHelperBenchmarks.java b/java/performance/src/main/java/org/apache/arrow/vector/BitVectorHelperBenchmarks.java similarity index 98% rename from java/performance/src/test/java/org/apache/arrow/vector/BitVectorHelperBenchmarks.java rename to java/performance/src/main/java/org/apache/arrow/vector/BitVectorHelperBenchmarks.java index 5f6e5ca28fbab..e29b889c6e7a8 100644 --- a/java/performance/src/test/java/org/apache/arrow/vector/BitVectorHelperBenchmarks.java +++ b/java/performance/src/main/java/org/apache/arrow/vector/BitVectorHelperBenchmarks.java @@ -41,6 +41,7 @@ * Benchmarks for {@link BitVectorHelper}. */ public class BitVectorHelperBenchmarks { + // checkstyle:off: MissingJavadocMethod /** * State object for general benchmarks. @@ -226,4 +227,5 @@ public static void main(String [] args) throws RunnerException { new Runner(opt).run(); } + // checkstyle:on: MissingJavadocMethod } diff --git a/java/performance/src/test/java/org/apache/arrow/vector/DecimalVectorBenchmarks.java b/java/performance/src/main/java/org/apache/arrow/vector/DecimalVectorBenchmarks.java similarity index 100% rename from java/performance/src/test/java/org/apache/arrow/vector/DecimalVectorBenchmarks.java rename to java/performance/src/main/java/org/apache/arrow/vector/DecimalVectorBenchmarks.java diff --git a/java/performance/src/test/java/org/apache/arrow/vector/Float8Benchmarks.java b/java/performance/src/main/java/org/apache/arrow/vector/Float8Benchmarks.java similarity index 97% rename from java/performance/src/test/java/org/apache/arrow/vector/Float8Benchmarks.java rename to java/performance/src/main/java/org/apache/arrow/vector/Float8Benchmarks.java index 874e0d9f82ee7..36a633e5e1b6e 100644 --- a/java/performance/src/test/java/org/apache/arrow/vector/Float8Benchmarks.java +++ b/java/performance/src/main/java/org/apache/arrow/vector/Float8Benchmarks.java @@ -40,6 +40,7 @@ */ @State(Scope.Benchmark) public class Float8Benchmarks { + // checkstyle:off: MissingJavadocMethod private static final int VECTOR_LENGTH = 1024; @@ -119,4 +120,5 @@ public static void main(String [] args) throws RunnerException { new Runner(opt).run(); } + // checkstyle:on: MissingJavadocMethod } diff --git a/java/performance/src/test/java/org/apache/arrow/vector/FloatingPointBenchmarks.java b/java/performance/src/main/java/org/apache/arrow/vector/FloatingPointBenchmarks.java similarity index 98% rename from java/performance/src/test/java/org/apache/arrow/vector/FloatingPointBenchmarks.java rename to java/performance/src/main/java/org/apache/arrow/vector/FloatingPointBenchmarks.java index 079672e9f2a98..2938591737f06 100644 --- a/java/performance/src/test/java/org/apache/arrow/vector/FloatingPointBenchmarks.java +++ b/java/performance/src/main/java/org/apache/arrow/vector/FloatingPointBenchmarks.java @@ -41,6 +41,7 @@ */ @State(Scope.Benchmark) public class FloatingPointBenchmarks { + // checkstyle:off: MissingJavadocMethod private static final int VECTOR_LENGTH = 1024; @@ -130,5 +131,6 @@ public static void main(String [] args) throws 
RunnerException { new Runner(opt).run(); } + // checkstyle:on: MissingJavadocMethod } diff --git a/java/performance/src/test/java/org/apache/arrow/vector/IntBenchmarks.java b/java/performance/src/main/java/org/apache/arrow/vector/IntBenchmarks.java similarity index 97% rename from java/performance/src/test/java/org/apache/arrow/vector/IntBenchmarks.java rename to java/performance/src/main/java/org/apache/arrow/vector/IntBenchmarks.java index 036768d445e55..99674058970a6 100644 --- a/java/performance/src/test/java/org/apache/arrow/vector/IntBenchmarks.java +++ b/java/performance/src/main/java/org/apache/arrow/vector/IntBenchmarks.java @@ -41,6 +41,7 @@ */ @State(Scope.Benchmark) public class IntBenchmarks { + // checkstyle:off: MissingJavadocMethod private static final int VECTOR_LENGTH = 1024; @@ -107,4 +108,5 @@ public static void main(String [] args) throws RunnerException { new Runner(opt).run(); } + // checkstyle:on: MissingJavadocMethod } diff --git a/java/performance/src/test/java/org/apache/arrow/vector/VarCharBenchmarks.java b/java/performance/src/main/java/org/apache/arrow/vector/VarCharBenchmarks.java similarity index 97% rename from java/performance/src/test/java/org/apache/arrow/vector/VarCharBenchmarks.java rename to java/performance/src/main/java/org/apache/arrow/vector/VarCharBenchmarks.java index 1ab4b7bc20dad..a7ce4e04fee87 100644 --- a/java/performance/src/test/java/org/apache/arrow/vector/VarCharBenchmarks.java +++ b/java/performance/src/main/java/org/apache/arrow/vector/VarCharBenchmarks.java @@ -39,6 +39,7 @@ */ @State(Scope.Benchmark) public class VarCharBenchmarks { + // checkstyle:off: MissingJavadocMethod private static final int VECTOR_LENGTH = 1024; @@ -99,4 +100,5 @@ public static void main(String [] args) throws RunnerException { new Runner(opt).run(); } + // checkstyle:on: MissingJavadocMethod } diff --git a/java/performance/src/test/java/org/apache/arrow/vector/VariableWidthVectorBenchmarks.java b/java/performance/src/main/java/org/apache/arrow/vector/VariableWidthVectorBenchmarks.java similarity index 97% rename from java/performance/src/test/java/org/apache/arrow/vector/VariableWidthVectorBenchmarks.java rename to java/performance/src/main/java/org/apache/arrow/vector/VariableWidthVectorBenchmarks.java index 7eee981f13327..62c54606e6da6 100644 --- a/java/performance/src/test/java/org/apache/arrow/vector/VariableWidthVectorBenchmarks.java +++ b/java/performance/src/main/java/org/apache/arrow/vector/VariableWidthVectorBenchmarks.java @@ -41,6 +41,7 @@ */ @State(Scope.Benchmark) public class VariableWidthVectorBenchmarks { + // checkstyle:off: MissingJavadocMethod private static final int VECTOR_CAPACITY = 16 * 1024; @@ -127,4 +128,5 @@ public static void main(String [] args) throws RunnerException { new Runner(opt).run(); } + // checkstyle:on: MissingJavadocMethod } diff --git a/java/performance/src/test/java/org/apache/arrow/vector/VectorLoaderBenchmark.java b/java/performance/src/main/java/org/apache/arrow/vector/VectorLoaderBenchmark.java similarity index 97% rename from java/performance/src/test/java/org/apache/arrow/vector/VectorLoaderBenchmark.java rename to java/performance/src/main/java/org/apache/arrow/vector/VectorLoaderBenchmark.java index 416d126419e56..e8e8c0cfbc1f3 100644 --- a/java/performance/src/test/java/org/apache/arrow/vector/VectorLoaderBenchmark.java +++ b/java/performance/src/main/java/org/apache/arrow/vector/VectorLoaderBenchmark.java @@ -40,6 +40,7 @@ * Benchmarks for {@link VectorLoader}. 
*/ public class VectorLoaderBenchmark { + // checkstyle:off: MissingJavadocMethod private static final int ALLOCATOR_CAPACITY = 1024 * 1024; @@ -114,4 +115,5 @@ public static void main(String[] args) throws RunnerException { new Runner(opt).run(); } + // checkstyle:on: MissingJavadocMethod } diff --git a/java/performance/src/test/java/org/apache/arrow/vector/VectorUnloaderBenchmark.java b/java/performance/src/main/java/org/apache/arrow/vector/VectorUnloaderBenchmark.java similarity index 97% rename from java/performance/src/test/java/org/apache/arrow/vector/VectorUnloaderBenchmark.java rename to java/performance/src/main/java/org/apache/arrow/vector/VectorUnloaderBenchmark.java index d125172450004..b464f888fa85f 100644 --- a/java/performance/src/test/java/org/apache/arrow/vector/VectorUnloaderBenchmark.java +++ b/java/performance/src/main/java/org/apache/arrow/vector/VectorUnloaderBenchmark.java @@ -41,6 +41,7 @@ */ @State(Scope.Benchmark) public class VectorUnloaderBenchmark { + // checkstyle:off: MissingJavadocMethod private static final int ALLOCATOR_CAPACITY = 1024 * 1024; @@ -106,4 +107,5 @@ public static void main(String[] args) throws RunnerException { new Runner(opt).run(); } + // checkstyle:on: MissingJavadocMethod } diff --git a/java/performance/src/test/java/org/apache/arrow/vector/dictionary/DictionaryEncoderBenchmarks.java b/java/performance/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoderBenchmarks.java similarity index 100% rename from java/performance/src/test/java/org/apache/arrow/vector/dictionary/DictionaryEncoderBenchmarks.java rename to java/performance/src/main/java/org/apache/arrow/vector/dictionary/DictionaryEncoderBenchmarks.java diff --git a/java/performance/src/test/java/org/apache/arrow/vector/ipc/WriteChannelBenchmark.java b/java/performance/src/main/java/org/apache/arrow/vector/ipc/WriteChannelBenchmark.java similarity index 97% rename from java/performance/src/test/java/org/apache/arrow/vector/ipc/WriteChannelBenchmark.java rename to java/performance/src/main/java/org/apache/arrow/vector/ipc/WriteChannelBenchmark.java index 7a2537cbb8820..18efff11db9ff 100644 --- a/java/performance/src/test/java/org/apache/arrow/vector/ipc/WriteChannelBenchmark.java +++ b/java/performance/src/main/java/org/apache/arrow/vector/ipc/WriteChannelBenchmark.java @@ -41,6 +41,7 @@ * Benchmarks for {@link WriteChannel}. */ public class WriteChannelBenchmark { + // checkstyle:off: MissingJavadocMethod /** * State object for align benchmark. 
@@ -84,4 +85,5 @@ public static void main(String[] args) throws RunnerException { new Runner(opt).run(); } + // checkstyle:on: MissingJavadocMethod } diff --git a/java/performance/src/test/java/org/apache/arrow/vector/ipc/message/ArrowRecordBatchBenchmarks.java b/java/performance/src/main/java/org/apache/arrow/vector/ipc/message/ArrowRecordBatchBenchmarks.java similarity index 97% rename from java/performance/src/test/java/org/apache/arrow/vector/ipc/message/ArrowRecordBatchBenchmarks.java rename to java/performance/src/main/java/org/apache/arrow/vector/ipc/message/ArrowRecordBatchBenchmarks.java index c0882821e9cc4..b608bb4c1c590 100644 --- a/java/performance/src/test/java/org/apache/arrow/vector/ipc/message/ArrowRecordBatchBenchmarks.java +++ b/java/performance/src/main/java/org/apache/arrow/vector/ipc/message/ArrowRecordBatchBenchmarks.java @@ -42,6 +42,7 @@ */ @State(Scope.Benchmark) public class ArrowRecordBatchBenchmarks { + // checkstyle:off: MissingJavadocMethod private static final int VECTOR_CAPACITY = 16 * 1024; @@ -95,4 +96,5 @@ public static void main(String [] args) throws RunnerException { new Runner(opt).run(); } + // checkstyle:on: MissingJavadocMethod } diff --git a/java/performance/src/test/java/org/apache/arrow/vector/util/TransferPairBenchmarks.java b/java/performance/src/main/java/org/apache/arrow/vector/util/TransferPairBenchmarks.java similarity index 97% rename from java/performance/src/test/java/org/apache/arrow/vector/util/TransferPairBenchmarks.java rename to java/performance/src/main/java/org/apache/arrow/vector/util/TransferPairBenchmarks.java index 5142f4bdb8d0d..486862859f122 100644 --- a/java/performance/src/test/java/org/apache/arrow/vector/util/TransferPairBenchmarks.java +++ b/java/performance/src/main/java/org/apache/arrow/vector/util/TransferPairBenchmarks.java @@ -42,6 +42,7 @@ */ @State(Scope.Benchmark) public class TransferPairBenchmarks { + // checkstyle:off: MissingJavadocMethod private static final int VECTOR_LENGTH = 1024; @@ -120,4 +121,5 @@ public static void main(String [] args) throws RunnerException { new Runner(opt).run(); } + // checkstyle:on: MissingJavadocMethod } diff --git a/java/vector/src/main/codegen/data/ArrowTypes.tdd b/java/vector/src/main/codegen/data/ArrowTypes.tdd index 9fe40f2319bfd..72df4779793f0 100644 --- a/java/vector/src/main/codegen/data/ArrowTypes.tdd +++ b/java/vector/src/main/codegen/data/ArrowTypes.tdd @@ -129,6 +129,11 @@ name: "Duration", fields: [{name: "unit", type: short, valueType: TimeUnit}], complex: false + }, + { + name: "ListView", + fields: [], + complex: true } ] } diff --git a/java/vector/src/main/codegen/templates/UnionListWriter.java b/java/vector/src/main/codegen/templates/UnionListWriter.java index 5c0565ee27175..eeb964c055f71 100644 --- a/java/vector/src/main/codegen/templates/UnionListWriter.java +++ b/java/vector/src/main/codegen/templates/UnionListWriter.java @@ -26,7 +26,7 @@ import java.math.BigDecimal; <@pp.dropOutputFile /> -<#list ["List", "LargeList"] as listName> +<#list ["List", "ListView", "LargeList"] as listName> <@pp.changeOutputFile name="/org/apache/arrow/vector/complex/impl/Union${listName}Writer.java" /> @@ -59,6 +59,10 @@ public class Union${listName}Writer extends AbstractFieldWriter { private static final int OFFSET_WIDTH = 4; + <#if listName = "ListView"> + private static final long SIZE_WIDTH = 4; + + public Union${listName}Writer(${listName}Vector vector) { this(vector, NullableStructWriterFactory.getNullableStructWriterFactoryInstance()); } @@ -193,6 +197,24 @@ public 
void endList() { setPosition(idx() + 1); listStarted = false; } + <#elseif listName == "ListView"> + @Override + public void startList() { + vector.startNewValue(idx()); + writer.setPosition(vector.getOffsetBuffer().getInt((idx()) * OFFSET_WIDTH)); + listStarted = true; + } + + @Override + public void endList() { + int sizeUptoIdx = 0; + for (int i = 0; i < idx(); i++) { + sizeUptoIdx += vector.getSizeBuffer().getInt(i * SIZE_WIDTH); + } + vector.getSizeBuffer().setInt(idx() * SIZE_WIDTH, writer.idx() - sizeUptoIdx); + setPosition(idx() + 1); + listStarted = false; + } <#else> @Override public void startList() { diff --git a/java/vector/src/main/codegen/templates/UnionReader.java b/java/vector/src/main/codegen/templates/UnionReader.java index 956bc91e9185c..243bd832255c2 100644 --- a/java/vector/src/main/codegen/templates/UnionReader.java +++ b/java/vector/src/main/codegen/templates/UnionReader.java @@ -39,7 +39,7 @@ @SuppressWarnings("unused") public class UnionReader extends AbstractFieldReader { - private static final int NUM_SUPPORTED_TYPES = 48; + private static final int NUM_SUPPORTED_TYPES = 49; private BaseReader[] readers = new BaseReader[NUM_SUPPORTED_TYPES]; public UnionVector data; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthViewVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthViewVector.java index 2f80775a48f58..ec700a0dc2592 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthViewVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthViewVector.java @@ -46,7 +46,7 @@ */ public abstract class BaseVariableWidthViewVector extends BaseValueVector implements VariableWidthFieldVector { // A single element of a view comprises 16 bytes - protected static final int ELEMENT_SIZE = 16; + public static final int ELEMENT_SIZE = 16; public static final int INITIAL_VIEW_VALUE_ALLOCATION = 4096; private static final int INITIAL_BYTE_COUNT = INITIAL_VIEW_VALUE_ALLOCATION * ELEMENT_SIZE; private static final int MAX_BUFFER_SIZE = (int) Math.min(MAX_ALLOCATION_SIZE, Integer.MAX_VALUE); @@ -70,14 +70,14 @@ public abstract class BaseVariableWidthViewVector extends BaseValueVector implem * * */ // 12 byte unsigned int to track inline views - protected static final int INLINE_SIZE = 12; + public static final int INLINE_SIZE = 12; // The first 4 bytes of view are allocated for length - protected static final int LENGTH_WIDTH = 4; + public static final int LENGTH_WIDTH = 4; // The second 4 bytes of view are allocated for prefix width - protected static final int PREFIX_WIDTH = 4; + public static final int PREFIX_WIDTH = 4; // The third 4 bytes of view are allocated for buffer index - protected static final int BUF_INDEX_WIDTH = 4; - protected static final byte[] EMPTY_BYTE_ARRAY = new byte[]{}; + public static final int BUF_INDEX_WIDTH = 4; + public static final byte[] EMPTY_BYTE_ARRAY = new byte[]{}; protected ArrowBuf validityBuffer; // The view buffer is used to store the variable width view elements protected ArrowBuf viewBuffer; @@ -158,6 +158,15 @@ public ArrowBuf getDataBuffer() { return viewBuffer; } + /** + * Get the buffers that store the data for views in the vector. + * + * @return buffer + */ + public List getDataBuffers() { + return dataBuffers; + } + /** * BaseVariableWidthViewVector doesn't support offset buffer. 
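For orientation, the 16-byte view element described by the constants made public above (LENGTH_WIDTH, PREFIX_WIDTH, BUF_INDEX_WIDTH, INLINE_SIZE) can be modeled with a plain ByteBuffer. This is a conceptual sketch of the layout only (length, prefix, buffer index, buffer offset, with values of 12 bytes or fewer inlined after the length), not the Arrow implementation; the class and method names are illustrative:

    import java.nio.ByteBuffer;
    import java.nio.ByteOrder;
    import java.nio.charset.StandardCharsets;

    public class ViewLayoutSketch {
      static final int INLINE_SIZE = 12; // mirrors BaseVariableWidthViewVector.INLINE_SIZE

      static String describe(ByteBuffer view, byte[][] dataBuffers) {
        view.order(ByteOrder.LITTLE_ENDIAN);
        int length = view.getInt(0);        // first 4 bytes: value length
        if (length <= INLINE_SIZE) {
          byte[] inlined = new byte[length];
          view.position(4);                 // short values live inside the view itself
          view.get(inlined);
          return "inline: " + new String(inlined, StandardCharsets.UTF_8);
        }
        int bufIndex = view.getInt(8);      // which entry of getDataBuffers() holds the bytes
        int bufOffset = view.getInt(12);    // where the bytes start inside that buffer
        return "out-of-line: " + length + " bytes in dataBuffers[" + bufIndex + "] at " + bufOffset;
      }

      public static void main(String[] args) {
        ByteBuffer view = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN);
        view.putInt(0, 3);
        view.put(4, (byte) 'a');
        view.put(5, (byte) 'b');
        view.put(6, (byte) 'c');
        System.out.println(describe(view, new byte[0][])); // inline: abc
      }
    }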
* diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BufferLayout.java b/java/vector/src/main/java/org/apache/arrow/vector/BufferLayout.java index 9725693348a48..4eeb92a0c9199 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BufferLayout.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BufferLayout.java @@ -28,12 +28,18 @@ public class BufferLayout { /** * Enumeration of the different logical types a buffer can have. + * Data buffer is common to most of the layouts. + * Offset buffer is used for variable width types. + * Validity buffer is used for nullable types. + * Type buffer is used for Union types. + * Size buffer is used for ListView and LargeListView types. */ public enum BufferType { DATA("DATA"), OFFSET("OFFSET"), VALIDITY("VALIDITY"), - TYPE("TYPE_ID"); + TYPE("TYPE_ID"), + SIZE("SIZE"); private final String name; @@ -57,6 +63,7 @@ public String getName() { private static final BufferLayout VALUES_32 = new BufferLayout(BufferType.DATA, 32); private static final BufferLayout VALUES_16 = new BufferLayout(BufferType.DATA, 16); private static final BufferLayout VALUES_8 = new BufferLayout(BufferType.DATA, 8); + private static final BufferLayout SIZE_BUFFER = new BufferLayout(BufferType.SIZE, 32); public static BufferLayout typeBuffer() { return TYPE_BUFFER; @@ -70,6 +77,10 @@ public static BufferLayout largeOffsetBuffer() { return LARGE_OFFSET_BUFFER; } + public static BufferLayout sizeBuffer() { + return SIZE_BUFFER; + } + /** * Returns a databuffer for the given bitwidth. Only supports powers of two between 8 and 128 * inclusive. diff --git a/java/vector/src/main/java/org/apache/arrow/vector/TypeLayout.java b/java/vector/src/main/java/org/apache/arrow/vector/TypeLayout.java index 18032528c86d8..ea92efdc55f61 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/TypeLayout.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/TypeLayout.java @@ -101,7 +101,7 @@ public TypeLayout visit(Timestamp type) { } @Override - public TypeLayout visit(org.apache.arrow.vector.types.pojo.ArrowType.List type) { + public TypeLayout visit(ArrowType.List type) { List vectors = asList( BufferLayout.validityVector(), BufferLayout.offsetBuffer() @@ -109,6 +109,16 @@ public TypeLayout visit(org.apache.arrow.vector.types.pojo.ArrowType.List type) return new TypeLayout(vectors); } + @Override + public TypeLayout visit(ArrowType.ListView type) { + List vectors = asList( + BufferLayout.validityVector(), + BufferLayout.offsetBuffer(), + BufferLayout.sizeBuffer() + ); + return new TypeLayout(vectors); + } + @Override public TypeLayout visit(ArrowType.LargeList type) { List vectors = asList( @@ -312,11 +322,17 @@ public Integer visit(Timestamp type) { } @Override - public Integer visit(org.apache.arrow.vector.types.pojo.ArrowType.List type) { + public Integer visit(ArrowType.List type) { // validity buffer + offset buffer return 2; } + @Override + public Integer visit(ArrowType.ListView type) { + // validity buffer + offset buffer + size buffer + return 3; + } + @Override public Integer visit(ArrowType.LargeList type) { // validity buffer + offset buffer diff --git a/java/vector/src/main/java/org/apache/arrow/vector/VectorSchemaRoot.java b/java/vector/src/main/java/org/apache/arrow/vector/VectorSchemaRoot.java index 8768a90c80b83..9a92ce5060b1b 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/VectorSchemaRoot.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/VectorSchemaRoot.java @@ -19,6 +19,7 @@ import java.util.ArrayList; import 
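With the new SIZE buffer layout registered above, ListView reports three fixed buffers (validity, offset, size) versus two for List. Assuming the existing TypeLayout.getTypeBufferCount helper is backed by the visitor modified here, a quick check would look like this:

    import org.apache.arrow.vector.TypeLayout;
    import org.apache.arrow.vector.types.pojo.ArrowType;

    public class ListViewBufferCountSketch {
      public static void main(String[] args) {
        System.out.println(TypeLayout.getTypeBufferCount(ArrowType.List.INSTANCE));     // 2: validity + offset
        System.out.println(TypeLayout.getTypeBufferCount(ArrowType.ListView.INSTANCE)); // 3: validity + offset + size
      }
    }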
java.util.Arrays; +import java.util.Collections; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; @@ -121,7 +122,7 @@ public VectorSchemaRoot(Schema schema, List fieldVectors, int rowCo * Creates a new set of empty vectors corresponding to the given schema. */ public static VectorSchemaRoot create(Schema schema, BufferAllocator allocator) { - List fieldVectors = new ArrayList<>(); + List fieldVectors = new ArrayList<>(schema.getFields().size()); for (Field field : schema.getFields()) { FieldVector vector = field.createVector(allocator); fieldVectors.add(vector); @@ -160,7 +161,7 @@ public void clear() { } public List getFieldVectors() { - return fieldVectors.stream().collect(Collectors.toList()); + return Collections.unmodifiableList(fieldVectors); } /** @@ -236,7 +237,7 @@ public int getRowCount() { */ public void setRowCount(int rowCount) { this.rowCount = rowCount; - for (FieldVector v : getFieldVectors()) { + for (FieldVector v : fieldVectors) { v.setValueCount(rowCount); } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java b/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java index 56220d270fa9b..28da2a86a53c8 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/compare/RangeEqualsVisitor.java @@ -22,6 +22,7 @@ import java.util.List; import java.util.function.BiFunction; +import org.apache.arrow.memory.ArrowBuf; import org.apache.arrow.memory.util.ByteFunctionHelpers; import org.apache.arrow.util.Preconditions; import org.apache.arrow.vector.BaseFixedWidthVector; @@ -165,7 +166,10 @@ public Boolean visit(BaseLargeVariableWidthVector left, Range range) { @Override public Boolean visit(BaseVariableWidthViewVector left, Range range) { - throw new UnsupportedOperationException("View vectors are not supported."); + if (!validate(left)) { + return false; + } + return compareBaseVariableWidthViewVectors(range); } @Override @@ -450,6 +454,85 @@ protected boolean compareBaseLargeVariableWidthVectors(Range range) { return true; } + protected boolean compareBaseVariableWidthViewVectors(Range range) { + BaseVariableWidthViewVector leftVector = (BaseVariableWidthViewVector) left; + BaseVariableWidthViewVector rightVector = (BaseVariableWidthViewVector) right; + + final ArrowBuf leftViewBuffer = leftVector.getDataBuffer(); + final ArrowBuf rightViewBuffer = rightVector.getDataBuffer(); + + final int elementSize = BaseVariableWidthViewVector.ELEMENT_SIZE; + final int lengthWidth = BaseVariableWidthViewVector.LENGTH_WIDTH; + final int prefixWidth = BaseVariableWidthViewVector.PREFIX_WIDTH; + final int bufIndexWidth = BaseVariableWidthViewVector.BUF_INDEX_WIDTH; + + List leftDataBuffers = leftVector.getDataBuffers(); + List rightDataBuffers = rightVector.getDataBuffers(); + + for (int i = 0; i < range.getLength(); i++) { + int leftIndex = range.getLeftStart() + i; + int rightIndex = range.getRightStart() + i; + + boolean isNull = leftVector.isNull(leftIndex); + if (isNull != rightVector.isNull(rightIndex)) { + return false; + } + + if (isNull) { + continue; + } + + int startLeftByteOffset = leftIndex * elementSize; + + int startRightByteOffset = rightIndex * elementSize; + + int leftDataBufferValueLength = leftVector.getValueLength(leftIndex); + int rightDataBufferValueLength = rightVector.getValueLength(rightIndex); + + if (leftDataBufferValueLength != rightDataBufferValueLength) { + return false; + } 
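Note the behavioral edge of the VectorSchemaRoot change above: getFieldVectors() now returns an unmodifiable view rather than a fresh copy, so callers that mutated the returned list will start throwing. A hedged sketch, with an illustrative one-field schema:

    import java.util.Collections;
    import org.apache.arrow.memory.BufferAllocator;
    import org.apache.arrow.memory.RootAllocator;
    import org.apache.arrow.vector.VectorSchemaRoot;
    import org.apache.arrow.vector.types.pojo.ArrowType;
    import org.apache.arrow.vector.types.pojo.Field;
    import org.apache.arrow.vector.types.pojo.Schema;

    public class UnmodifiableFieldVectorsSketch {
      public static void main(String[] args) {
        Schema schema = new Schema(Collections.singletonList(
            Field.nullable("a", new ArrowType.Int(32, true))));
        try (BufferAllocator allocator = new RootAllocator();
             VectorSchemaRoot root = VectorSchemaRoot.create(schema, allocator)) {
          try {
            root.getFieldVectors().clear(); // used to mutate a copy; now a read-only view
          } catch (UnsupportedOperationException e) {
            System.out.println("getFieldVectors() is now unmodifiable");
          }
        }
      }
    }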
+ + if (leftDataBufferValueLength > BaseVariableWidthViewVector.INLINE_SIZE) { + // if the value is stored in the dataBuffers + int leftDataBufferIndex = leftViewBuffer.getInt(startLeftByteOffset + lengthWidth + prefixWidth); + int rightDataBufferIndex = rightViewBuffer.getInt(startRightByteOffset + lengthWidth + prefixWidth); + + final int leftDataOffset = + leftViewBuffer.getInt(startLeftByteOffset + lengthWidth + prefixWidth + bufIndexWidth); + final int rightDataOffset = + rightViewBuffer.getInt(startRightByteOffset + lengthWidth + prefixWidth + bufIndexWidth); + + ArrowBuf leftDataBuffer = leftDataBuffers.get(leftDataBufferIndex); + ArrowBuf rightDataBuffer = rightDataBuffers.get(rightDataBufferIndex); + + // check equality in the considered string stored in the dataBuffers + int retDataBuf = ByteFunctionHelpers.equal( + leftDataBuffer, leftDataOffset, leftDataOffset + leftDataBufferValueLength, + rightDataBuffer, rightDataOffset, rightDataOffset + rightDataBufferValueLength); + + if (retDataBuf == 0) { + return false; + } + } else { + // if the value is stored in the view + final int leftDataOffset = startLeftByteOffset + lengthWidth; + final int rightDataOffset = startRightByteOffset + lengthWidth; + + // check equality in the considered string stored in the view + int retDataBuf = ByteFunctionHelpers.equal( + leftViewBuffer, leftDataOffset, leftDataOffset + leftDataBufferValueLength, + rightViewBuffer, rightDataOffset, rightDataOffset + rightDataBufferValueLength); + + if (retDataBuf == 0) { + return false; + } + } + + } + return true; + } + protected boolean compareListVectors(Range range) { ListVector leftVector = (ListVector) left; ListVector rightVector = (ListVector) right; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/compare/TypeEqualsVisitor.java b/java/vector/src/main/java/org/apache/arrow/vector/compare/TypeEqualsVisitor.java index 9bbe5c1b8997c..aaef161a563be 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/compare/TypeEqualsVisitor.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/compare/TypeEqualsVisitor.java @@ -88,7 +88,7 @@ public Boolean visit(BaseLargeVariableWidthVector left, Void value) { @Override public Boolean visit(BaseVariableWidthViewVector left, Void value) { - throw new UnsupportedOperationException("View vectors are not supported."); + return compareField(left.getField(), right.getField()); } @Override diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java index 7906d90c2fff0..7c4015299a6cd 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java @@ -355,16 +355,8 @@ public int getInnerValueCountAt(int index) { offsetBuffer.getInt(index * OFFSET_WIDTH); } - /** Return if value at index is null (this implementation is always false). */ - @Override - public boolean isNull(int index) { - return false; - } - - /** Return if value at index is empty (this implementation is always false). */ - public boolean isEmpty(int index) { - return false; - } + /** Return if value at index is empty. */ + public abstract boolean isEmpty(int index); /** Starts a new repeated value. 
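Since RangeEqualsVisitor (and TypeEqualsVisitor) now handle variable-width view vectors instead of throwing, range comparison covers both inline and out-of-line values. A sketch assuming the usual ViewVarCharVector setters:

    import java.nio.charset.StandardCharsets;
    import org.apache.arrow.memory.BufferAllocator;
    import org.apache.arrow.memory.RootAllocator;
    import org.apache.arrow.vector.ViewVarCharVector;
    import org.apache.arrow.vector.compare.Range;
    import org.apache.arrow.vector.compare.RangeEqualsVisitor;

    public class ViewVectorRangeEqualsSketch {
      public static void main(String[] args) {
        try (BufferAllocator allocator = new RootAllocator();
             ViewVarCharVector a = new ViewVarCharVector("a", allocator);
             ViewVarCharVector b = new ViewVarCharVector("b", allocator)) {
          a.allocateNew();
          b.allocateNew();
          // One short (inline) value and one long (out-of-line) value per vector.
          a.setSafe(0, "abc".getBytes(StandardCharsets.UTF_8));
          a.setSafe(1, "a string longer than twelve bytes".getBytes(StandardCharsets.UTF_8));
          b.setSafe(0, "abc".getBytes(StandardCharsets.UTF_8));
          b.setSafe(1, "a string longer than twelve bytes".getBytes(StandardCharsets.UTF_8));
          a.setValueCount(2);
          b.setValueCount(2);

          RangeEqualsVisitor visitor = new RangeEqualsVisitor(a, b);
          System.out.println(visitor.rangeEquals(new Range(0, 0, 2))); // true
        }
      }
    }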
*/ public int startNewValue(int index) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueViewVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueViewVector.java new file mode 100644 index 0000000000000..73a25738854f3 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueViewVector.java @@ -0,0 +1,405 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.complex; + +import static org.apache.arrow.memory.util.LargeMemoryUtil.capAtMaxInt; + +import java.util.Collections; +import java.util.Iterator; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.util.CommonUtil; +import org.apache.arrow.util.Preconditions; +import org.apache.arrow.vector.AddOrGetResult; +import org.apache.arrow.vector.BaseFixedWidthVector; +import org.apache.arrow.vector.BaseValueVector; +import org.apache.arrow.vector.BaseVariableWidthVector; +import org.apache.arrow.vector.DensityAwareVector; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.NullVector; +import org.apache.arrow.vector.UInt4Vector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.ZeroVector; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.CallBack; +import org.apache.arrow.vector.util.OversizedAllocationException; +import org.apache.arrow.vector.util.SchemaChangeRuntimeException; + +public abstract class BaseRepeatedValueViewVector extends BaseValueVector + implements RepeatedValueVector, BaseListVector { + + public static final FieldVector DEFAULT_DATA_VECTOR = ZeroVector.INSTANCE; + public static final String DATA_VECTOR_NAME = "$data$"; + + public static final byte OFFSET_WIDTH = 4; + public static final byte SIZE_WIDTH = 4; + protected ArrowBuf offsetBuffer; + protected ArrowBuf sizeBuffer; + protected FieldVector vector; + protected final CallBack repeatedCallBack; + protected int valueCount; + protected long offsetAllocationSizeInBytes = INITIAL_VALUE_ALLOCATION * OFFSET_WIDTH; + protected long sizeAllocationSizeInBytes = INITIAL_VALUE_ALLOCATION * SIZE_WIDTH; + private final String name; + + protected String defaultDataVectorName = DATA_VECTOR_NAME; + + protected BaseRepeatedValueViewVector(String name, BufferAllocator allocator, CallBack callBack) { + this(name, allocator, DEFAULT_DATA_VECTOR, callBack); + } + + protected BaseRepeatedValueViewVector( + String name, BufferAllocator allocator, FieldVector vector, CallBack callBack) { + super(allocator); + this.name = name; + this.offsetBuffer = allocator.getEmpty(); + 
this.sizeBuffer = allocator.getEmpty(); + this.vector = Preconditions.checkNotNull(vector, "data vector cannot be null"); + this.repeatedCallBack = callBack; + this.valueCount = 0; + } + + @Override + public String getName() { + return name; + } + + @Override + public boolean allocateNewSafe() { + boolean dataAlloc = false; + try { + allocateBuffers(); + dataAlloc = vector.allocateNewSafe(); + } catch (Exception e) { + clear(); + return false; + } finally { + if (!dataAlloc) { + clear(); + } + } + return dataAlloc; + } + + private void allocateBuffers() { + offsetBuffer = allocateBuffers(offsetAllocationSizeInBytes); + sizeBuffer = allocateBuffers(sizeAllocationSizeInBytes); + } + + private ArrowBuf allocateBuffers(final long size) { + final int curSize = (int) size; + ArrowBuf buffer = allocator.buffer(curSize); + buffer.readerIndex(0); + buffer.setZero(0, buffer.capacity()); + return buffer; + } + + @Override + public void reAlloc() { + reallocateBuffers(); + vector.reAlloc(); + } + + protected void reallocateBuffers() { + reallocOffsetBuffer(); + reallocSizeBuffer(); + } + + private void reallocOffsetBuffer() { + final long currentBufferCapacity = offsetBuffer.capacity(); + long newAllocationSize = currentBufferCapacity * 2; + if (newAllocationSize == 0) { + if (offsetAllocationSizeInBytes > 0) { + newAllocationSize = offsetAllocationSizeInBytes; + } else { + newAllocationSize = INITIAL_VALUE_ALLOCATION * OFFSET_WIDTH * 2; + } + } + + newAllocationSize = CommonUtil.nextPowerOfTwo(newAllocationSize); + newAllocationSize = Math.min(newAllocationSize, (long) OFFSET_WIDTH * Integer.MAX_VALUE); + assert newAllocationSize >= 1; + + if (newAllocationSize > MAX_ALLOCATION_SIZE || newAllocationSize <= offsetBuffer.capacity()) { + throw new OversizedAllocationException("Unable to expand the buffer"); + } + + final ArrowBuf newBuf = allocator.buffer(newAllocationSize); + newBuf.setBytes(0, offsetBuffer, 0, currentBufferCapacity); + newBuf.setZero(currentBufferCapacity, newBuf.capacity() - currentBufferCapacity); + offsetBuffer.getReferenceManager().release(1); + offsetBuffer = newBuf; + offsetAllocationSizeInBytes = newAllocationSize; + } + + private void reallocSizeBuffer() { + final long currentBufferCapacity = sizeBuffer.capacity(); + long newAllocationSize = currentBufferCapacity * 2; + if (newAllocationSize == 0) { + if (sizeAllocationSizeInBytes > 0) { + newAllocationSize = sizeAllocationSizeInBytes; + } else { + newAllocationSize = INITIAL_VALUE_ALLOCATION * SIZE_WIDTH * 2; + } + } + + newAllocationSize = CommonUtil.nextPowerOfTwo(newAllocationSize); + newAllocationSize = Math.min(newAllocationSize, (long) SIZE_WIDTH * Integer.MAX_VALUE); + assert newAllocationSize >= 1; + + if (newAllocationSize > MAX_ALLOCATION_SIZE || newAllocationSize <= sizeBuffer.capacity()) { + throw new OversizedAllocationException("Unable to expand the buffer"); + } + + final ArrowBuf newBuf = allocator.buffer(newAllocationSize); + newBuf.setBytes(0, sizeBuffer, 0, currentBufferCapacity); + newBuf.setZero(currentBufferCapacity, newBuf.capacity() - currentBufferCapacity); + sizeBuffer.getReferenceManager().release(1); + sizeBuffer = newBuf; + sizeAllocationSizeInBytes = newAllocationSize; + } + + @Override + public FieldVector getDataVector() { + return vector; + } + + @Override + public void setInitialCapacity(int numRecords) { + offsetAllocationSizeInBytes = (numRecords) * OFFSET_WIDTH; + sizeAllocationSizeInBytes = (numRecords) * SIZE_WIDTH; + if (vector instanceof BaseFixedWidthVector || vector instanceof 
BaseVariableWidthVector) { + vector.setInitialCapacity(numRecords * RepeatedValueVector.DEFAULT_REPEAT_PER_RECORD); + } else { + vector.setInitialCapacity(numRecords); + } + } + + @Override + public void setInitialCapacity(int numRecords, double density) { + if ((numRecords * density) >= Integer.MAX_VALUE) { + throw new OversizedAllocationException("Requested amount of memory is more than max allowed"); + } + + offsetAllocationSizeInBytes = numRecords * OFFSET_WIDTH; + sizeAllocationSizeInBytes = numRecords * SIZE_WIDTH; + + int innerValueCapacity = Math.max((int) (numRecords * density), 1); + + if (vector instanceof DensityAwareVector) { + ((DensityAwareVector) vector).setInitialCapacity(innerValueCapacity, density); + } else { + vector.setInitialCapacity(innerValueCapacity); + } + } + + /** + * Specialized version of setInitialTotalCapacity() for ListViewVector. + * This is used by some callers when they want to explicitly control and be + * conservative about memory allocated for inner data vector. + * This is very useful when we are working with memory constraints for a query + * and have a fixed amount of memory reserved for the record batch. + * In such cases, we are likely to face OOM or related problems when + * we reserve memory for a record batch with value count x and + * do setInitialCapacity(x) such that each vector allocates only + * what is necessary and not the default amount, but the multiplier + * forces the memory requirement to go beyond what was needed. + * + * @param numRecords value count + * @param totalNumberOfElements the total number of elements to allow + * for in this vector across all records. + */ + public void setInitialTotalCapacity(int numRecords, int totalNumberOfElements) { + offsetAllocationSizeInBytes = numRecords * OFFSET_WIDTH; + sizeAllocationSizeInBytes = numRecords * SIZE_WIDTH; + vector.setInitialCapacity(totalNumberOfElements); + } + + @Override + public int getValueCapacity() { + throw new UnsupportedOperationException( + "Get value capacity is not supported in RepeatedValueVector"); + } + + protected int getOffsetBufferValueCapacity() { + return capAtMaxInt(offsetBuffer.capacity() / OFFSET_WIDTH); + } + + protected int getSizeBufferValueCapacity() { + return capAtMaxInt(sizeBuffer.capacity() / SIZE_WIDTH); + } + + @Override + public int getBufferSize() { + if (valueCount == 0) { + return 0; + } + return (valueCount * OFFSET_WIDTH) + (valueCount * SIZE_WIDTH) + vector.getBufferSize(); + } + + @Override + public int getBufferSizeFor(int valueCount) { + if (valueCount == 0) { + return 0; + } + + int innerVectorValueCount = 0; + + for (int i = 0; i < valueCount; i++) { + innerVectorValueCount += sizeBuffer.getInt(i * SIZE_WIDTH); + } + + return (valueCount * OFFSET_WIDTH) + (valueCount * SIZE_WIDTH) + + vector.getBufferSizeFor(innerVectorValueCount); + } + + @Override + public Iterator iterator() { + return Collections.singleton(getDataVector()).iterator(); + } + + @Override + public void clear() { + offsetBuffer = releaseBuffer(offsetBuffer); + sizeBuffer = releaseBuffer(sizeBuffer); + vector.clear(); + valueCount = 0; + super.clear(); + } + + @Override + public void reset() { + offsetBuffer.setZero(0, offsetBuffer.capacity()); + sizeBuffer.setZero(0, sizeBuffer.capacity()); + vector.reset(); + valueCount = 0; + } + + @Override + public ArrowBuf[] getBuffers(boolean clear) { + return new ArrowBuf[0]; + } + + @Override + public int getValueCount() { + return valueCount; + } + + @Override + public void setValueCount(int valueCount) { + 
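A brief usage sketch for the setInitialTotalCapacity path documented above, assuming a standard RootAllocator; the row and element counts are illustrative:

    import org.apache.arrow.memory.BufferAllocator;
    import org.apache.arrow.memory.RootAllocator;
    import org.apache.arrow.vector.complex.ListViewVector;

    public class ListViewPreSizingSketch {
      public static void main(String[] args) {
        try (BufferAllocator allocator = new RootAllocator();
             ListViewVector vector = ListViewVector.empty("lv", allocator)) {
          // 1,000 rows known to hold exactly 4,000 child elements in total:
          // size the offset/size buffers for the rows and the child vector for the elements,
          // instead of letting the default per-row multiplier over-allocate.
          vector.setInitialTotalCapacity(1000, 4000);
          vector.allocateNew();
          System.out.println(vector.getValueCount()); // 0 until values are written
        }
      }
    }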
this.valueCount = valueCount; + while (valueCount > getOffsetBufferValueCapacity()) { + reallocateBuffers(); + } + final int childValueCount = valueCount == 0 ? 0 : getLengthOfChildVector(); + vector.setValueCount(childValueCount); + } + + protected int getLengthOfChildVector() { + int maxOffsetSizeSum = offsetBuffer.getInt(0) + sizeBuffer.getInt(0); + int minOffset = offsetBuffer.getInt(0); + for (int i = 0; i < valueCount; i++) { + int currentOffset = offsetBuffer.getInt(i * OFFSET_WIDTH); + int currentSize = sizeBuffer.getInt(i * SIZE_WIDTH); + int currentSum = currentOffset + currentSize; + + maxOffsetSizeSum = Math.max(maxOffsetSizeSum, currentSum); + minOffset = Math.min(minOffset, currentOffset); + } + + return maxOffsetSizeSum - minOffset; + } + + protected int getLengthOfChildVectorByIndex(int index) { + int maxOffsetSizeSum = offsetBuffer.getInt(0) + sizeBuffer.getInt(0); + int minOffset = offsetBuffer.getInt(0); + for (int i = 0; i < index; i++) { + int currentOffset = offsetBuffer.getInt(i * OFFSET_WIDTH); + int currentSize = sizeBuffer.getInt(i * SIZE_WIDTH); + int currentSum = currentOffset + currentSize; + + maxOffsetSizeSum = Math.max(maxOffsetSizeSum, currentSum); + minOffset = Math.min(minOffset, currentOffset); + } + + return maxOffsetSizeSum - minOffset; + } + + /** + * Initialize the data vector (and execute callback) if it hasn't already been done, + * returns the data vector. + */ + public AddOrGetResult addOrGetVector(FieldType fieldType) { + boolean created = false; + if (vector instanceof NullVector) { + vector = fieldType.createNewSingleVector(defaultDataVectorName, allocator, repeatedCallBack); + // returned vector must have the same field + created = true; + if (repeatedCallBack != null && + // not a schema change if changing from ZeroVector to ZeroVector + (fieldType.getType().getTypeID() != ArrowType.ArrowTypeID.Null)) { + repeatedCallBack.doWork(); + } + } + + if (vector.getField().getType().getTypeID() != fieldType.getType().getTypeID()) { + final String msg = String.format("Inner vector type mismatch. Requested type: [%s], actual type: [%s]", + fieldType.getType().getTypeID(), vector.getField().getType().getTypeID()); + throw new SchemaChangeRuntimeException(msg); + } + + return new AddOrGetResult<>((T) vector, created); + } + + protected void replaceDataVector(FieldVector v) { + vector.clear(); + vector = v; + } + + public abstract boolean isEmpty(int index); + + /** + * Start a new value at the given index. 
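A worked example of the child-length computation above. Because ListView offsets need not be ordered and ranges may overlap or skip child values, the child vector must cover max(offset + size) - min(offset) elements rather than simply the last offset:

    public class ChildLengthSketch {
      public static void main(String[] args) {
        int[] offsets = {4, 0, 0, 7};
        int[] sizes   = {3, 0, 4, 2};
        int maxOffsetSizeSum = offsets[0] + sizes[0];
        int minOffset = offsets[0];
        for (int i = 0; i < offsets.length; i++) {
          maxOffsetSizeSum = Math.max(maxOffsetSizeSum, offsets[i] + sizes[i]);
          minOffset = Math.min(minOffset, offsets[i]);
        }
        // max(offset + size) = 7 + 2 = 9, min(offset) = 0, so 9 child elements are needed.
        System.out.println(maxOffsetSizeSum - minOffset); // 9
      }
    }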
+ * @param index the index to start the new value at + * @return the offset in the data vector where the new value starts + */ + public int startNewValue(int index) { + while (index >= getOffsetBufferValueCapacity()) { + reallocOffsetBuffer(); + } + while (index >= getSizeBufferValueCapacity()) { + reallocSizeBuffer(); + } + + if (index > 0) { + final int prevOffset = getLengthOfChildVectorByIndex(index); + offsetBuffer.setInt(index * OFFSET_WIDTH, prevOffset); + } + + setValueCount(index + 1); + return offsetBuffer.getInt(index * OFFSET_WIDTH); + } + + @Override + @Deprecated + public UInt4Vector getOffsetVector() { + throw new UnsupportedOperationException("There is no inner offset vector"); + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListViewVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListViewVector.java new file mode 100644 index 0000000000000..b19691e7aaab7 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListViewVector.java @@ -0,0 +1,872 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector.complex; + +import static java.util.Collections.singletonList; +import static org.apache.arrow.memory.util.LargeMemoryUtil.capAtMaxInt; +import static org.apache.arrow.memory.util.LargeMemoryUtil.checkedCastToInt; +import static org.apache.arrow.util.Preconditions.checkArgument; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.OutOfMemoryException; +import org.apache.arrow.memory.util.ArrowBufPointer; +import org.apache.arrow.memory.util.ByteFunctionHelpers; +import org.apache.arrow.memory.util.CommonUtil; +import org.apache.arrow.memory.util.hash.ArrowBufHasher; +import org.apache.arrow.vector.AddOrGetResult; +import org.apache.arrow.vector.BitVectorHelper; +import org.apache.arrow.vector.BufferBacked; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.compare.VectorVisitor; +import org.apache.arrow.vector.complex.impl.UnionListReader; +import org.apache.arrow.vector.complex.impl.UnionListViewWriter; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.ipc.message.ArrowFieldNode; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.util.CallBack; +import org.apache.arrow.vector.util.JsonStringArrayList; +import org.apache.arrow.vector.util.OversizedAllocationException; +import org.apache.arrow.vector.util.TransferPair; + +/** + * A list view vector contains lists of a specific type of elements. + * Its structure contains four elements. + *
+ * <ol>
+ *   <li>A validity buffer.
+ *   <li>An offset buffer, that denotes lists starts.
+ *   <li>A size buffer, that denotes lists ends.
+ *   <li>A child data vector that contains the elements of lists.
+ * </ol>
+ * The latter three are managed by its superclass. + */ + +/* +* TODO: consider merging the functionality in `BaseRepeatedValueVector` into this class. +*/ +public class ListViewVector extends BaseRepeatedValueViewVector implements PromotableVector { + + protected ArrowBuf validityBuffer; + protected UnionListReader reader; + private CallBack callBack; + protected Field field; + protected int validityAllocationSizeInBytes; + + public static ListViewVector empty(String name, BufferAllocator allocator) { + return new ListViewVector(name, allocator, FieldType.nullable(ArrowType.ListView.INSTANCE), null); + } + + /** + * Constructs a new instance. + * + * @param name The name of the instance. + * @param allocator The allocator to use for allocating/reallocating buffers. + * @param fieldType The type of this list. + * @param callBack A schema change callback. + */ + public ListViewVector(String name, BufferAllocator allocator, FieldType fieldType, CallBack callBack) { + this(new Field(name, fieldType, null), allocator, callBack); + } + + /** + * Constructs a new instance. + * + * @param field The field materialized by this vector. + * @param allocator The allocator to use for allocating/reallocating buffers. + * @param callBack A schema change callback. + */ + public ListViewVector(Field field, BufferAllocator allocator, CallBack callBack) { + super(field.getName(), allocator, callBack); + this.validityBuffer = allocator.getEmpty(); + this.field = field; + this.callBack = callBack; + this.validityAllocationSizeInBytes = getValidityBufferSizeFromCount(INITIAL_VALUE_ALLOCATION); + } + + @Override + public void initializeChildrenFromFields(List children) { + checkArgument(children.size() == 1, + "ListViews have one child Field. Found: %s", children.isEmpty() ? "none" : children); + + Field field = children.get(0); + AddOrGetResult addOrGetVector = addOrGetVector(field.getFieldType()); + checkArgument(addOrGetVector.isCreated(), "Child vector already existed: %s", addOrGetVector.getVector()); + + addOrGetVector.getVector().initializeChildrenFromFields(field.getChildren()); + this.field = new Field(this.field.getName(), this.field.getFieldType(), children); + } + + @Override + public void setInitialCapacity(int numRecords) { + validityAllocationSizeInBytes = getValidityBufferSizeFromCount(numRecords); + super.setInitialCapacity(numRecords); + } + + /** + * Specialized version of setInitialCapacity() for ListViewVector. + * This is used by some callers when they want to explicitly control and be + * conservative about memory allocated for inner data vector. + * This is very useful when we are working with memory constraints for a query + * and have a fixed amount of memory reserved for the record batch. + * In such cases, we are likely to face OOM or related problems when + * we reserve memory for a record batch with value count x and + * do setInitialCapacity(x) such that each vector allocates only + * what is necessary and not the default amount, but the multiplier + * forces the memory requirement to go beyond what was needed. + * + * @param numRecords value count + * @param density density of ListViewVector. + * Density is the average size of a list per position in the ListViewVector. + * For example, a + * density value of 10 implies each position in the list + * vector has a list of 10 values. + * A density value of 0.1 implies out of 10 positions in + * the list vector, 1 position has a list of size 1, and + * the remaining positions are null (no lists) or empty lists. 
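To make the four-part structure in the class comment concrete, here is a plain-Java model (not the Arrow implementation): element i is null when its validity bit is unset, and otherwise spans child[offsets[i] .. offsets[i] + sizes[i]); note how two entries may share child values:

    import java.util.Arrays;

    public class ListViewModelSketch {
      public static void main(String[] args) {
        boolean[] validity = {true, false, true};
        int[] offsets      = {0, 0, 2};
        int[] sizes        = {3, 0, 2};
        int[] child        = {1, 2, 3, 4};

        for (int i = 0; i < validity.length; i++) {
          if (!validity[i]) {
            System.out.println("null");
            continue;
          }
          System.out.println(Arrays.toString(
              Arrays.copyOfRange(child, offsets[i], offsets[i] + sizes[i])));
        }
        // Prints: [1, 2, 3], null, [3, 4]  (elements 0 and 2 share child value 3)
      }
    }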
+ * This helps in tightly controlling the memory we provision + * for inner data vector. + */ + @Override + public void setInitialCapacity(int numRecords, double density) { + validityAllocationSizeInBytes = getValidityBufferSizeFromCount(numRecords); + super.setInitialCapacity(numRecords, density); + } + + /** + * Specialized version of setInitialTotalCapacity() for ListViewVector. + * This is used by some callers when they want to explicitly control and be + * conservative about memory allocated for inner data vector. + * This is very useful when we are working with memory constraints for a query + * and have a fixed amount of memory reserved for the record batch. + * In such cases, we are likely to face OOM or related problems when + * we reserve memory for a record batch with value count x and + * do setInitialCapacity(x) such that each vector allocates only + * what is necessary and not the default amount, but the multiplier + * forces the memory requirement to go beyond what was needed. + * + * @param numRecords value count + * @param totalNumberOfElements the total number of elements to allow + * for in this vector across all records. + */ + @Override + public void setInitialTotalCapacity(int numRecords, int totalNumberOfElements) { + validityAllocationSizeInBytes = getValidityBufferSizeFromCount(numRecords); + super.setInitialTotalCapacity(numRecords, totalNumberOfElements); + } + + @Override + public List getChildrenFromFields() { + return singletonList(getDataVector()); + } + + /** + * Load the buffers associated with this Field. + * @param fieldNode the fieldNode + * @param ownBuffers the buffers for this Field (own buffers only, children not included) + */ + @Override + public void loadFieldBuffers(ArrowFieldNode fieldNode, List ownBuffers) { + if (ownBuffers.size() != 3) { + throw new IllegalArgumentException("Illegal buffer count, expected " + + 3 + ", got: " + ownBuffers.size()); + } + + ArrowBuf bitBuffer = ownBuffers.get(0); + ArrowBuf offBuffer = ownBuffers.get(1); + ArrowBuf szBuffer = ownBuffers.get(2); + + validityBuffer.getReferenceManager().release(); + validityBuffer = BitVectorHelper.loadValidityBuffer(fieldNode, bitBuffer, allocator); + offsetBuffer.getReferenceManager().release(); + offsetBuffer = offBuffer.getReferenceManager().retain(offBuffer, allocator); + sizeBuffer.getReferenceManager().release(); + sizeBuffer = szBuffer.getReferenceManager().retain(szBuffer, allocator); + + validityAllocationSizeInBytes = checkedCastToInt(validityBuffer.capacity()); + offsetAllocationSizeInBytes = offsetBuffer.capacity(); + sizeAllocationSizeInBytes = sizeBuffer.capacity(); + + valueCount = fieldNode.getLength(); + } + + /** + * Set the reader and writer indexes for the inner buffers. + */ + private void setReaderAndWriterIndex() { + validityBuffer.readerIndex(0); + offsetBuffer.readerIndex(0); + sizeBuffer.readerIndex(0); + if (valueCount == 0) { + validityBuffer.writerIndex(0); + offsetBuffer.writerIndex(0); + sizeBuffer.writerIndex(0); + } else { + validityBuffer.writerIndex(getValidityBufferSizeFromCount(valueCount)); + offsetBuffer.writerIndex(valueCount * OFFSET_WIDTH); + sizeBuffer.writerIndex(valueCount * SIZE_WIDTH); + } + } + + @Override + public List getFieldBuffers() { + List result = new ArrayList<>(2); + setReaderAndWriterIndex(); + result.add(validityBuffer); + result.add(offsetBuffer); + result.add(sizeBuffer); + + return result; + } + + /** + * Export the buffers of the fields for C Data Interface. 
+ * This method traverses the buffers and export buffer and buffer's memory address into a list of + * buffers and a pointer to the list of buffers. + */ + @Override + public void exportCDataBuffers(List buffers, ArrowBuf buffersPtr, long nullValue) { + throw new UnsupportedOperationException("exportCDataBuffers Not implemented yet"); + } + + @Override + public void allocateNew() throws OutOfMemoryException { + if (!allocateNewSafe()) { + throw new OutOfMemoryException("Failure while allocating memory"); + } + } + + @Override + public boolean allocateNewSafe() { + boolean success = false; + try { + /* release the current buffers, hence this is a new allocation + * Note that, the `clear` method call below is releasing validityBuffer + * calling the superclass clear method which is releasing the associated buffers + * (sizeBuffer and offsetBuffer). + */ + clear(); + /* allocate validity buffer */ + allocateValidityBuffer(validityAllocationSizeInBytes); + /* allocate offset, data and sizes buffer */ + success = super.allocateNewSafe(); + } finally { + if (!success) { + clear(); + } + } + return success; + } + + protected void allocateValidityBuffer(final long size) { + final int curSize = (int) size; + validityBuffer = allocator.buffer(curSize); + validityBuffer.readerIndex(0); + validityAllocationSizeInBytes = curSize; + validityBuffer.setZero(0, validityBuffer.capacity()); + } + + @Override + public void reAlloc() { + /* reallocate the validity buffer */ + reallocValidityBuffer(); + /* reallocate the offset, size, and data */ + super.reAlloc(); + } + + protected void reallocValidityAndSizeAndOffsetBuffers() { + reallocateBuffers(); + reallocValidityBuffer(); + } + + private void reallocValidityBuffer() { + final int currentBufferCapacity = checkedCastToInt(validityBuffer.capacity()); + long newAllocationSize = getNewAllocationSize(currentBufferCapacity); + + final ArrowBuf newBuf = allocator.buffer(newAllocationSize); + newBuf.setBytes(0, validityBuffer, 0, currentBufferCapacity); + newBuf.setZero(currentBufferCapacity, newBuf.capacity() - currentBufferCapacity); + validityBuffer.getReferenceManager().release(1); + validityBuffer = newBuf; + validityAllocationSizeInBytes = (int) newAllocationSize; + } + + private long getNewAllocationSize(int currentBufferCapacity) { + long newAllocationSize = currentBufferCapacity * 2L; + if (newAllocationSize == 0) { + if (validityAllocationSizeInBytes > 0) { + newAllocationSize = validityAllocationSizeInBytes; + } else { + newAllocationSize = getValidityBufferSizeFromCount(INITIAL_VALUE_ALLOCATION) * 2L; + } + } + newAllocationSize = CommonUtil.nextPowerOfTwo(newAllocationSize); + assert newAllocationSize >= 1; + + if (newAllocationSize > MAX_ALLOCATION_SIZE) { + throw new OversizedAllocationException("Unable to expand the buffer"); + } + return newAllocationSize; + } + + @Override + public void copyFromSafe(int inIndex, int outIndex, ValueVector from) { + // TODO: https://github.com/apache/arrow/issues/41270 + throw new UnsupportedOperationException( + "ListViewVector does not support copyFromSafe operation yet."); + } + + @Override + public void copyFrom(int inIndex, int outIndex, ValueVector from) { + // TODO: https://github.com/apache/arrow/issues/41270 + throw new UnsupportedOperationException( + "ListViewVector does not support copyFrom operation yet."); + } + + @Override + public FieldVector getDataVector() { + return vector; + } + + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return 
getTransferPair(ref, allocator, null); + } + + @Override + public TransferPair getTransferPair(Field field, BufferAllocator allocator) { + return getTransferPair(field, allocator, null); + } + + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator, CallBack callBack) { + // TODO: https://github.com/apache/arrow/issues/41269 + throw new UnsupportedOperationException( + "ListVector does not support getTransferPair(String, BufferAllocator, CallBack) yet"); + } + + @Override + public TransferPair getTransferPair(Field field, BufferAllocator allocator, CallBack callBack) { + // TODO: https://github.com/apache/arrow/issues/41269 + throw new UnsupportedOperationException( + "ListVector does not support getTransferPair(Field, BufferAllocator, CallBack) yet"); + } + + @Override + public TransferPair makeTransferPair(ValueVector target) { + // TODO: https://github.com/apache/arrow/issues/41269 + throw new UnsupportedOperationException( + "ListVector does not support makeTransferPair(ValueVector) yet"); + } + + @Override + public long getValidityBufferAddress() { + return validityBuffer.memoryAddress(); + } + + @Override + public long getDataBufferAddress() { + throw new UnsupportedOperationException(); + } + + @Override + public long getOffsetBufferAddress() { + return offsetBuffer.memoryAddress(); + } + + @Override + public ArrowBuf getValidityBuffer() { + return validityBuffer; + } + + @Override + public ArrowBuf getDataBuffer() { + throw new UnsupportedOperationException(); + } + + @Override + public ArrowBuf getOffsetBuffer() { + return offsetBuffer; + } + + public ArrowBuf getSizeBuffer() { + return sizeBuffer; + } + + public long getSizeBufferAddress() { + return sizeBuffer.memoryAddress(); + } + + /** + * Get the hash code for the element at the given index. + * @param index position of the element + * @return hash code for the element at the given index + */ + @Override + public int hashCode(int index) { + return hashCode(index, null); + } + + /** + * Get the hash code for the element at the given index. + * @param index position of the element + * @param hasher hasher to use + * @return hash code for the element at the given index + */ + @Override + public int hashCode(int index, ArrowBufHasher hasher) { + if (isSet(index) == 0) { + return ArrowBufPointer.NULL_HASH_CODE; + } + int hash = 0; + final int start = offsetBuffer.getInt(index * OFFSET_WIDTH); + final int end = sizeBuffer.getInt(index * OFFSET_WIDTH); + for (int i = start; i < end; i++) { + hash = ByteFunctionHelpers.combineHash(hash, vector.hashCode(i, hasher)); + } + return hash; + } + + @Override + public OUT accept(VectorVisitor visitor, IN value) { + throw new UnsupportedOperationException(); + } + + @Override + protected FieldReader getReaderImpl() { + // TODO: https://github.com/apache/arrow/issues/41569 + throw new UnsupportedOperationException( + "ListViewVector does not support getReaderImpl operation yet."); + } + + @Override + public UnionListReader getReader() { + // TODO: https://github.com/apache/arrow/issues/41569 + throw new UnsupportedOperationException( + "ListViewVector does not support getReader operation yet."); + } + + /** + * Get the size (number of bytes) of underlying buffers used by this + * vector. + * @return size of underlying buffers. 
+ */ + @Override + public int getBufferSize() { + if (valueCount == 0) { + return 0; + } + final int offsetBufferSize = valueCount * OFFSET_WIDTH; + final int sizeBufferSize = valueCount * SIZE_WIDTH; + final int validityBufferSize = getValidityBufferSizeFromCount(valueCount); + return offsetBufferSize + sizeBufferSize + validityBufferSize + vector.getBufferSize(); + } + + /** + * Get the size (number of bytes) of underlying buffers used by this. + * @param valueCount the number of values to assume this vector contains + * @return size of underlying buffers. + */ + @Override + public int getBufferSizeFor(int valueCount) { + if (valueCount == 0) { + return 0; + } + final int validityBufferSize = getValidityBufferSizeFromCount(valueCount); + + return super.getBufferSizeFor(valueCount) + validityBufferSize; + } + + /** + * Get the field associated with the list view vector. + * @return the field + */ + @Override + public Field getField() { + if (field.getChildren().contains(getDataVector().getField())) { + return field; + } + field = new Field(field.getName(), field.getFieldType(), Collections.singletonList(getDataVector().getField())); + return field; + } + + /** + * Get the minor type for the vector. + * @return the minor type + */ + @Override + public MinorType getMinorType() { + return MinorType.LISTVIEW; + } + + /** + * Clear the vector data. + */ + @Override + public void clear() { + // calling superclass clear method which is releasing the sizeBufer and offsetBuffer + super.clear(); + validityBuffer = releaseBuffer(validityBuffer); + } + + /** + * Release the buffers associated with this vector. + */ + @Override + public void reset() { + super.reset(); + validityBuffer.setZero(0, validityBuffer.capacity()); + } + + /** + * Return the underlying buffers associated with this vector. Note that this doesn't + * impact the reference counts for this buffer, so it only should be used for in-context + * access. Also note that this buffer changes regularly, thus + * external classes shouldn't hold a reference to it (unless they change it). + * + * @param clear Whether to clear vector before returning, the buffers will still be refcounted + * but the returned array will be the only reference to them + * @return The underlying {@link ArrowBuf buffers} that is used by this + * vector instance. + */ + @Override + public ArrowBuf[] getBuffers(boolean clear) { + setReaderAndWriterIndex(); + final ArrowBuf[] buffers; + if (getBufferSize() == 0) { + buffers = new ArrowBuf[0]; + } else { + List list = new ArrayList<>(); + // the order must be validity, offset and size buffers + list.add(validityBuffer); + list.add(offsetBuffer); + list.add(sizeBuffer); + list.addAll(Arrays.asList(vector.getBuffers(false))); + buffers = list.toArray(new ArrowBuf[list.size()]); + } + if (clear) { + for (ArrowBuf buffer : buffers) { + buffer.getReferenceManager().retain(); + } + clear(); + } + return buffers; + } + + /** + * Get the element in the list view vector at a particular index. + * @param index position of the element + * @return Object at given position + */ + @Override + public List getObject(int index) { + if (isSet(index) == 0) { + return null; + } + final List vals = new JsonStringArrayList<>(); + final int start = offsetBuffer.getInt(index * OFFSET_WIDTH); + final int end = start + sizeBuffer.getInt((index) * SIZE_WIDTH); + final ValueVector vv = getDataVector(); + for (int i = start; i < end; i++) { + vals.add(vv.getObject(i)); + } + + return vals; + } + + /** + * Check if an element at given index is null. 
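To make the size arithmetic in getBufferSize()/getBufferSizeFor() concrete, here is a standalone helper (illustrative only, assuming the 4-byte OFFSET_WIDTH and SIZE_WIDTH constants from BaseRepeatedValueViewVector):

    // Bytes consumed by a ListViewVector's own buffers; the child data vector is counted separately.
    static int ownBufferBytes(int valueCount) {
      int offsetBytes   = valueCount * 4;        // OFFSET_WIDTH = 4
      int sizeBytes     = valueCount * 4;        // SIZE_WIDTH   = 4
      int validityBytes = (valueCount + 7) / 8;  // one validity bit per entry, rounded up
      return offsetBytes + sizeBytes + validityBytes;  // e.g. 20 + 20 + 1 = 41 for valueCount = 5
    }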
+ * + * @param index position of an element + * @return true if an element at given index is null, false otherwise + */ + @Override + public boolean isNull(int index) { + return (isSet(index) == 0); + } + + /** + * Check if an element at given index is an empty list. + * @param index position of an element + * @return true if an element at given index is an empty list or NULL, false otherwise + */ + @Override + public boolean isEmpty(int index) { + if (isNull(index)) { + return true; + } else { + return sizeBuffer.getInt(index * SIZE_WIDTH) == 0; + } + } + + /** + * Same as {@link #isNull(int)}. + * + * @param index position of the element + * @return 1 if element at given index is not null, 0 otherwise + */ + public int isSet(int index) { + final int byteIndex = index >> 3; + final byte b = validityBuffer.getByte(byteIndex); + final int bitIndex = index & 7; + return (b >> bitIndex) & 0x01; + } + + /** + * Get the number of elements that are null in the vector. + * + * @return the number of null elements. + */ + @Override + public int getNullCount() { + return BitVectorHelper.getNullCount(validityBuffer, valueCount); + } + + /** + * Get the value capacity by considering validity and offset capacity. + * Note that the size buffer capacity is not considered here since it has + * the same capacity as the offset buffer. + * + * @return the value capacity + */ + @Override + public int getValueCapacity() { + return getValidityAndOffsetValueCapacity(); + } + + private int getValidityAndSizeValueCapacity() { + final int offsetValueCapacity = Math.max(getOffsetBufferValueCapacity(), 0); + final int sizeValueCapacity = Math.max(getSizeBufferValueCapacity(), 0); + return Math.min(offsetValueCapacity, sizeValueCapacity); + } + + private int getValidityAndOffsetValueCapacity() { + final int offsetValueCapacity = Math.max(getOffsetBufferValueCapacity(), 0); + return Math.min(offsetValueCapacity, getValidityBufferValueCapacity()); + } + + private int getValidityBufferValueCapacity() { + return capAtMaxInt(validityBuffer.capacity() * 8); + } + + /** + * Set the element at the given index to null. + * @param index the value to change + */ + @Override + public void setNull(int index) { + while (index >= getValidityAndSizeValueCapacity()) { + reallocValidityAndSizeAndOffsetBuffers(); + } + + offsetBuffer.setInt(index * OFFSET_WIDTH, 0); + sizeBuffer.setInt(index * SIZE_WIDTH, 0); + BitVectorHelper.unsetBit(validityBuffer, index); + } + + /** + * Start new value in the ListView vector. + * + * @param index index of the value to start + * @return offset of the new value + */ + @Override + public int startNewValue(int index) { + while (index >= getValidityAndSizeValueCapacity()) { + reallocValidityAndSizeAndOffsetBuffers(); + } + + if (index > 0) { + final int prevOffset = getLengthOfChildVectorByIndex(index); + offsetBuffer.setInt(index * OFFSET_WIDTH, prevOffset); + } + + BitVectorHelper.setBit(validityBuffer, index); + return offsetBuffer.getInt(index * OFFSET_WIDTH); + } + + /** + * Validate the invariants of the offset and size buffers. 
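The bit arithmetic in isSet() maps an element index onto the validity bitmap; restated as a standalone helper (illustrative only, not part of the vector code):

    // One validity bit per entry, least-significant bit first within each byte.
    static int validityBitAt(byte[] validityBitmap, int index) {
      int byteIndex = index >> 3;  // index / 8 -> which byte holds the bit
      int bitIndex  = index & 7;   // index % 8 -> which bit within that byte
      return (validityBitmap[byteIndex] >> bitIndex) & 0x01;
    }
    // For example, index 10 lives in byte 1, bit 2.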
+ * 0 <= offsets[i] <= length of the child array + * 0 <= offsets[i] + size[i] <= length of the child array + * @param offset the offset at a given index + * @param size the size at a given index + */ + private void validateInvariants(int offset, int size) { + if (offset < 0) { + throw new IllegalArgumentException("Offset cannot be negative"); + } + + if (size < 0) { + throw new IllegalArgumentException("Size cannot be negative"); + } + + // 0 <= offsets[i] <= length of the child array + if (offset > this.vector.getValueCount()) { + throw new IllegalArgumentException("Offset is out of bounds."); + } + + // 0 <= offsets[i] + size[i] <= length of the child array + if (offset + size > this.vector.getValueCount()) { + throw new IllegalArgumentException("Offset + size <= length of the child array."); + } + } + + /** + * Set the offset at the given index. + * Make sure to use this function after updating `field` vector and using `setValidity` + * @param index index of the value to set + * @param value value to set + */ + public void setOffset(int index, int value) { + validateInvariants(value, sizeBuffer.getInt(index * SIZE_WIDTH)); + + offsetBuffer.setInt(index * OFFSET_WIDTH, value); + } + + /** + * Set the size at the given index. + * Make sure to use this function after using `setOffset`. + * @param index index of the value to set + * @param value value to set + */ + public void setSize(int index, int value) { + validateInvariants(offsetBuffer.getInt(index * SIZE_WIDTH), value); + + sizeBuffer.setInt(index * SIZE_WIDTH, value); + } + + /** + * Set the validity at the given index. + * @param index index of the value to set + * @param value value to set (0 for unset and 1 for a set) + */ + public void setValidity(int index, int value) { + if (value == 0) { + BitVectorHelper.unsetBit(validityBuffer, index); + } else { + BitVectorHelper.setBit(validityBuffer, index); + } + } + + @Override + public void setValueCount(int valueCount) { + this.valueCount = valueCount; + if (valueCount > 0) { + while (valueCount > getValidityAndSizeValueCapacity()) { + /* check if validity and offset buffers need to be re-allocated */ + reallocValidityAndSizeAndOffsetBuffers(); + } + } + /* valueCount for the data vector is the current end offset */ + final int childValueCount = (valueCount == 0) ? 0 : getLengthOfChildVector(); + /* set the value count of data vector and this will take care of + * checking whether data buffer needs to be reallocated. + */ + vector.setValueCount(childValueCount); + } + + @Override + public int getElementStartIndex(int index) { + return offsetBuffer.getInt(index * OFFSET_WIDTH); + } + + @Override + public int getElementEndIndex(int index) { + return sizeBuffer.getInt(index * OFFSET_WIDTH); + } + + @Override + public AddOrGetResult addOrGetVector(FieldType fieldType) { + AddOrGetResult result = super.addOrGetVector(fieldType); + invalidateReader(); + return result; + } + + @Override + public UnionVector promoteToUnion() { + UnionVector vector = new UnionVector("$data$", allocator, /* field type*/ null, callBack); + replaceDataVector(vector); + invalidateReader(); + if (callBack != null) { + callBack.doWork(); + } + return vector; + } + + private void invalidateReader() { + reader = null; + } + + @Deprecated + @Override + public List getFieldInnerVectors() { + throw new UnsupportedOperationException("There are no inner vectors. 
Use getFieldBuffers"); + } + + public UnionListViewWriter getWriter() { + return new UnionListViewWriter(this); + } + + @Override + public int getValueCount() { + return valueCount; + } + + /** + * Get the density of this ListVector. + * @return density + */ + public double getDensity() { + if (valueCount == 0) { + return 0.0D; + } + final double totalListSize = getLengthOfChildVector(); + return totalListSize / valueCount; + } + + /** + * Validating ListViewVector creation based on the specification guideline. + */ + @Override + public void validate() { + for (int i = 0; i < valueCount; i++) { + final int offset = offsetBuffer.getInt(i * OFFSET_WIDTH); + final int size = sizeBuffer.getInt(i * SIZE_WIDTH); + validateInvariants(offset, size); + } + } + + /** + * End the current value. + * + * @param index index of the value to end + * @param size number of elements in the list that was written + */ + public void endValue(int index, int size) { + sizeBuffer.setInt(index * SIZE_WIDTH, size); + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/PromotableWriter.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/PromotableWriter.java index 7f724829ef1eb..c59b997286d2d 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/PromotableWriter.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/PromotableWriter.java @@ -29,6 +29,7 @@ import org.apache.arrow.vector.complex.FixedSizeListVector; import org.apache.arrow.vector.complex.LargeListVector; import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.ListViewVector; import org.apache.arrow.vector.complex.MapVector; import org.apache.arrow.vector.complex.StructVector; import org.apache.arrow.vector.complex.UnionVector; @@ -54,6 +55,7 @@ public class PromotableWriter extends AbstractPromotableFieldWriter { private final AbstractStructVector parentContainer; private final ListVector listVector; + private final ListViewVector listViewVector; private final FixedSizeListVector fixedListVector; private final LargeListVector largeListVector; private final NullableStructWriterFactory nullableStructWriterFactory; @@ -94,6 +96,7 @@ public PromotableWriter( NullableStructWriterFactory nullableStructWriterFactory) { this.parentContainer = parentContainer; this.listVector = null; + this.listViewVector = null; this.fixedListVector = null; this.largeListVector = null; this.nullableStructWriterFactory = nullableStructWriterFactory; @@ -142,6 +145,27 @@ public PromotableWriter( ListVector listVector, NullableStructWriterFactory nullableStructWriterFactory) { this.listVector = listVector; + this.listViewVector = null; + this.parentContainer = null; + this.fixedListVector = null; + this.largeListVector = null; + this.nullableStructWriterFactory = nullableStructWriterFactory; + init(v); + } + + /** + * Constructs a new instance. + * + * @param v The vector to initialize the writer with. + * @param listViewVector The vector that serves as a parent of v. + * @param nullableStructWriterFactory The factory to create the delegate writer. 
+ */ + public PromotableWriter( + ValueVector v, + ListViewVector listViewVector, + NullableStructWriterFactory nullableStructWriterFactory) { + this.listViewVector = listViewVector; + this.listVector = null; this.parentContainer = null; this.fixedListVector = null; this.largeListVector = null; @@ -163,6 +187,7 @@ public PromotableWriter( this.fixedListVector = fixedListVector; this.parentContainer = null; this.listVector = null; + this.listViewVector = null; this.largeListVector = null; this.nullableStructWriterFactory = nullableStructWriterFactory; init(v); @@ -183,6 +208,7 @@ public PromotableWriter( this.fixedListVector = null; this.parentContainer = null; this.listVector = null; + this.listViewVector = null; this.nullableStructWriterFactory = nullableStructWriterFactory; init(v); } @@ -280,6 +306,8 @@ protected FieldWriter getWriter(MinorType type, ArrowType arrowType) { v = listVector.addOrGetVector(fieldType).getVector(); } else if (fixedListVector != null) { v = fixedListVector.addOrGetVector(fieldType).getVector(); + } else if (listViewVector != null) { + v = listViewVector.addOrGetVector(fieldType).getVector(); } else { v = largeListVector.addOrGetVector(fieldType).getVector(); } @@ -322,6 +350,8 @@ private FieldWriter promoteToUnion() { unionVector = fixedListVector.promoteToUnion(); } else if (largeListVector != null) { unionVector = largeListVector.promoteToUnion(); + } else if (listViewVector != null) { + unionVector = listViewVector.promoteToUnion(); } unionVector.addVector((FieldVector) tp.getTo()); writer = new UnionWriter(unionVector, nullableStructWriterFactory); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java b/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java index 89d8441d42aa9..e10a65e3b2c53 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java @@ -71,6 +71,7 @@ import org.apache.arrow.vector.complex.FixedSizeListVector; import org.apache.arrow.vector.complex.LargeListVector; import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.ListViewVector; import org.apache.arrow.vector.complex.MapVector; import org.apache.arrow.vector.complex.StructVector; import org.apache.arrow.vector.complex.UnionVector; @@ -136,6 +137,7 @@ import org.apache.arrow.vector.types.pojo.ArrowType.LargeBinary; import org.apache.arrow.vector.types.pojo.ArrowType.LargeUtf8; import org.apache.arrow.vector.types.pojo.ArrowType.List; +import org.apache.arrow.vector.types.pojo.ArrowType.ListView; import org.apache.arrow.vector.types.pojo.ArrowType.Map; import org.apache.arrow.vector.types.pojo.ArrowType.Null; import org.apache.arrow.vector.types.pojo.ArrowType.Struct; @@ -692,6 +694,20 @@ public FieldWriter getNewFieldWriter(ValueVector vector) { return new UnionListWriter((ListVector) vector); } }, + LISTVIEW(ListView.INSTANCE) { + @Override + public FieldVector getNewVector( + Field field, + BufferAllocator allocator, + CallBack schemaChangeCallback) { + return new ListViewVector(field.getName(), allocator, field.getFieldType(), schemaChangeCallback); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new UnionListWriter((ListVector) vector); + } + }, LARGELIST(ArrowType.LargeList.INSTANCE) { @Override public FieldVector getNewVector(Field field, BufferAllocator allocator, CallBack schemaChangeCallback) { @@ -1064,6 +1080,11 @@ public MinorType visit(Duration type) { return 
MinorType.DURATION; } + @Override + public MinorType visit(ListView type) { + return MinorType.LISTVIEW; + } + @Override public MinorType visit(ExtensionType type) { return MinorType.EXTENSIONTYPE; diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestListViewVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestListViewVector.java new file mode 100644 index 0000000000000..e64ed77b1eb9f --- /dev/null +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestListViewVector.java @@ -0,0 +1,1651 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.BaseRepeatedValueVector; +import org.apache.arrow.vector.complex.BaseRepeatedValueViewVector; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.ListViewVector; +import org.apache.arrow.vector.complex.impl.UnionListViewWriter; +import org.apache.arrow.vector.complex.impl.UnionListWriter; +import org.apache.arrow.vector.holders.DurationHolder; +import org.apache.arrow.vector.holders.TimeStampMilliTZHolder; +import org.apache.arrow.vector.types.TimeUnit; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +public class TestListViewVector { + + private BufferAllocator allocator; + + @BeforeEach + public void init() { + allocator = new DirtyRootAllocator(Long.MAX_VALUE, (byte) 100); + } + + @AfterEach + public void terminate() throws Exception { + allocator.close(); + } + + @Test + public void testBasicListViewVector() { + try (ListViewVector listViewVector = ListViewVector.empty("sourceVector", allocator)) { + UnionListViewWriter listViewWriter = listViewVector.getWriter(); + + /* allocate memory */ + listViewWriter.allocate(); + + /* write the first list at index 0 */ + listViewWriter.setPosition(0); + listViewWriter.startList(); + + listViewWriter.bigInt().writeBigInt(12); + listViewWriter.bigInt().writeBigInt(-7); + listViewWriter.bigInt().writeBigInt(25); + listViewWriter.endList(); + + /* the second list at index 1 is null (we are not 
setting any)*/ + + /* write the third list at index 2 */ + listViewWriter.setPosition(2); + listViewWriter.startList(); + + listViewWriter.bigInt().writeBigInt(0); + listViewWriter.bigInt().writeBigInt(-127); + listViewWriter.bigInt().writeBigInt(127); + listViewWriter.bigInt().writeBigInt(50); + listViewWriter.endList(); + + /* write the fourth list at index 3 (empty list) */ + listViewWriter.setPosition(3); + listViewWriter.startList(); + listViewWriter.endList(); + + /* write the fifth list at index 4 */ + listViewWriter.setPosition(4); + listViewWriter.startList(); + listViewWriter.bigInt().writeBigInt(1); + listViewWriter.bigInt().writeBigInt(2); + listViewWriter.bigInt().writeBigInt(3); + listViewWriter.bigInt().writeBigInt(4); + listViewWriter.endList(); + + listViewVector.setValueCount(5); + // check value count + assertEquals(5, listViewVector.getValueCount()); + + /* get vector at index 0 -- the value is a BigIntVector*/ + final ArrowBuf offSetBuffer = listViewVector.getOffsetBuffer(); + final ArrowBuf sizeBuffer = listViewVector.getSizeBuffer(); + final FieldVector dataVec = listViewVector.getDataVector(); + + // check offset buffer + assertEquals(0, offSetBuffer.getInt(0 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(0, offSetBuffer.getInt(1 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(3, offSetBuffer.getInt(2 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(7, offSetBuffer.getInt(3 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(7, offSetBuffer.getInt(4 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + + // check size buffer + assertEquals(3, sizeBuffer.getInt(0 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(0, sizeBuffer.getInt(1 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(4, sizeBuffer.getInt(2 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(0, sizeBuffer.getInt(3 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(4, sizeBuffer.getInt(4 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + + // check data vector + assertEquals(12, ((BigIntVector) dataVec).get(0)); + assertEquals(-7, ((BigIntVector) dataVec).get(1)); + assertEquals(25, ((BigIntVector) dataVec).get(2)); + assertEquals(0, ((BigIntVector) dataVec).get(3)); + assertEquals(-127, ((BigIntVector) dataVec).get(4)); + assertEquals(127, ((BigIntVector) dataVec).get(5)); + assertEquals(50, ((BigIntVector) dataVec).get(6)); + assertEquals(1, ((BigIntVector) dataVec).get(7)); + assertEquals(2, ((BigIntVector) dataVec).get(8)); + assertEquals(3, ((BigIntVector) dataVec).get(9)); + assertEquals(4, ((BigIntVector) dataVec).get(10)); + + listViewVector.validate(); + } + } + + @Test + public void testImplicitNullVectors() { + try (ListViewVector listViewVector = ListViewVector.empty("sourceVector", allocator)) { + UnionListViewWriter listViewWriter = listViewVector.getWriter(); + /* allocate memory */ + listViewWriter.allocate(); + + final ArrowBuf offSetBuffer = listViewVector.getOffsetBuffer(); + final ArrowBuf sizeBuffer = listViewVector.getSizeBuffer(); + + /* write the first list at index 0 */ + listViewWriter.setPosition(0); + listViewWriter.startList(); + + listViewWriter.bigInt().writeBigInt(12); + listViewWriter.bigInt().writeBigInt(-7); + listViewWriter.bigInt().writeBigInt(25); + listViewWriter.endList(); + + int offSet0 = offSetBuffer.getInt(0 * BaseRepeatedValueViewVector.OFFSET_WIDTH); + int size0 = sizeBuffer.getInt(0 * BaseRepeatedValueViewVector.SIZE_WIDTH); + + // after the first list is written, + // the initial offset 
must be 0, + // the size must be 3 (as there are 3 elements in the array), + // the lastSet must be 0 since, the first list is written at index 0. + + assertEquals(0, offSet0); + assertEquals(3, size0); + + listViewWriter.setPosition(5); + listViewWriter.startList(); + + // writing the 6th list at index 5, + // and the list items from index 1 through 4 are not populated. + // but since there is a gap between the 0th and 5th list, in terms + // of buffer allocation, the offset and size buffers must be updated + // to reflect the implicit null vectors. + + for (int i = 1; i < 5; i++) { + int offSet = offSetBuffer.getInt(i * BaseRepeatedValueViewVector.OFFSET_WIDTH); + int size = sizeBuffer.getInt(i * BaseRepeatedValueViewVector.SIZE_WIDTH); + // Since the list is not written, the offset and size must equal to child vector's size + // i.e., 3, and size should be 0 as the list is not written. + // And the last set value is the value currently being written, which is 5. + assertEquals(0, offSet); + assertEquals(0, size); + } + + listViewWriter.bigInt().writeBigInt(12); + listViewWriter.bigInt().writeBigInt(25); + listViewWriter.endList(); + + int offSet5 = offSetBuffer.getInt(5 * BaseRepeatedValueViewVector.OFFSET_WIDTH); + int size5 = sizeBuffer.getInt(5 * BaseRepeatedValueViewVector.SIZE_WIDTH); + + assertEquals(3, offSet5); + assertEquals(2, size5); + + listViewWriter.setPosition(10); + listViewWriter.startList(); + + // writing the 11th list at index 10, + // and the list items from index 6 through 10 are not populated. + // but since there is a gap between the 5th and 11th list, in terms + // of buffer allocation, the offset and size buffers must be updated + // to reflect the implicit null vectors. + for (int i = 6; i < 10; i++) { + int offSet = offSetBuffer.getInt(i * BaseRepeatedValueViewVector.OFFSET_WIDTH); + int size = sizeBuffer.getInt(i * BaseRepeatedValueViewVector.SIZE_WIDTH); + // Since the list is not written, the offset and size must equal to 0 + // and size should be 0 as the list is not written. + // And the last set value is the value currently being written, which is 10. + assertEquals(0, offSet); + assertEquals(0, size); + } + + listViewWriter.bigInt().writeBigInt(12); + listViewWriter.endList(); + + int offSet11 = offSetBuffer.getInt(10 * BaseRepeatedValueViewVector.OFFSET_WIDTH); + int size11 = sizeBuffer.getInt(10 * BaseRepeatedValueViewVector.SIZE_WIDTH); + + assertEquals(5, offSet11); + assertEquals(1, size11); + + listViewVector.setValueCount(11); + + listViewVector.validate(); + } + } + + @Test + public void testNestedListViewVector() { + try (ListViewVector listViewVector = ListViewVector.empty("sourceVector", allocator)) { + UnionListViewWriter listViewWriter = listViewVector.getWriter(); + + /* allocate memory */ + listViewWriter.allocate(); + + /* the dataVector that backs a listVector will also be a + * listVector for this test. 
+ */ + + /* write one or more inner lists at index 0 */ + listViewWriter.setPosition(0); + listViewWriter.startList(); + + listViewWriter.list().startList(); + listViewWriter.list().bigInt().writeBigInt(50); + listViewWriter.list().bigInt().writeBigInt(100); + listViewWriter.list().bigInt().writeBigInt(200); + listViewWriter.list().endList(); + + listViewWriter.list().startList(); + listViewWriter.list().bigInt().writeBigInt(75); + listViewWriter.list().bigInt().writeBigInt(125); + listViewWriter.list().bigInt().writeBigInt(150); + listViewWriter.list().bigInt().writeBigInt(175); + listViewWriter.list().endList(); + + listViewWriter.endList(); + + /* write one or more inner lists at index 1 */ + listViewWriter.setPosition(1); + listViewWriter.startList(); + + listViewWriter.list().startList(); + listViewWriter.list().bigInt().writeBigInt(10); + listViewWriter.list().endList(); + + listViewWriter.list().startList(); + listViewWriter.list().bigInt().writeBigInt(15); + listViewWriter.list().bigInt().writeBigInt(20); + listViewWriter.list().endList(); + + listViewWriter.list().startList(); + listViewWriter.list().bigInt().writeBigInt(25); + listViewWriter.list().bigInt().writeBigInt(30); + listViewWriter.list().bigInt().writeBigInt(35); + listViewWriter.list().endList(); + + listViewWriter.endList(); + + listViewVector.setValueCount(2); + + // [[[50,100,200],[75,125,150,175]], [[10],[15,20],[25,30,35]]] + + assertEquals(2, listViewVector.getValueCount()); + + /* get listViewVector value at index 0 -- the value itself is a listViewVector */ + Object result = listViewVector.getObject(0); + ArrayList> resultSet = (ArrayList>) result; + ArrayList list; + + assertEquals(2, resultSet.size()); /* 2 inner lists at index 0 */ + assertEquals(3, resultSet.get(0).size()); /* size of the first inner list */ + assertEquals(4, resultSet.get(1).size()); /* size of the second inner list */ + + list = resultSet.get(0); + assertEquals(Long.valueOf(50), list.get(0)); + assertEquals(Long.valueOf(100), list.get(1)); + assertEquals(Long.valueOf(200), list.get(2)); + + list = resultSet.get(1); + assertEquals(Long.valueOf(75), list.get(0)); + assertEquals(Long.valueOf(125), list.get(1)); + assertEquals(Long.valueOf(150), list.get(2)); + assertEquals(Long.valueOf(175), list.get(3)); + + /* get listViewVector value at index 1 -- the value itself is a listViewVector */ + result = listViewVector.getObject(1); + resultSet = (ArrayList>) result; + + assertEquals(3, resultSet.size()); /* 3 inner lists at index 1 */ + assertEquals(1, resultSet.get(0).size()); /* size of the first inner list */ + assertEquals(2, resultSet.get(1).size()); /* size of the second inner list */ + assertEquals(3, resultSet.get(2).size()); /* size of the third inner list */ + + list = resultSet.get(0); + assertEquals(Long.valueOf(10), list.get(0)); + + list = resultSet.get(1); + assertEquals(Long.valueOf(15), list.get(0)); + assertEquals(Long.valueOf(20), list.get(1)); + + list = resultSet.get(2); + assertEquals(Long.valueOf(25), list.get(0)); + assertEquals(Long.valueOf(30), list.get(1)); + assertEquals(Long.valueOf(35), list.get(2)); + + /* check underlying bitVector */ + assertFalse(listViewVector.isNull(0)); + assertFalse(listViewVector.isNull(1)); + + final ArrowBuf offSetBuffer = listViewVector.getOffsetBuffer(); + final ArrowBuf sizeBuffer = listViewVector.getSizeBuffer(); + + // check offset buffer + assertEquals(0, offSetBuffer.getInt(0 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(2, offSetBuffer.getInt(1 * 
BaseRepeatedValueViewVector.OFFSET_WIDTH)); + + // check size buffer + assertEquals(2, sizeBuffer.getInt(0 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(3, sizeBuffer.getInt(1 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + + listViewVector.validate(); + } + } + + @Test + public void testNestedListVector() throws Exception { + try (ListViewVector listViewVector = ListViewVector.empty("sourceVector", allocator)) { + + MinorType listType = MinorType.LISTVIEW; + MinorType scalarType = MinorType.BIGINT; + + listViewVector.addOrGetVector(FieldType.nullable(listType.getType())); + + ListViewVector innerList1 = (ListViewVector) listViewVector.getDataVector(); + innerList1.addOrGetVector(FieldType.nullable(listType.getType())); + + ListViewVector innerList2 = (ListViewVector) innerList1.getDataVector(); + innerList2.addOrGetVector(FieldType.nullable(listType.getType())); + + ListViewVector innerList3 = (ListViewVector) innerList2.getDataVector(); + innerList3.addOrGetVector(FieldType.nullable(listType.getType())); + + ListViewVector innerList4 = (ListViewVector) innerList3.getDataVector(); + innerList4.addOrGetVector(FieldType.nullable(listType.getType())); + + ListViewVector innerList5 = (ListViewVector) innerList4.getDataVector(); + innerList5.addOrGetVector(FieldType.nullable(listType.getType())); + + ListViewVector innerList6 = (ListViewVector) innerList5.getDataVector(); + innerList6.addOrGetVector(FieldType.nullable(scalarType.getType())); + + listViewVector.setInitialCapacity(128); + + listViewVector.validate(); + } + } + + private void setValuesInBuffer(int[] bufValues, ArrowBuf buffer, long bufWidth) { + for (int i = 0; i < bufValues.length; i++) { + buffer.setInt(i * bufWidth, bufValues[i]); + } + } + + /* + * Setting up the buffers directly needs to be validated with the base method used in + * the ListVector class where we use the approach of startList(), + * write to the child vector and endList(). + *

+ * To support this, we have to consider the following scenarios: + *

+ * 1. Only using directly buffer-based inserts. + * 2. Default list insertion followed by buffer-based inserts. + * 3. Buffer-based inserts followed by default list insertion. + */ + + /* Setting up buffers directly would require the following steps to be taken + * 0. Allocate buffers in listViewVector by calling `allocateNew` method. + * 1. Initialize the child vector using `initializeChildrenFromFields` method. + * 2. Set values in the child vector. + * 3. Set validity, offset and size buffers using `setValidity`, + * `setOffset` and `setSize` methods. + * 4. Set value count using `setValueCount` method. + */ + @Test + public void testBasicListViewSet() { + + try (ListViewVector listViewVector = ListViewVector.empty("sourceVector", allocator)) { + // Allocate buffers in listViewVector by calling `allocateNew` method. + listViewVector.allocateNew(); + + // Initialize the child vector using `initializeChildrenFromFields` method. + FieldType fieldType = new FieldType(true, new ArrowType.Int(64, true), + null, null); + Field field = new Field("child-vector", fieldType, null); + listViewVector.initializeChildrenFromFields(Collections.singletonList(field)); + + // Set values in the child vector. + FieldVector fieldVector = listViewVector.getDataVector(); + fieldVector.clear(); + + BigIntVector childVector = (BigIntVector) fieldVector; + childVector.allocateNew(7); + + childVector.set(0, 12); + childVector.set(1, -7); + childVector.set(2, 25); + childVector.set(3, 0); + childVector.set(4, -127); + childVector.set(5, 127); + childVector.set(6, 50); + + childVector.setValueCount(7); + + // Set validity, offset and size buffers using `setValidity`, + // `setOffset` and `setSize` methods. + listViewVector.setOffset(0, 0); + listViewVector.setOffset(1, 3); + listViewVector.setOffset(2, 3); + listViewVector.setOffset(3, 7); + + listViewVector.setSize(0, 3); + listViewVector.setSize(1, 0); + listViewVector.setSize(2, 4); + listViewVector.setSize(3, 0); + + listViewVector.setValidity(0, 1); + listViewVector.setValidity(1, 0); + listViewVector.setValidity(2, 1); + listViewVector.setValidity(3, 1); + + // Set value count using `setValueCount` method. 
+ listViewVector.setValueCount(4); + + final ArrowBuf offSetBuffer = listViewVector.getOffsetBuffer(); + final ArrowBuf sizeBuffer = listViewVector.getSizeBuffer(); + + // check offset buffer + assertEquals(0, offSetBuffer.getInt(0 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(3, offSetBuffer.getInt(1 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(3, offSetBuffer.getInt(2 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(7, offSetBuffer.getInt(3 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + + // check size buffer + assertEquals(3, sizeBuffer.getInt(0 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(0, sizeBuffer.getInt(1 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(4, sizeBuffer.getInt(2 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(0, sizeBuffer.getInt(3 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + + // check values + assertEquals(12, ((BigIntVector) listViewVector.getDataVector()).get(0)); + assertEquals(-7, ((BigIntVector) listViewVector.getDataVector()).get(1)); + assertEquals(25, ((BigIntVector) listViewVector.getDataVector()).get(2)); + assertEquals(0, ((BigIntVector) listViewVector.getDataVector()).get(3)); + assertEquals(-127, ((BigIntVector) listViewVector.getDataVector()).get(4)); + assertEquals(127, ((BigIntVector) listViewVector.getDataVector()).get(5)); + assertEquals(50, ((BigIntVector) listViewVector.getDataVector()).get(6)); + + listViewVector.validate(); + } + } + + @Test + public void testBasicListViewSetNested() { + // Expected listview + // [[[50,100,200],[75,125,150,175]],[[10],[15,20],[25,30,35]]] + + // Setting child vector + // [[50,100,200],[75,125,150,175],[10],[15,20],[25,30,35]] + try (ListViewVector listViewVector = ListViewVector.empty("sourceVector", allocator)) { + // Allocate buffers in listViewVector by calling `allocateNew` method. + listViewVector.allocateNew(); + + // Initialize the child vector using `initializeChildrenFromFields` method. + FieldType fieldType = new FieldType(true, new ArrowType.List(), + null, null); + FieldType childFieldType = new FieldType(true, new ArrowType.Int(64, true), + null, null); + Field childField = new Field("child-vector", childFieldType, null); + List children = new ArrayList<>(); + children.add(childField); + Field field = new Field("child-vector", fieldType, children); + listViewVector.initializeChildrenFromFields(Collections.singletonList(field)); + + // Set values in the child vector. 
+ FieldVector fieldVector = listViewVector.getDataVector(); + fieldVector.clear(); + + ListVector childVector = (ListVector) fieldVector; + UnionListWriter listWriter = childVector.getWriter(); + listWriter.allocate(); + + listWriter.setPosition(0); + listWriter.startList(); + + listWriter.bigInt().writeBigInt(50); + listWriter.bigInt().writeBigInt(100); + listWriter.bigInt().writeBigInt(200); + + listWriter.endList(); + + listWriter.setPosition(1); + listWriter.startList(); + + listWriter.bigInt().writeBigInt(75); + listWriter.bigInt().writeBigInt(125); + listWriter.bigInt().writeBigInt(150); + listWriter.bigInt().writeBigInt(175); + + listWriter.endList(); + + listWriter.setPosition(2); + listWriter.startList(); + + listWriter.bigInt().writeBigInt(10); + + listWriter.endList(); + + listWriter.startList(); + listWriter.setPosition(3); + + listWriter.bigInt().writeBigInt(15); + listWriter.bigInt().writeBigInt(20); + + listWriter.endList(); + + listWriter.startList(); + listWriter.setPosition(4); + + listWriter.bigInt().writeBigInt(25); + listWriter.bigInt().writeBigInt(30); + listWriter.bigInt().writeBigInt(35); + + listWriter.endList(); + + childVector.setValueCount(5); + + // Set validity, offset and size buffers using `setValidity`, + // `setOffset` and `setSize` methods. + + listViewVector.setValidity(0, 1); + listViewVector.setValidity(1, 1); + + listViewVector.setOffset(0, 0); + listViewVector.setOffset(1, 2); + + listViewVector.setSize(0, 2); + listViewVector.setSize(1, 3); + + // Set value count using `setValueCount` method. + listViewVector.setValueCount(2); + + assertEquals(2, listViewVector.getValueCount()); + + /* get listViewVector value at index 0 -- the value itself is a listViewVector */ + Object result = listViewVector.getObject(0); + ArrayList> resultSet = (ArrayList>) result; + ArrayList list; + + assertEquals(2, resultSet.size()); /* 2 inner lists at index 0 */ + assertEquals(3, resultSet.get(0).size()); /* size of the first inner list */ + assertEquals(4, resultSet.get(1).size()); /* size of the second inner list */ + + list = resultSet.get(0); + assertEquals(Long.valueOf(50), list.get(0)); + assertEquals(Long.valueOf(100), list.get(1)); + assertEquals(Long.valueOf(200), list.get(2)); + + list = resultSet.get(1); + assertEquals(Long.valueOf(75), list.get(0)); + assertEquals(Long.valueOf(125), list.get(1)); + assertEquals(Long.valueOf(150), list.get(2)); + assertEquals(Long.valueOf(175), list.get(3)); + + /* get listViewVector value at index 1 -- the value itself is a listViewVector */ + result = listViewVector.getObject(1); + resultSet = (ArrayList>) result; + + assertEquals(3, resultSet.size()); /* 3 inner lists at index 1 */ + assertEquals(1, resultSet.get(0).size()); /* size of the first inner list */ + assertEquals(2, resultSet.get(1).size()); /* size of the second inner list */ + assertEquals(3, resultSet.get(2).size()); /* size of the third inner list */ + + list = resultSet.get(0); + assertEquals(Long.valueOf(10), list.get(0)); + + list = resultSet.get(1); + assertEquals(Long.valueOf(15), list.get(0)); + assertEquals(Long.valueOf(20), list.get(1)); + + list = resultSet.get(2); + assertEquals(Long.valueOf(25), list.get(0)); + assertEquals(Long.valueOf(30), list.get(1)); + assertEquals(Long.valueOf(35), list.get(2)); + + /* check underlying bitVector */ + assertFalse(listViewVector.isNull(0)); + assertFalse(listViewVector.isNull(1)); + + final ArrowBuf offSetBuffer = listViewVector.getOffsetBuffer(); + final ArrowBuf sizeBuffer = listViewVector.getSizeBuffer(); + 
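+ // Expected view layout after the direct sets above: offsets [0, 2] and sizes [2, 3], pointing into the 5 inner lists written to the child ListVector.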
+ // check offset buffer + assertEquals(0, offSetBuffer.getInt(0 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(2, offSetBuffer.getInt(1 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + + // check size buffer + assertEquals(2, sizeBuffer.getInt(0 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(3, sizeBuffer.getInt(1 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + + listViewVector.validate(); + } + } + + @Test + public void testBasicListViewSetWithListViewWriter() { + try (ListViewVector listViewVector = ListViewVector.empty("sourceVector", allocator)) { + // Allocate buffers in listViewVector by calling `allocateNew` method. + listViewVector.allocateNew(); + + // Initialize the child vector using `initializeChildrenFromFields` method. + FieldType fieldType = new FieldType(true, new ArrowType.Int(64, true), + null, null); + Field field = new Field("child-vector", fieldType, null); + listViewVector.initializeChildrenFromFields(Collections.singletonList(field)); + + // Set values in the child vector. + FieldVector fieldVector = listViewVector.getDataVector(); + fieldVector.clear(); + + BigIntVector childVector = (BigIntVector) fieldVector; + childVector.allocateNew(7); + + childVector.set(0, 12); + childVector.set(1, -7); + childVector.set(2, 25); + childVector.set(3, 0); + childVector.set(4, -127); + childVector.set(5, 127); + childVector.set(6, 50); + + childVector.setValueCount(7); + + // Set validity, offset and size buffers using `setValidity`, + // `setOffset` and `setSize` methods. + + listViewVector.setValidity(0, 1); + listViewVector.setValidity(1, 0); + listViewVector.setValidity(2, 1); + listViewVector.setValidity(3, 1); + + listViewVector.setOffset(0, 0); + listViewVector.setOffset(1, 3); + listViewVector.setOffset(2, 3); + listViewVector.setOffset(3, 7); + + listViewVector.setSize(0, 3); + listViewVector.setSize(1, 0); + listViewVector.setSize(2, 4); + listViewVector.setSize(3, 0); + + // Set value count using `setValueCount` method. 
+ listViewVector.setValueCount(4); + + final ArrowBuf offSetBuffer = listViewVector.getOffsetBuffer(); + final ArrowBuf sizeBuffer = listViewVector.getSizeBuffer(); + + // check offset buffer + assertEquals(0, offSetBuffer.getInt(0 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(3, offSetBuffer.getInt(1 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(3, offSetBuffer.getInt(2 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(7, offSetBuffer.getInt(3 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + + // check size buffer + assertEquals(3, sizeBuffer.getInt(0 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(0, sizeBuffer.getInt(1 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(4, sizeBuffer.getInt(2 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(0, sizeBuffer.getInt(3 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + + // check values + assertEquals(12, ((BigIntVector) listViewVector.getDataVector()).get(0)); + assertEquals(-7, ((BigIntVector) listViewVector.getDataVector()).get(1)); + assertEquals(25, ((BigIntVector) listViewVector.getDataVector()).get(2)); + assertEquals(0, ((BigIntVector) listViewVector.getDataVector()).get(3)); + assertEquals(-127, ((BigIntVector) listViewVector.getDataVector()).get(4)); + assertEquals(127, ((BigIntVector) listViewVector.getDataVector()).get(5)); + assertEquals(50, ((BigIntVector) listViewVector.getDataVector()).get(6)); + + UnionListViewWriter listViewWriter = listViewVector.getWriter(); + + listViewWriter.setPosition(4); + listViewWriter.startList(); + + listViewWriter.bigInt().writeBigInt(121); + listViewWriter.bigInt().writeBigInt(-71); + listViewWriter.bigInt().writeBigInt(251); + listViewWriter.endList(); + + listViewVector.setValueCount(5); + + // check offset buffer + assertEquals(0, offSetBuffer.getInt(0 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(3, offSetBuffer.getInt(1 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(3, offSetBuffer.getInt(2 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(7, offSetBuffer.getInt(3 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(7, offSetBuffer.getInt(4 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + + // check size buffer + assertEquals(3, sizeBuffer.getInt(0 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(0, sizeBuffer.getInt(1 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(4, sizeBuffer.getInt(2 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(0, sizeBuffer.getInt(3 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(3, sizeBuffer.getInt(4 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + + // check values + assertEquals(12, ((BigIntVector) listViewVector.getDataVector()).get(0)); + assertEquals(-7, ((BigIntVector) listViewVector.getDataVector()).get(1)); + assertEquals(25, ((BigIntVector) listViewVector.getDataVector()).get(2)); + assertEquals(0, ((BigIntVector) listViewVector.getDataVector()).get(3)); + assertEquals(-127, ((BigIntVector) listViewVector.getDataVector()).get(4)); + assertEquals(127, ((BigIntVector) listViewVector.getDataVector()).get(5)); + assertEquals(50, ((BigIntVector) listViewVector.getDataVector()).get(6)); + assertEquals(121, ((BigIntVector) listViewVector.getDataVector()).get(7)); + assertEquals(-71, ((BigIntVector) listViewVector.getDataVector()).get(8)); + assertEquals(251, ((BigIntVector) listViewVector.getDataVector()).get(9)); + + listViewVector.validate(); + } + } + + @Test + public void testGetBufferAddress() throws 
Exception { + try (ListViewVector listViewVector = ListViewVector.empty("vector", allocator)) { + + UnionListViewWriter listViewWriter = listViewVector.getWriter(); + boolean error = false; + + listViewWriter.allocate(); + + listViewWriter.setPosition(0); + listViewWriter.startList(); + listViewWriter.bigInt().writeBigInt(50); + listViewWriter.bigInt().writeBigInt(100); + listViewWriter.bigInt().writeBigInt(200); + listViewWriter.endList(); + + listViewWriter.setPosition(1); + listViewWriter.startList(); + listViewWriter.bigInt().writeBigInt(250); + listViewWriter.bigInt().writeBigInt(300); + listViewWriter.endList(); + + listViewVector.setValueCount(2); + + /* check listVector contents */ + Object result = listViewVector.getObject(0); + ArrayList resultSet = (ArrayList) result; + assertEquals(3, resultSet.size()); + assertEquals(Long.valueOf(50), resultSet.get(0)); + assertEquals(Long.valueOf(100), resultSet.get(1)); + assertEquals(Long.valueOf(200), resultSet.get(2)); + + result = listViewVector.getObject(1); + resultSet = (ArrayList) result; + assertEquals(2, resultSet.size()); + assertEquals(Long.valueOf(250), resultSet.get(0)); + assertEquals(Long.valueOf(300), resultSet.get(1)); + + List buffers = listViewVector.getFieldBuffers(); + + long bitAddress = listViewVector.getValidityBufferAddress(); + long offsetAddress = listViewVector.getOffsetBufferAddress(); + long sizeAddress = listViewVector.getSizeBufferAddress(); + + try { + listViewVector.getDataBufferAddress(); + } catch (UnsupportedOperationException ue) { + error = true; + } finally { + assertTrue(error); + } + + assertEquals(3, buffers.size()); + assertEquals(bitAddress, buffers.get(0).memoryAddress()); + assertEquals(offsetAddress, buffers.get(1).memoryAddress()); + assertEquals(sizeAddress, buffers.get(2).memoryAddress()); + + /* (3+2)/2 */ + assertEquals(2.5, listViewVector.getDensity(), 0); + listViewVector.validate(); + } + } + + @Test + public void testConsistentChildName() throws Exception { + try (ListViewVector listViewVector = ListViewVector.empty("sourceVector", allocator)) { + String emptyListStr = listViewVector.getField().toString(); + assertTrue(emptyListStr.contains(ListVector.DATA_VECTOR_NAME)); + + listViewVector.addOrGetVector(FieldType.nullable(MinorType.INT.getType())); + String emptyVectorStr = listViewVector.getField().toString(); + assertTrue(emptyVectorStr.contains(ListVector.DATA_VECTOR_NAME)); + } + } + + @Test + public void testSetInitialCapacity() { + try (final ListViewVector vector = ListViewVector.empty("", allocator)) { + vector.addOrGetVector(FieldType.nullable(MinorType.INT.getType())); + + vector.setInitialCapacity(512); + vector.allocateNew(); + assertEquals(512, vector.getValueCapacity()); + assertTrue(vector.getDataVector().getValueCapacity() >= 512); + + vector.setInitialCapacity(512, 4); + vector.allocateNew(); + assertEquals(512, vector.getValueCapacity()); + assertTrue(vector.getDataVector().getValueCapacity() >= 512 * 4); + + vector.setInitialCapacity(512, 0.1); + vector.allocateNew(); + assertEquals(512, vector.getValueCapacity()); + assertTrue(vector.getDataVector().getValueCapacity() >= 51); + + vector.setInitialCapacity(512, 0.01); + vector.allocateNew(); + assertEquals(512, vector.getValueCapacity()); + assertTrue(vector.getDataVector().getValueCapacity() >= 5); + + vector.setInitialCapacity(5, 0.1); + vector.allocateNew(); + assertEquals(8, vector.getValueCapacity()); + assertTrue(vector.getDataVector().getValueCapacity() >= 1); + + vector.validate(); + } + } + + @Test + 
public void testClearAndReuse() { + try (final ListViewVector vector = ListViewVector.empty("listview", allocator)) { + BigIntVector bigIntVector = + (BigIntVector) vector.addOrGetVector(FieldType.nullable(MinorType.BIGINT.getType())).getVector(); + vector.setInitialCapacity(10); + vector.allocateNew(); + + vector.startNewValue(0); + bigIntVector.setSafe(0, 7); + vector.endValue(0, 1); + vector.startNewValue(1); + bigIntVector.setSafe(1, 8); + vector.endValue(1, 1); + vector.setValueCount(2); + + Object result = vector.getObject(0); + ArrayList resultSet = (ArrayList) result; + assertEquals(Long.valueOf(7), resultSet.get(0)); + + result = vector.getObject(1); + resultSet = (ArrayList) result; + assertEquals(Long.valueOf(8), resultSet.get(0)); + + // Clear and release the buffers to trigger a realloc when adding next value + vector.clear(); + + // The list vector should reuse a buffer when reallocating the offset buffer + vector.startNewValue(0); + bigIntVector.setSafe(0, 7); + vector.endValue(0, 1); + vector.startNewValue(1); + bigIntVector.setSafe(1, 8); + vector.endValue(1, 1); + vector.setValueCount(2); + + result = vector.getObject(0); + resultSet = (ArrayList) result; + assertEquals(Long.valueOf(7), resultSet.get(0)); + + result = vector.getObject(1); + resultSet = (ArrayList) result; + assertEquals(Long.valueOf(8), resultSet.get(0)); + + vector.validate(); + } + } + + @Test + public void testWriterGetField() { + // adopted from ListVector test cases + try (final ListViewVector vector = ListViewVector.empty("listview", allocator)) { + + UnionListViewWriter writer = vector.getWriter(); + writer.allocate(); + + //set some values + writer.startList(); + writer.integer().writeInt(1); + writer.integer().writeInt(2); + writer.endList(); + vector.setValueCount(2); + + Field expectedDataField = new Field(BaseRepeatedValueVector.DATA_VECTOR_NAME, + FieldType.nullable(new ArrowType.Int(32, true)), null); + Field expectedField = new Field(vector.getName(), FieldType.nullable(ArrowType.ListView.INSTANCE), + Arrays.asList(expectedDataField)); + + assertEquals(expectedField, writer.getField()); + + vector.validate(); + } + } + + @Test + public void testWriterUsingHolderGetTimestampMilliTZField() { + // adopted from ListVector test cases + try (final ListViewVector vector = ListViewVector.empty("listview", allocator)) { + org.apache.arrow.vector.complex.writer.FieldWriter writer = vector.getWriter(); + writer.allocate(); + + TimeStampMilliTZHolder holder = new TimeStampMilliTZHolder(); + holder.timezone = "SomeFakeTimeZone"; + writer.startList(); + holder.value = 12341234L; + writer.timeStampMilliTZ().write(holder); + holder.value = 55555L; + writer.timeStampMilliTZ().write(holder); + + // Writing with a different timezone should throw + holder.timezone = "AsdfTimeZone"; + holder.value = 77777; + IllegalArgumentException ex = assertThrows(IllegalArgumentException.class, + () -> writer.timeStampMilliTZ().write(holder)); + assertEquals( + "holder.timezone: AsdfTimeZone not equal to vector timezone: SomeFakeTimeZone", + ex.getMessage()); + + writer.endList(); + vector.setValueCount(1); + + Field expectedDataField = new Field(BaseRepeatedValueVector.DATA_VECTOR_NAME, + FieldType.nullable(new ArrowType.Timestamp(TimeUnit.MILLISECOND, "SomeFakeTimeZone")), null); + Field expectedField = new Field(vector.getName(), FieldType.nullable(ArrowType.ListView.INSTANCE), + Arrays.asList(expectedDataField)); + + assertEquals(expectedField, writer.getField()); + + vector.validate(); + } + } + + @Test + public void 
testWriterGetDurationField() { + // adopted from ListVector test cases + try (final ListViewVector vector = ListViewVector.empty("listview", allocator)) { + org.apache.arrow.vector.complex.writer.FieldWriter writer = vector.getWriter(); + writer.allocate(); + + DurationHolder durationHolder = new DurationHolder(); + durationHolder.unit = TimeUnit.MILLISECOND; + + writer.startList(); + durationHolder.value = 812374L; + writer.duration().write(durationHolder); + durationHolder.value = 143451L; + writer.duration().write(durationHolder); + + // Writing with a different unit should throw + durationHolder.unit = TimeUnit.SECOND; + durationHolder.value = 8888888; + IllegalArgumentException ex = assertThrows(IllegalArgumentException.class, + () -> writer.duration().write(durationHolder)); + assertEquals( + "holder.unit: SECOND not equal to vector unit: MILLISECOND", ex.getMessage()); + + writer.endList(); + vector.setValueCount(1); + + Field expectedDataField = new Field(BaseRepeatedValueVector.DATA_VECTOR_NAME, + FieldType.nullable(new ArrowType.Duration(TimeUnit.MILLISECOND)), null); + Field expectedField = new Field(vector.getName(), + FieldType.nullable(ArrowType.ListView.INSTANCE), + Arrays.asList(expectedDataField)); + + assertEquals(expectedField, writer.getField()); + + vector.validate(); + } + } + + @Test + public void testClose() throws Exception { + try (final ListViewVector vector = ListViewVector.empty("listview", allocator)) { + + UnionListViewWriter writer = vector.getWriter(); + writer.allocate(); + + //set some values + writer.startList(); + writer.integer().writeInt(1); + writer.integer().writeInt(2); + writer.endList(); + vector.setValueCount(2); + + assertTrue(vector.getBufferSize() > 0); + assertTrue(vector.getDataVector().getBufferSize() > 0); + + writer.close(); + assertEquals(0, vector.getBufferSize()); + assertEquals(0, vector.getDataVector().getBufferSize()); + + vector.validate(); + } + } + + @Test + public void testGetBufferSizeFor() { + try (final ListViewVector vector = ListViewVector.empty("listview", allocator)) { + + UnionListViewWriter writer = vector.getWriter(); + writer.allocate(); + + //set some values + writeIntValues(writer, new int[] {1, 2}); + writeIntValues(writer, new int[] {3, 4}); + writeIntValues(writer, new int[] {5, 6}); + writeIntValues(writer, new int[] {7, 8, 9, 10}); + writeIntValues(writer, new int[] {11, 12, 13, 14}); + writer.setValueCount(5); + + IntVector dataVector = (IntVector) vector.getDataVector(); + int[] indices = new int[] {0, 2, 4, 6, 10, 14}; + + for (int valueCount = 1; valueCount <= 5; valueCount++) { + int validityBufferSize = BitVectorHelper.getValidityBufferSize(valueCount); + int offsetBufferSize = valueCount * BaseRepeatedValueViewVector.OFFSET_WIDTH; + int sizeBufferSize = valueCount * BaseRepeatedValueViewVector.SIZE_WIDTH; + + int expectedSize = validityBufferSize + offsetBufferSize + sizeBufferSize + + dataVector.getBufferSizeFor(indices[valueCount]); + assertEquals(expectedSize, vector.getBufferSizeFor(valueCount)); + } + vector.validate(); + } + } + + @Test + public void testIsEmpty() { + try (final ListViewVector vector = ListViewVector.empty("listview", allocator)) { + UnionListViewWriter writer = vector.getWriter(); + writer.allocate(); + + // set values [1,2], null, [], [5,6] + writeIntValues(writer, new int[] {1, 2}); + writer.setPosition(2); + writeIntValues(writer, new int[] {}); + writeIntValues(writer, new int[] {5, 6}); + writer.setValueCount(4); + + assertFalse(vector.isEmpty(0)); + 
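+ // index 1 was never written, so it is null; isEmpty() reports null entries as empty as well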
assertTrue(vector.isNull(1)); + assertTrue(vector.isEmpty(1)); + assertFalse(vector.isNull(2)); + assertTrue(vector.isEmpty(2)); + assertFalse(vector.isEmpty(3)); + + vector.validate(); + } + } + + @Test + public void testTotalCapacity() { + // adopted from ListVector test cases + final FieldType type = FieldType.nullable(MinorType.INT.getType()); + try (final ListViewVector vector = new ListViewVector("listview", allocator, type, null)) { + // Force the child vector to be allocated based on the type + // (this is a bad API: we have to track and repeat the type twice) + vector.addOrGetVector(type); + + // Specify the allocation size but do not allocate + vector.setInitialTotalCapacity(10, 100); + + // Finally, actually do the allocation + vector.allocateNewSafe(); + + // Note: allocator rounds up and can be greater than the requested allocation. + assertTrue(vector.getValueCapacity() >= 10); + assertTrue(vector.getDataVector().getValueCapacity() >= 100); + + vector.validate(); + } + } + + @Test + public void testSetNull1() { + try (ListViewVector vector = ListViewVector.empty("listview", allocator)) { + UnionListViewWriter writer = vector.getWriter(); + writer.allocate(); + + writer.setPosition(0); + writer.startList(); + writer.bigInt().writeBigInt(10); + writer.bigInt().writeBigInt(20); + writer.endList(); + + vector.setNull(1); + + writer.setPosition(2); + writer.startList(); + writer.bigInt().writeBigInt(30); + writer.bigInt().writeBigInt(40); + writer.endList(); + + vector.setNull(3); + vector.setNull(4); + + writer.setPosition(5); + writer.startList(); + writer.bigInt().writeBigInt(50); + writer.bigInt().writeBigInt(60); + writer.endList(); + + vector.setValueCount(6); + + assertFalse(vector.isNull(0)); + assertTrue(vector.isNull(1)); + assertFalse(vector.isNull(2)); + assertTrue(vector.isNull(3)); + assertTrue(vector.isNull(4)); + assertFalse(vector.isNull(5)); + + // validate buffers + + final ArrowBuf validityBuffer = vector.getValidityBuffer(); + final ArrowBuf offsetBuffer = vector.getOffsetBuffer(); + final ArrowBuf sizeBuffer = vector.getSizeBuffer(); + + assertEquals(1, BitVectorHelper.get(validityBuffer, 0)); + assertEquals(0, BitVectorHelper.get(validityBuffer, 1)); + assertEquals(1, BitVectorHelper.get(validityBuffer, 2)); + assertEquals(0, BitVectorHelper.get(validityBuffer, 3)); + assertEquals(0, BitVectorHelper.get(validityBuffer, 4)); + assertEquals(1, BitVectorHelper.get(validityBuffer, 5)); + + assertEquals(0, offsetBuffer.getInt(0 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(0, offsetBuffer.getInt(1 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(2, offsetBuffer.getInt(2 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(0, offsetBuffer.getInt(3 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(0, offsetBuffer.getInt(4 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(4, offsetBuffer.getInt(5 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + + assertEquals(2, sizeBuffer.getInt(0 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(0, sizeBuffer.getInt(1 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(2, sizeBuffer.getInt(2 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(0, sizeBuffer.getInt(3 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(0, sizeBuffer.getInt(4 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(2, sizeBuffer.getInt(5 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + + // validate values + + Object result = vector.getObject(0); + ArrayList resultSet 
= (ArrayList) result; + assertEquals(2, resultSet.size()); + assertEquals(Long.valueOf(10), resultSet.get(0)); + assertEquals(Long.valueOf(20), resultSet.get(1)); + + result = vector.getObject(2); + resultSet = (ArrayList) result; + assertEquals(2, resultSet.size()); + assertEquals(Long.valueOf(30), resultSet.get(0)); + assertEquals(Long.valueOf(40), resultSet.get(1)); + + result = vector.getObject(5); + resultSet = (ArrayList) result; + assertEquals(2, resultSet.size()); + assertEquals(Long.valueOf(50), resultSet.get(0)); + assertEquals(Long.valueOf(60), resultSet.get(1)); + + vector.validate(); + } + } + + @Test + public void testSetNull2() { + try (ListViewVector vector = ListViewVector.empty("listview", allocator)) { + // validate setting nulls first and then writing values + UnionListViewWriter writer = vector.getWriter(); + writer.allocate(); + + vector.setNull(0); + vector.setNull(2); + vector.setNull(4); + + writer.setPosition(1); + writer.startList(); + writer.bigInt().writeBigInt(10); + writer.bigInt().writeBigInt(20); + writer.bigInt().writeBigInt(30); + writer.endList(); + + writer.setPosition(3); + writer.startList(); + writer.bigInt().writeBigInt(40); + writer.bigInt().writeBigInt(50); + writer.endList(); + + writer.setPosition(5); + writer.startList(); + writer.bigInt().writeBigInt(60); + writer.bigInt().writeBigInt(70); + writer.bigInt().writeBigInt(80); + writer.endList(); + + vector.setValueCount(6); + + assertTrue(vector.isNull(0)); + assertFalse(vector.isNull(1)); + assertTrue(vector.isNull(2)); + assertFalse(vector.isNull(3)); + assertTrue(vector.isNull(4)); + assertFalse(vector.isNull(5)); + + // validate buffers + + final ArrowBuf validityBuffer = vector.getValidityBuffer(); + final ArrowBuf offsetBuffer = vector.getOffsetBuffer(); + final ArrowBuf sizeBuffer = vector.getSizeBuffer(); + + assertEquals(0, BitVectorHelper.get(validityBuffer, 0)); + assertEquals(1, BitVectorHelper.get(validityBuffer, 1)); + assertEquals(0, BitVectorHelper.get(validityBuffer, 2)); + assertEquals(1, BitVectorHelper.get(validityBuffer, 3)); + assertEquals(0, BitVectorHelper.get(validityBuffer, 4)); + assertEquals(1, BitVectorHelper.get(validityBuffer, 5)); + + assertEquals(0, offsetBuffer.getInt(0 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(0, offsetBuffer.getInt(1 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(0, offsetBuffer.getInt(2 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(3, offsetBuffer.getInt(3 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(0, offsetBuffer.getInt(4 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(5, offsetBuffer.getInt(5 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + + assertEquals(0, sizeBuffer.getInt(0 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(3, sizeBuffer.getInt(1 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(0, sizeBuffer.getInt(2 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(2, sizeBuffer.getInt(3 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(0, sizeBuffer.getInt(4 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(3, sizeBuffer.getInt(5 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + + // validate values + + Object result = vector.getObject(1); + ArrayList resultSet = (ArrayList) result; + assertEquals(3, resultSet.size()); + assertEquals(Long.valueOf(10), resultSet.get(0)); + assertEquals(Long.valueOf(20), resultSet.get(1)); + assertEquals(Long.valueOf(30), resultSet.get(2)); + + result = vector.getObject(3); + resultSet 
= (ArrayList) result; + assertEquals(2, resultSet.size()); + assertEquals(Long.valueOf(40), resultSet.get(0)); + assertEquals(Long.valueOf(50), resultSet.get(1)); + + result = vector.getObject(5); + resultSet = (ArrayList) result; + assertEquals(3, resultSet.size()); + assertEquals(Long.valueOf(60), resultSet.get(0)); + assertEquals(Long.valueOf(70), resultSet.get(1)); + assertEquals(Long.valueOf(80), resultSet.get(2)); + + vector.validate(); + } + } + + @Test + public void testSetNull3() { + try (ListViewVector vector = ListViewVector.empty("listview", allocator)) { + // validate setting values first and then writing nulls + UnionListViewWriter writer = vector.getWriter(); + writer.allocate(); + + writer.setPosition(1); + writer.startList(); + writer.bigInt().writeBigInt(10); + writer.bigInt().writeBigInt(20); + writer.bigInt().writeBigInt(30); + writer.endList(); + + writer.setPosition(3); + writer.startList(); + writer.bigInt().writeBigInt(40); + writer.bigInt().writeBigInt(50); + writer.endList(); + + writer.setPosition(5); + writer.startList(); + writer.bigInt().writeBigInt(60); + writer.bigInt().writeBigInt(70); + writer.bigInt().writeBigInt(80); + writer.endList(); + + vector.setNull(0); + vector.setNull(2); + vector.setNull(4); + + vector.setValueCount(6); + + assertTrue(vector.isNull(0)); + assertFalse(vector.isNull(1)); + assertTrue(vector.isNull(2)); + assertFalse(vector.isNull(3)); + assertTrue(vector.isNull(4)); + assertFalse(vector.isNull(5)); + + // validate buffers + + final ArrowBuf validityBuffer = vector.getValidityBuffer(); + final ArrowBuf offsetBuffer = vector.getOffsetBuffer(); + final ArrowBuf sizeBuffer = vector.getSizeBuffer(); + + assertEquals(0, BitVectorHelper.get(validityBuffer, 0)); + assertEquals(1, BitVectorHelper.get(validityBuffer, 1)); + assertEquals(0, BitVectorHelper.get(validityBuffer, 2)); + assertEquals(1, BitVectorHelper.get(validityBuffer, 3)); + assertEquals(0, BitVectorHelper.get(validityBuffer, 4)); + assertEquals(1, BitVectorHelper.get(validityBuffer, 5)); + + assertEquals(0, offsetBuffer.getInt(0 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(0, offsetBuffer.getInt(1 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(0, offsetBuffer.getInt(2 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(3, offsetBuffer.getInt(3 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(0, offsetBuffer.getInt(4 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(5, offsetBuffer.getInt(5 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + + assertEquals(0, sizeBuffer.getInt(0 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(3, sizeBuffer.getInt(1 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(0, sizeBuffer.getInt(2 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(2, sizeBuffer.getInt(3 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(0, sizeBuffer.getInt(4 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(3, sizeBuffer.getInt(5 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + + // validate values + + Object result = vector.getObject(1); + ArrayList resultSet = (ArrayList) result; + assertEquals(3, resultSet.size()); + assertEquals(Long.valueOf(10), resultSet.get(0)); + assertEquals(Long.valueOf(20), resultSet.get(1)); + assertEquals(Long.valueOf(30), resultSet.get(2)); + + result = vector.getObject(3); + resultSet = (ArrayList) result; + assertEquals(2, resultSet.size()); + assertEquals(Long.valueOf(40), resultSet.get(0)); + assertEquals(Long.valueOf(50), 
resultSet.get(1)); + + result = vector.getObject(5); + resultSet = (ArrayList) result; + assertEquals(3, resultSet.size()); + assertEquals(Long.valueOf(60), resultSet.get(0)); + assertEquals(Long.valueOf(70), resultSet.get(1)); + assertEquals(Long.valueOf(80), resultSet.get(2)); + + vector.validate(); + } + } + + @Test + public void testOverWrite1() { + try (ListViewVector vector = ListViewVector.empty("listview", allocator)) { + UnionListViewWriter writer = vector.getWriter(); + writer.allocate(); + + writer.setPosition(0); + writer.startList(); + writer.bigInt().writeBigInt(10); + writer.bigInt().writeBigInt(20); + writer.bigInt().writeBigInt(30); + writer.endList(); + + writer.setPosition(1); + writer.startList(); + writer.bigInt().writeBigInt(40); + writer.bigInt().writeBigInt(50); + writer.endList(); + + vector.setValueCount(2); + + writer.setPosition(0); + writer.startList(); + writer.bigInt().writeBigInt(60); + writer.bigInt().writeBigInt(70); + writer.endList(); + + writer.setPosition(1); + writer.startList(); + writer.bigInt().writeBigInt(80); + writer.bigInt().writeBigInt(90); + writer.endList(); + + vector.setValueCount(2); + + Object result = vector.getObject(0); + ArrayList resultSet = (ArrayList) result; + assertEquals(2, resultSet.size()); + assertEquals(Long.valueOf(60), resultSet.get(0)); + assertEquals(Long.valueOf(70), resultSet.get(1)); + + result = vector.getObject(1); + resultSet = (ArrayList) result; + assertEquals(2, resultSet.size()); + assertEquals(Long.valueOf(80), resultSet.get(0)); + assertEquals(Long.valueOf(90), resultSet.get(1)); + + vector.validate(); + } + } + + @Test + public void testOverwriteWithNull() { + try (ListViewVector vector = ListViewVector.empty("listview", allocator)) { + UnionListViewWriter writer = vector.getWriter(); + writer.allocate(); + + ArrowBuf offsetBuffer = vector.getOffsetBuffer(); + ArrowBuf sizeBuffer = vector.getSizeBuffer(); + + writer.setPosition(0); + writer.startList(); + writer.bigInt().writeBigInt(10); + writer.bigInt().writeBigInt(20); + writer.bigInt().writeBigInt(30); + writer.endList(); + + writer.setPosition(1); + writer.startList(); + writer.bigInt().writeBigInt(40); + writer.bigInt().writeBigInt(50); + writer.endList(); + + vector.setValueCount(2); + + assertEquals(0, offsetBuffer.getInt(0 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(3, offsetBuffer.getInt(1 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + + assertEquals(3, sizeBuffer.getInt(0 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(2, sizeBuffer.getInt(1 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + + vector.setNull(0); + + assertEquals(0, offsetBuffer.getInt(0 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(0, sizeBuffer.getInt(0 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + + vector.setNull(1); + + assertEquals(0, offsetBuffer.getInt(0 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(0, sizeBuffer.getInt(0 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + + assertTrue(vector.isNull(0)); + assertTrue(vector.isNull(1)); + + writer.setPosition(0); + writer.startList(); + writer.bigInt().writeBigInt(60); + writer.bigInt().writeBigInt(70); + writer.endList(); + + assertEquals(0, offsetBuffer.getInt(0 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(2, sizeBuffer.getInt(0 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + + writer.setPosition(1); + writer.startList(); + writer.bigInt().writeBigInt(80); + writer.bigInt().writeBigInt(90); + writer.endList(); + + assertEquals(2, offsetBuffer.getInt(1 * 
BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(2, sizeBuffer.getInt(1 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + + vector.setValueCount(2); + + assertFalse(vector.isNull(0)); + assertFalse(vector.isNull(1)); + + Object result = vector.getObject(0); + ArrayList resultSet = (ArrayList) result; + assertEquals(2, resultSet.size()); + assertEquals(Long.valueOf(60), resultSet.get(0)); + assertEquals(Long.valueOf(70), resultSet.get(1)); + + result = vector.getObject(1); + resultSet = (ArrayList) result; + assertEquals(2, resultSet.size()); + assertEquals(Long.valueOf(80), resultSet.get(0)); + assertEquals(Long.valueOf(90), resultSet.get(1)); + + vector.validate(); + } + } + + @Test + public void testOutOfOrderOffset1() { + // [[12, -7, 25], null, [0, -127, 127, 50], [], [50, 12]] + try (ListViewVector listViewVector = ListViewVector.empty("listview", allocator)) { + // Allocate buffers in listViewVector by calling `allocateNew` method. + listViewVector.allocateNew(); + + // Initialize the child vector using `initializeChildrenFromFields` method. + + FieldType fieldType = new FieldType(true, new ArrowType.Int(16, true), + null, null); + Field field = new Field("child-vector", fieldType, null); + listViewVector.initializeChildrenFromFields(Collections.singletonList(field)); + + // Set values in the child vector. + FieldVector fieldVector = listViewVector.getDataVector(); + fieldVector.clear(); + + SmallIntVector childVector = (SmallIntVector) fieldVector; + + childVector.allocateNew(7); + + childVector.set(0, 0); + childVector.set(1, -127); + childVector.set(2, 127); + childVector.set(3, 50); + childVector.set(4, 12); + childVector.set(5, -7); + childVector.set(6, 25); + + childVector.setValueCount(7); + + // Set validity, offset and size buffers using `setValidity`, + // `setOffset` and `setSize` methods. + listViewVector.setValidity(0, 1); + listViewVector.setValidity(1, 0); + listViewVector.setValidity(2, 1); + listViewVector.setValidity(3, 1); + listViewVector.setValidity(4, 1); + + listViewVector.setOffset(0, 4); + listViewVector.setOffset(1, 7); + listViewVector.setOffset(2, 0); + listViewVector.setOffset(3, 0); + listViewVector.setOffset(4, 3); + + listViewVector.setSize(0, 3); + listViewVector.setSize(1, 0); + listViewVector.setSize(2, 4); + listViewVector.setSize(3, 0); + listViewVector.setSize(4, 2); + + // Set value count using `setValueCount` method. 
+ listViewVector.setValueCount(5); + + final ArrowBuf offSetBuffer = listViewVector.getOffsetBuffer(); + final ArrowBuf sizeBuffer = listViewVector.getSizeBuffer(); + + // check offset buffer + assertEquals(4, offSetBuffer.getInt(0 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(7, offSetBuffer.getInt(1 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(0, offSetBuffer.getInt(2 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(0, offSetBuffer.getInt(3 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + assertEquals(3, offSetBuffer.getInt(4 * BaseRepeatedValueViewVector.OFFSET_WIDTH)); + + // check size buffer + assertEquals(3, sizeBuffer.getInt(0 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(0, sizeBuffer.getInt(1 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(4, sizeBuffer.getInt(2 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(0, sizeBuffer.getInt(3 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + assertEquals(2, sizeBuffer.getInt(4 * BaseRepeatedValueViewVector.SIZE_WIDTH)); + + // check child vector + assertEquals(0, ((SmallIntVector) listViewVector.getDataVector()).get(0)); + assertEquals(-127, ((SmallIntVector) listViewVector.getDataVector()).get(1)); + assertEquals(127, ((SmallIntVector) listViewVector.getDataVector()).get(2)); + assertEquals(50, ((SmallIntVector) listViewVector.getDataVector()).get(3)); + assertEquals(12, ((SmallIntVector) listViewVector.getDataVector()).get(4)); + assertEquals(-7, ((SmallIntVector) listViewVector.getDataVector()).get(5)); + assertEquals(25, ((SmallIntVector) listViewVector.getDataVector()).get(6)); + + // check values + Object result = listViewVector.getObject(0); + ArrayList resultSet = (ArrayList) result; + assertEquals(3, resultSet.size()); + assertEquals(Short.valueOf("12"), resultSet.get(0)); + assertEquals(Short.valueOf("-7"), resultSet.get(1)); + assertEquals(Short.valueOf("25"), resultSet.get(2)); + + assertTrue(listViewVector.isNull(1)); + + result = listViewVector.getObject(2); + resultSet = (ArrayList) result; + assertEquals(4, resultSet.size()); + assertEquals(Short.valueOf("0"), resultSet.get(0)); + assertEquals(Short.valueOf("-127"), resultSet.get(1)); + assertEquals(Short.valueOf("127"), resultSet.get(2)); + assertEquals(Short.valueOf("50"), resultSet.get(3)); + + assertTrue(listViewVector.isEmpty(3)); + + result = listViewVector.getObject(4); + resultSet = (ArrayList) result; + assertEquals(2, resultSet.size()); + assertEquals(Short.valueOf("50"), resultSet.get(0)); + assertEquals(Short.valueOf("12"), resultSet.get(1)); + + listViewVector.validate(); + } + } + + private void writeIntValues(UnionListViewWriter writer, int[] values) { + writer.startList(); + for (int v: values) { + writer.integer().writeInt(v); + } + writer.endList(); + } + +} diff --git a/java/vector/src/test/java/org/apache/arrow/vector/compare/TestRangeEqualsVisitor.java b/java/vector/src/test/java/org/apache/arrow/vector/compare/TestRangeEqualsVisitor.java index ab8c6c634891e..c3e7ef8bf8b08 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/compare/TestRangeEqualsVisitor.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/compare/TestRangeEqualsVisitor.java @@ -18,8 +18,8 @@ package org.apache.arrow.vector.compare; import static org.apache.arrow.vector.testing.ValueVectorDataPopulator.setVector; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static 
org.junit.jupiter.api.Assertions.assertTrue; import java.nio.charset.Charset; import java.util.Arrays; @@ -33,6 +33,7 @@ import org.apache.arrow.vector.IntVector; import org.apache.arrow.vector.LargeVarCharVector; import org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.ViewVarCharVector; import org.apache.arrow.vector.ZeroVector; import org.apache.arrow.vector.compare.util.ValueEpsilonEqualizers; import org.apache.arrow.vector.complex.DenseUnionVector; @@ -53,16 +54,16 @@ import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.FieldType; -import org.junit.After; -import org.junit.Before; -import org.junit.Ignore; -import org.junit.Test; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; public class TestRangeEqualsVisitor { private BufferAllocator allocator; - @Before + @BeforeEach public void init() { allocator = new RootAllocator(Long.MAX_VALUE); } @@ -71,8 +72,11 @@ public void init() { private static final byte[] STR1 = "AAAAA1".getBytes(utf8Charset); private static final byte[] STR2 = "BBBBBBBBB2".getBytes(utf8Charset); private static final byte[] STR3 = "CCCC3".getBytes(utf8Charset); + private static final byte[] STR4 = "12345678901234A".getBytes(utf8Charset); + private static final byte[] STR5 = "A2345678901234ABC".getBytes(utf8Charset); + private static final byte[] STR6 = "AB45678901234ABCD".getBytes(utf8Charset); - @After + @AfterEach public void terminate() throws Exception { allocator.close(); } @@ -132,6 +136,55 @@ public void testBaseVariableVectorRangeEquals() { } } + @Test + public void testBaseVariableViewVectorRangeEquals() { + try (final ViewVarCharVector vector1 = new ViewVarCharVector("varchar", allocator); + final ViewVarCharVector vector2 = new ViewVarCharVector("varchar", allocator)) { + + setVector(vector1, STR1, STR2, STR4, STR3, STR2, STR5, STR1, STR6, STR1, STR2, STR4); + setVector(vector2, STR1, STR2, STR4, STR3, STR2, STR5, STR1, STR6, STR1, STR2, STR4); + + RangeEqualsVisitor visitor = new RangeEqualsVisitor(vector1, vector2); + // inclusion of long string in the middle + assertTrue(visitor.rangeEquals(new Range(1, 1, 3))); + assertFalse(visitor.rangeEquals(new Range(0, 1, 4))); + // inclusion of long string at the start + assertTrue(visitor.rangeEquals(new Range(2, 2, 4))); + assertFalse(visitor.rangeEquals(new Range(2, 5, 4))); + // inclusion of long string at the end + assertTrue(visitor.rangeEquals(new Range(4, 4, 4))); + // unequal range + assertTrue(visitor.rangeEquals(new Range(8, 0, 3))); + assertFalse(visitor.rangeEquals(new Range(4, 5, 3))); + + // checking the same ranges when nulls are set + + vector1.setNull(1); + vector2.setNull(1); + + vector1.setNull(3); + vector2.setNull(3); + + vector1.setNull(5); + vector2.setNull(5); + + vector1.setNull(9); + vector2.setNull(9); + + // inclusion of long string in the middle + assertTrue(visitor.rangeEquals(new Range(1, 1, 3))); + assertFalse(visitor.rangeEquals(new Range(0, 1, 4))); + // inclusion of long string at the start + assertTrue(visitor.rangeEquals(new Range(2, 2, 4))); + assertFalse(visitor.rangeEquals(new Range(2, 5, 4))); + // inclusion of long string at the end + assertTrue(visitor.rangeEquals(new Range(4, 4, 4))); + // unequal range + assertTrue(visitor.rangeEquals(new Range(8, 0, 3))); + assertFalse(visitor.rangeEquals(new Range(4, 5, 3))); + } + } + @Test public void 
testListVectorWithDifferentChild() { try (final ListVector vector1 = ListVector.empty("list", allocator); @@ -476,7 +529,7 @@ public void testDenseUnionVectorEquals() { } } - @Ignore + @Disabled @Test public void testEqualsWithOutTypeCheck() { try (final IntVector intVector = new IntVector("int", allocator); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/compare/TestTypeEqualsVisitor.java b/java/vector/src/test/java/org/apache/arrow/vector/compare/TestTypeEqualsVisitor.java index 62fa0336ea925..736b0f1b1aeac 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/compare/TestTypeEqualsVisitor.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/compare/TestTypeEqualsVisitor.java @@ -20,6 +20,7 @@ import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; +import java.nio.charset.StandardCharsets; import java.util.HashMap; import java.util.Map; @@ -30,6 +31,8 @@ import org.apache.arrow.vector.Float8Vector; import org.apache.arrow.vector.IntVector; import org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.ViewVarBinaryVector; +import org.apache.arrow.vector.ViewVarCharVector; import org.apache.arrow.vector.complex.DenseUnionVector; import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.complex.StructVector; @@ -176,4 +179,42 @@ public void testDenseUnionTypeEquals() { assertFalse(typeVisitor.equals(vector1)); } } + + @Test + public void testStringViewTypeEquals() { + try (final ViewVarCharVector varchar1 = new ViewVarCharVector("varchar1", allocator); + final ViewVarCharVector varchar2 = new ViewVarCharVector("varchar2", allocator); + final ViewVarBinaryVector binary = new ViewVarBinaryVector("binary", allocator)) { + final int valueCount = 2; + final byte[] str0 = "apache".getBytes(StandardCharsets.UTF_8); + final byte[] str1 = "arrow".getBytes(StandardCharsets.UTF_8); + + // add elements for varchar1 + varchar1.allocateNew(48, valueCount); + varchar1.set(0, str0); + varchar1.set(1, str1); + varchar1.setValueCount(valueCount); + + // add elements for varchar2 in a different order + varchar2.allocateNew(48, valueCount); + varchar2.set(0, str1); + varchar2.set(1, str0); + varchar2.setValueCount(valueCount); + + // add elements for binary + binary.allocateNew(48, valueCount); + binary.set(0, str0); + binary.set(1, str1); + binary.setValueCount(valueCount); + + // compare while ignoring the name check + TypeEqualsVisitor visitor = new TypeEqualsVisitor(varchar1, /* check name */ false, /* check meta data */ true); + assertTrue(visitor.equals(varchar2)); + assertFalse(visitor.equals(binary)); + + // if we check names, vectors with different names should not be equal + visitor = new TypeEqualsVisitor(varchar1, /* check name */ true, /* check meta data */ true); + assertFalse(visitor.equals(varchar2)); + } + } } diff --git a/js/package.json b/js/package.json index fee6b342dbd13..7ed0daddfada0 100644 --- a/js/package.json +++ b/js/package.json @@ -67,13 +67,13 @@ "@rollup/plugin-alias": "5.1.0", "@rollup/plugin-node-resolve": "15.2.3", "@rollup/stream": "3.0.1", - "@swc/core": "1.4.14", + "@swc/core": "1.4.17", "@types/benchmark": "2.1.5", "@types/glob": "8.1.0", "@types/jest": "29.5.12", "@types/multistream": "4.1.3", - "@typescript-eslint/eslint-plugin": "7.7.0", - "@typescript-eslint/parser": "7.7.0", + "@typescript-eslint/eslint-plugin": "7.8.0", + "@typescript-eslint/parser": "7.8.0", "async-done": "2.0.0", "benny": "3.7.1", "cross-env": "7.0.3", @@ -82,7 +82,7 @@ "esbuild": "0.20.2", "esbuild-plugin-alias": "0.2.1",
"eslint": "8.57.0", - "eslint-plugin-jest": "27.9.0", + "eslint-plugin-jest": "28.4.0", "eslint-plugin-unicorn": "52.0.0", "esm": "https://github.com/jsg2021/esm/releases/download/v3.x.x-pr883/esm-3.x.x-pr883.tgz", "gulp": "4.0.2", @@ -99,11 +99,11 @@ "ix": "5.0.0", "jest": "29.7.0", "jest-silent-reporter": "0.5.0", - "memfs": "4.8.2", + "memfs": "4.9.2", "mkdirp": "3.0.1", "multistream": "4.1.0", "regenerator-runtime": "0.14.1", - "rollup": "4.14.3", + "rollup": "4.17.2", "rxjs": "7.8.1", "ts-jest": "29.1.2", "ts-node": "10.9.2", diff --git a/js/yarn.lock b/js/yarn.lock index b74e4543d9d4e..eb7ed33520f0a 100644 --- a/js/yarn.lock +++ b/js/yarn.lock @@ -936,6 +936,26 @@ "@jridgewell/resolve-uri" "^3.1.0" "@jridgewell/sourcemap-codec" "^1.4.14" +"@jsonjoy.com/base64@^1.1.1": + version "1.1.1" + resolved "https://registry.yarnpkg.com/@jsonjoy.com/base64/-/base64-1.1.1.tgz#a717fd8840f7bad49c7fe66cc65db8bcfc4c4dc5" + integrity sha512-LnFjVChaGY8cZVMwAIMjvA1XwQjZ/zIXHyh28IyJkyNkzof4Dkm1+KN9UIm3lHhREH4vs7XwZ0NpkZKnwOtEfg== + +"@jsonjoy.com/json-pack@^1.0.3": + version "1.0.3" + resolved "https://registry.yarnpkg.com/@jsonjoy.com/json-pack/-/json-pack-1.0.3.tgz#a68cbe3ccfd85d26cd763e4175fe90c9ee383d33" + integrity sha512-Q0SPAdmK6s5Fe3e1kcNvwNyk6e2+CxM8XZdGbf4abZG7nUO05KSie3/iX29loTBuY+75uVP6RixDSPVpotfzmQ== + dependencies: + "@jsonjoy.com/base64" "^1.1.1" + "@jsonjoy.com/util" "^1.1.2" + hyperdyperid "^1.2.0" + thingies "^1.20.0" + +"@jsonjoy.com/util@^1.1.2": + version "1.1.2" + resolved "https://registry.yarnpkg.com/@jsonjoy.com/util/-/util-1.1.2.tgz#5072c27ecdb16d1ed7a2d125a1d0ed8aba01d652" + integrity sha512-HOGa9wtE6LEz2I5mMQ2pMSjth85PmD71kPbsecs02nEUq3/Kw0wRK3gmZn5BCEB8mFLXByqPxjHgApoMwIPMKQ== + "@nodelib/fs.scandir@2.1.5": version "2.1.5" resolved "https://registry.yarnpkg.com/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz#7619c2eb21b25483f6d167548b4cfd5a7488c3d5" @@ -1000,85 +1020,85 @@ estree-walker "^2.0.2" picomatch "^2.3.1" -"@rollup/rollup-android-arm-eabi@4.14.3": - version "4.14.3" - resolved "https://registry.yarnpkg.com/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.14.3.tgz#bddf05c3387d02fac04b6b86b3a779337edfed75" - integrity sha512-X9alQ3XM6I9IlSlmC8ddAvMSyG1WuHk5oUnXGw+yUBs3BFoTizmG1La/Gr8fVJvDWAq+zlYTZ9DBgrlKRVY06g== - -"@rollup/rollup-android-arm64@4.14.3": - version "4.14.3" - resolved "https://registry.yarnpkg.com/@rollup/rollup-android-arm64/-/rollup-android-arm64-4.14.3.tgz#b26bd09de58704c0a45e3375b76796f6eda825e4" - integrity sha512-eQK5JIi+POhFpzk+LnjKIy4Ks+pwJ+NXmPxOCSvOKSNRPONzKuUvWE+P9JxGZVxrtzm6BAYMaL50FFuPe0oWMQ== - -"@rollup/rollup-darwin-arm64@4.14.3": - version "4.14.3" - resolved "https://registry.yarnpkg.com/@rollup/rollup-darwin-arm64/-/rollup-darwin-arm64-4.14.3.tgz#c5f3fd1aa285b6d33dda6e3f3ca395f8c37fd5ca" - integrity sha512-Od4vE6f6CTT53yM1jgcLqNfItTsLt5zE46fdPaEmeFHvPs5SjZYlLpHrSiHEKR1+HdRfxuzXHjDOIxQyC3ptBA== - -"@rollup/rollup-darwin-x64@4.14.3": - version "4.14.3" - resolved "https://registry.yarnpkg.com/@rollup/rollup-darwin-x64/-/rollup-darwin-x64-4.14.3.tgz#8e4673734d7dc9d68f6d48e81246055cda0e840f" - integrity sha512-0IMAO21axJeNIrvS9lSe/PGthc8ZUS+zC53O0VhF5gMxfmcKAP4ESkKOCwEi6u2asUrt4mQv2rjY8QseIEb1aw== - -"@rollup/rollup-linux-arm-gnueabihf@4.14.3": - version "4.14.3" - resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-arm-gnueabihf/-/rollup-linux-arm-gnueabihf-4.14.3.tgz#53ed38eb13b58ababdb55a7f66f0538a7f85dcba" - integrity 
sha512-ge2DC7tHRHa3caVEoSbPRJpq7azhG+xYsd6u2MEnJ6XzPSzQsTKyXvh6iWjXRf7Rt9ykIUWHtl0Uz3T6yXPpKw== - -"@rollup/rollup-linux-arm-musleabihf@4.14.3": - version "4.14.3" - resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-arm-musleabihf/-/rollup-linux-arm-musleabihf-4.14.3.tgz#0706ee38330e267a5c9326956820f009cfb21fcd" - integrity sha512-ljcuiDI4V3ySuc7eSk4lQ9wU8J8r8KrOUvB2U+TtK0TiW6OFDmJ+DdIjjwZHIw9CNxzbmXY39wwpzYuFDwNXuw== - -"@rollup/rollup-linux-arm64-gnu@4.14.3": - version "4.14.3" - resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-arm64-gnu/-/rollup-linux-arm64-gnu-4.14.3.tgz#426fce7b8b242ac5abd48a10a5020f5a468c6cb4" - integrity sha512-Eci2us9VTHm1eSyn5/eEpaC7eP/mp5n46gTRB3Aar3BgSvDQGJZuicyq6TsH4HngNBgVqC5sDYxOzTExSU+NjA== - -"@rollup/rollup-linux-arm64-musl@4.14.3": - version "4.14.3" - resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-arm64-musl/-/rollup-linux-arm64-musl-4.14.3.tgz#65bf944530d759b50d7ffd00dfbdf4125a43406f" - integrity sha512-UrBoMLCq4E92/LCqlh+blpqMz5h1tJttPIniwUgOFJyjWI1qrtrDhhpHPuFxULlUmjFHfloWdixtDhSxJt5iKw== - -"@rollup/rollup-linux-powerpc64le-gnu@4.14.3": - version "4.14.3" - resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-powerpc64le-gnu/-/rollup-linux-powerpc64le-gnu-4.14.3.tgz#494ba3b31095e9a45df9c3f646d21400fb631a95" - integrity sha512-5aRjvsS8q1nWN8AoRfrq5+9IflC3P1leMoy4r2WjXyFqf3qcqsxRCfxtZIV58tCxd+Yv7WELPcO9mY9aeQyAmw== - -"@rollup/rollup-linux-riscv64-gnu@4.14.3": - version "4.14.3" - resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-riscv64-gnu/-/rollup-linux-riscv64-gnu-4.14.3.tgz#8b88ed0a40724cce04aa15374ebe5ba4092d679f" - integrity sha512-sk/Qh1j2/RJSX7FhEpJn8n0ndxy/uf0kI/9Zc4b1ELhqULVdTfN6HL31CDaTChiBAOgLcsJ1sgVZjWv8XNEsAQ== - -"@rollup/rollup-linux-s390x-gnu@4.14.3": - version "4.14.3" - resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-s390x-gnu/-/rollup-linux-s390x-gnu-4.14.3.tgz#09c9e5ec57a0f6ec3551272c860bb9a04b96d70f" - integrity sha512-jOO/PEaDitOmY9TgkxF/TQIjXySQe5KVYB57H/8LRP/ux0ZoO8cSHCX17asMSv3ruwslXW/TLBcxyaUzGRHcqg== - -"@rollup/rollup-linux-x64-gnu@4.14.3": - version "4.14.3" - resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-x64-gnu/-/rollup-linux-x64-gnu-4.14.3.tgz#197f27fd481ad9c861021d5cbbf21793922a631c" - integrity sha512-8ybV4Xjy59xLMyWo3GCfEGqtKV5M5gCSrZlxkPGvEPCGDLNla7v48S662HSGwRd6/2cSneMQWiv+QzcttLrrOA== - -"@rollup/rollup-linux-x64-musl@4.14.3": - version "4.14.3" - resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-x64-musl/-/rollup-linux-x64-musl-4.14.3.tgz#5cc0522f4942f2df625e9bfb6fb02c6580ffbce6" - integrity sha512-s+xf1I46trOY10OqAtZ5Rm6lzHre/UiLA1J2uOhCFXWkbZrJRkYBPO6FhvGfHmdtQ3Bx793MNa7LvoWFAm93bg== - -"@rollup/rollup-win32-arm64-msvc@4.14.3": - version "4.14.3" - resolved "https://registry.yarnpkg.com/@rollup/rollup-win32-arm64-msvc/-/rollup-win32-arm64-msvc-4.14.3.tgz#a648122389d23a7543b261fba082e65fefefe4f6" - integrity sha512-+4h2WrGOYsOumDQ5S2sYNyhVfrue+9tc9XcLWLh+Kw3UOxAvrfOrSMFon60KspcDdytkNDh7K2Vs6eMaYImAZg== - -"@rollup/rollup-win32-ia32-msvc@4.14.3": - version "4.14.3" - resolved "https://registry.yarnpkg.com/@rollup/rollup-win32-ia32-msvc/-/rollup-win32-ia32-msvc-4.14.3.tgz#34727b5c7953c35fc6e1ae4f770ad3a2025f8e03" - integrity sha512-T1l7y/bCeL/kUwh9OD4PQT4aM7Bq43vX05htPJJ46RTI4r5KNt6qJRzAfNfM+OYMNEVBWQzR2Gyk+FXLZfogGw== - -"@rollup/rollup-win32-x64-msvc@4.14.3": - version "4.14.3" - resolved "https://registry.yarnpkg.com/@rollup/rollup-win32-x64-msvc/-/rollup-win32-x64-msvc-4.14.3.tgz#5b2fb4d8cd44c05deef8a7b0e6deb9ccb8939d18" - 
integrity sha512-/BypzV0H1y1HzgYpxqRaXGBRqfodgoBBCcsrujT6QRcakDQdfU+Lq9PENPh5jB4I44YWq+0C2eHsHya+nZY1sA== +"@rollup/rollup-android-arm-eabi@4.17.2": + version "4.17.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.17.2.tgz#1a32112822660ee104c5dd3a7c595e26100d4c2d" + integrity sha512-NM0jFxY8bB8QLkoKxIQeObCaDlJKewVlIEkuyYKm5An1tdVZ966w2+MPQ2l8LBZLjR+SgyV+nRkTIunzOYBMLQ== + +"@rollup/rollup-android-arm64@4.17.2": + version "4.17.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-android-arm64/-/rollup-android-arm64-4.17.2.tgz#5aeef206d65ff4db423f3a93f71af91b28662c5b" + integrity sha512-yeX/Usk7daNIVwkq2uGoq2BYJKZY1JfyLTaHO/jaiSwi/lsf8fTFoQW/n6IdAsx5tx+iotu2zCJwz8MxI6D/Bw== + +"@rollup/rollup-darwin-arm64@4.17.2": + version "4.17.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-darwin-arm64/-/rollup-darwin-arm64-4.17.2.tgz#6b66aaf003c70454c292cd5f0236ebdc6ffbdf1a" + integrity sha512-kcMLpE6uCwls023+kknm71ug7MZOrtXo+y5p/tsg6jltpDtgQY1Eq5sGfHcQfb+lfuKwhBmEURDga9N0ol4YPw== + +"@rollup/rollup-darwin-x64@4.17.2": + version "4.17.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-darwin-x64/-/rollup-darwin-x64-4.17.2.tgz#f64fc51ed12b19f883131ccbcea59fc68cbd6c0b" + integrity sha512-AtKwD0VEx0zWkL0ZjixEkp5tbNLzX+FCqGG1SvOu993HnSz4qDI6S4kGzubrEJAljpVkhRSlg5bzpV//E6ysTQ== + +"@rollup/rollup-linux-arm-gnueabihf@4.17.2": + version "4.17.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-arm-gnueabihf/-/rollup-linux-arm-gnueabihf-4.17.2.tgz#1a7641111be67c10111f7122d1e375d1226cbf14" + integrity sha512-3reX2fUHqN7sffBNqmEyMQVj/CKhIHZd4y631duy0hZqI8Qoqf6lTtmAKvJFYa6bhU95B1D0WgzHkmTg33In0A== + +"@rollup/rollup-linux-arm-musleabihf@4.17.2": + version "4.17.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-arm-musleabihf/-/rollup-linux-arm-musleabihf-4.17.2.tgz#c93fd632923e0fee25aacd2ae414288d0b7455bb" + integrity sha512-uSqpsp91mheRgw96xtyAGP9FW5ChctTFEoXP0r5FAzj/3ZRv3Uxjtc7taRQSaQM/q85KEKjKsZuiZM3GyUivRg== + +"@rollup/rollup-linux-arm64-gnu@4.17.2": + version "4.17.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-arm64-gnu/-/rollup-linux-arm64-gnu-4.17.2.tgz#fa531425dd21d058a630947527b4612d9d0b4a4a" + integrity sha512-EMMPHkiCRtE8Wdk3Qhtciq6BndLtstqZIroHiiGzB3C5LDJmIZcSzVtLRbwuXuUft1Cnv+9fxuDtDxz3k3EW2A== + +"@rollup/rollup-linux-arm64-musl@4.17.2": + version "4.17.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-arm64-musl/-/rollup-linux-arm64-musl-4.17.2.tgz#8acc16f095ceea5854caf7b07e73f7d1802ac5af" + integrity sha512-NMPylUUZ1i0z/xJUIx6VUhISZDRT+uTWpBcjdv0/zkp7b/bQDF+NfnfdzuTiB1G6HTodgoFa93hp0O1xl+/UbA== + +"@rollup/rollup-linux-powerpc64le-gnu@4.17.2": + version "4.17.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-powerpc64le-gnu/-/rollup-linux-powerpc64le-gnu-4.17.2.tgz#94e69a8499b5cf368911b83a44bb230782aeb571" + integrity sha512-T19My13y8uYXPw/L/k0JYaX1fJKFT/PWdXiHr8mTbXWxjVF1t+8Xl31DgBBvEKclw+1b00Chg0hxE2O7bTG7GQ== + +"@rollup/rollup-linux-riscv64-gnu@4.17.2": + version "4.17.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-riscv64-gnu/-/rollup-linux-riscv64-gnu-4.17.2.tgz#7ef1c781c7e59e85a6ce261cc95d7f1e0b56db0f" + integrity sha512-BOaNfthf3X3fOWAB+IJ9kxTgPmMqPPH5f5k2DcCsRrBIbWnaJCgX2ll77dV1TdSy9SaXTR5iDXRL8n7AnoP5cg== + +"@rollup/rollup-linux-s390x-gnu@4.17.2": + version "4.17.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-s390x-gnu/-/rollup-linux-s390x-gnu-4.17.2.tgz#f15775841c3232fca9b78cd25a7a0512c694b354" 
+ integrity sha512-W0UP/x7bnn3xN2eYMql2T/+wpASLE5SjObXILTMPUBDB/Fg/FxC+gX4nvCfPBCbNhz51C+HcqQp2qQ4u25ok6g== + +"@rollup/rollup-linux-x64-gnu@4.17.2": + version "4.17.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-x64-gnu/-/rollup-linux-x64-gnu-4.17.2.tgz#b521d271798d037ad70c9f85dd97d25f8a52e811" + integrity sha512-Hy7pLwByUOuyaFC6mAr7m+oMC+V7qyifzs/nW2OJfC8H4hbCzOX07Ov0VFk/zP3kBsELWNFi7rJtgbKYsav9QQ== + +"@rollup/rollup-linux-x64-musl@4.17.2": + version "4.17.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-linux-x64-musl/-/rollup-linux-x64-musl-4.17.2.tgz#9254019cc4baac35800991315d133cc9fd1bf385" + integrity sha512-h1+yTWeYbRdAyJ/jMiVw0l6fOOm/0D1vNLui9iPuqgRGnXA0u21gAqOyB5iHjlM9MMfNOm9RHCQ7zLIzT0x11Q== + +"@rollup/rollup-win32-arm64-msvc@4.17.2": + version "4.17.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-win32-arm64-msvc/-/rollup-win32-arm64-msvc-4.17.2.tgz#27f65a89f6f52ee9426ec11e3571038e4671790f" + integrity sha512-tmdtXMfKAjy5+IQsVtDiCfqbynAQE/TQRpWdVataHmhMb9DCoJxp9vLcCBjEQWMiUYxO1QprH/HbY9ragCEFLA== + +"@rollup/rollup-win32-ia32-msvc@4.17.2": + version "4.17.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-win32-ia32-msvc/-/rollup-win32-ia32-msvc-4.17.2.tgz#a2fbf8246ed0bb014f078ca34ae6b377a90cb411" + integrity sha512-7II/QCSTAHuE5vdZaQEwJq2ZACkBpQDOmQsE6D6XUbnBHW8IAhm4eTufL6msLJorzrHDFv3CF8oCA/hSIRuZeQ== + +"@rollup/rollup-win32-x64-msvc@4.17.2": + version "4.17.2" + resolved "https://registry.yarnpkg.com/@rollup/rollup-win32-x64-msvc/-/rollup-win32-x64-msvc-4.17.2.tgz#5a2d08b81e8064b34242d5cc9973ef8dd1e60503" + integrity sha512-TGGO7v7qOq4CYmSBVEYpI1Y5xDuCEnbVC5Vth8mOsW0gDSzxNrVERPc790IGHsrT2dQSimgMr9Ub3Y1Jci5/8w== "@rollup/stream@3.0.1": version "3.0.1" @@ -1104,74 +1124,74 @@ dependencies: "@sinonjs/commons" "^3.0.0" -"@swc/core-darwin-arm64@1.4.14": - version "1.4.14" - resolved "https://registry.yarnpkg.com/@swc/core-darwin-arm64/-/core-darwin-arm64-1.4.14.tgz#de570252c3f155f55536f0d6bb8bafaec2e99616" - integrity sha512-8iPfLhYNspBl836YYsfv6ErXwDUqJ7IMieddV3Ey/t/97JAEAdNDUdtTKDtbyP0j/Ebyqyn+fKcqwSq7rAof0g== - -"@swc/core-darwin-x64@1.4.14": - version "1.4.14" - resolved "https://registry.yarnpkg.com/@swc/core-darwin-x64/-/core-darwin-x64-1.4.14.tgz#4eefbe129e416f4c400656742ab7f30e01aff02e" - integrity sha512-9CqSj8uRZ92cnlgAlVaWMaJJBdxtNvCzJxaGj5KuIseeG6Q0l1g+qk8JcU7h9dAsH9saHTNwNFBVGKQo0W0ujg== - -"@swc/core-linux-arm-gnueabihf@1.4.14": - version "1.4.14" - resolved "https://registry.yarnpkg.com/@swc/core-linux-arm-gnueabihf/-/core-linux-arm-gnueabihf-1.4.14.tgz#bea4b94c32bb25de2816126dac299655529ba7f3" - integrity sha512-mfd5JArPITTzMjcezH4DwMw+BdjBV1y25Khp8itEIpdih9ei+fvxOOrDYTN08b466NuE2dF2XuhKtRLA7fXArQ== - -"@swc/core-linux-arm64-gnu@1.4.14": - version "1.4.14" - resolved "https://registry.yarnpkg.com/@swc/core-linux-arm64-gnu/-/core-linux-arm64-gnu-1.4.14.tgz#52063214f4a14d6a0c3c6059ed9e7ba1062f6b46" - integrity sha512-3Lqlhlmy8MVRS9xTShMaPAp0oyUt0KFhDs4ixJsjdxKecE0NJSV/MInuDmrkij1C8/RQ2wySRlV9np5jK86oWw== - -"@swc/core-linux-arm64-musl@1.4.14": - version "1.4.14" - resolved "https://registry.yarnpkg.com/@swc/core-linux-arm64-musl/-/core-linux-arm64-musl-1.4.14.tgz#7e7deea7b1b3d0c9944cc8e9ba948fcc785158ea" - integrity sha512-n0YoCa64TUcJrbcXIHIHDWQjdUPdaXeMHNEu7yyBtOpm01oMGTKP3frsUXIABLBmAVWtKvqit4/W1KVKn5gJzg== - -"@swc/core-linux-x64-gnu@1.4.14": - version "1.4.14" - resolved "https://registry.yarnpkg.com/@swc/core-linux-x64-gnu/-/core-linux-x64-gnu-1.4.14.tgz#301133ea3ee347568886f2489837e991e96d44db" - integrity 
sha512-CGmlwLWbfG1dB4jZBJnp2IWlK5xBMNLjN7AR5kKA3sEpionoccEnChOEvfux1UdVJQjLRKuHNV9yGyqGBTpxfQ== - -"@swc/core-linux-x64-musl@1.4.14": - version "1.4.14" - resolved "https://registry.yarnpkg.com/@swc/core-linux-x64-musl/-/core-linux-x64-musl-1.4.14.tgz#86b8e987a814209cd0dd0f21cbc1134305dfffd5" - integrity sha512-xq4npk8YKYmNwmr8fbvF2KP3kUVdZYfXZMQnW425gP3/sn+yFQO8Nd0bGH40vOVQn41kEesSe0Z5O/JDor2TgQ== - -"@swc/core-win32-arm64-msvc@1.4.14": - version "1.4.14" - resolved "https://registry.yarnpkg.com/@swc/core-win32-arm64-msvc/-/core-win32-arm64-msvc-1.4.14.tgz#eb56b8977e3542665929c3963bd7dc18fe5b2556" - integrity sha512-imq0X+gU9uUe6FqzOQot5gpKoaC00aCUiN58NOzwp0QXEupn8CDuZpdBN93HiZswfLruu5jA1tsc15x6v9p0Yg== - -"@swc/core-win32-ia32-msvc@1.4.14": - version "1.4.14" - resolved "https://registry.yarnpkg.com/@swc/core-win32-ia32-msvc/-/core-win32-ia32-msvc-1.4.14.tgz#72e119038b9d8743b13bb933b8e192acd9f501f9" - integrity sha512-cH6QpXMw5D3t+lpx6SkErHrxN0yFzmQ0lgNAJxoDRiaAdDbqA6Col8UqUJwUS++Ul6aCWgNhCdiEYehPaoyDPA== - -"@swc/core-win32-x64-msvc@1.4.14": - version "1.4.14" - resolved "https://registry.yarnpkg.com/@swc/core-win32-x64-msvc/-/core-win32-x64-msvc-1.4.14.tgz#f5a3b1a241708b0628a07458e5bedbf67a1b9595" - integrity sha512-FmZ4Tby4wW65K/36BKzmuu7mlq7cW5XOxzvufaSNVvQ5PN4OodAlqPjToe029oma4Av+ykJiif64scMttyNAzg== - -"@swc/core@1.4.14": - version "1.4.14" - resolved "https://registry.yarnpkg.com/@swc/core/-/core-1.4.14.tgz#8bad316c0119f626bb1b181ba7a988ef9d14e9cc" - integrity sha512-tHXg6OxboUsqa/L7DpsCcFnxhLkqN/ht5pCwav1HnvfthbiNIJypr86rNx4cUnQDJepETviSqBTIjxa7pSpGDQ== +"@swc/core-darwin-arm64@1.4.17": + version "1.4.17" + resolved "https://registry.yarnpkg.com/@swc/core-darwin-arm64/-/core-darwin-arm64-1.4.17.tgz#e62fa7f247bdd1c0c50a3f99722da4dd098c7c67" + integrity sha512-HVl+W4LezoqHBAYg2JCqR+s9ife9yPfgWSj37iIawLWzOmuuJ7jVdIB7Ee2B75bEisSEKyxRlTl6Y1Oq3owBgw== + +"@swc/core-darwin-x64@1.4.17": + version "1.4.17" + resolved "https://registry.yarnpkg.com/@swc/core-darwin-x64/-/core-darwin-x64-1.4.17.tgz#1145cbb7575e317204ed3a7d0274bd26fe9ffab6" + integrity sha512-WYRO9Fdzq4S/he8zjW5I95G1zcvyd9yyD3Tgi4/ic84P5XDlSMpBDpBLbr/dCPjmSg7aUXxNQqKqGkl6dQxYlA== + +"@swc/core-linux-arm-gnueabihf@1.4.17": + version "1.4.17" + resolved "https://registry.yarnpkg.com/@swc/core-linux-arm-gnueabihf/-/core-linux-arm-gnueabihf-1.4.17.tgz#7145b3ada5cf9b748eaacbc9a7c7037ba0fb26bb" + integrity sha512-cgbvpWOvtMH0XFjvwppUCR+Y+nf6QPaGu6AQ5hqCP+5Lv2zO5PG0RfasC4zBIjF53xgwEaaWmGP5/361P30X8Q== + +"@swc/core-linux-arm64-gnu@1.4.17": + version "1.4.17" + resolved "https://registry.yarnpkg.com/@swc/core-linux-arm64-gnu/-/core-linux-arm64-gnu-1.4.17.tgz#5c0833ef132af17bd3cbdf2253f35b57c0cf62bb" + integrity sha512-l7zHgaIY24cF9dyQ/FOWbmZDsEj2a9gRFbmgx2u19e3FzOPuOnaopFj0fRYXXKCmtdx+anD750iBIYnTR+pq/Q== + +"@swc/core-linux-arm64-musl@1.4.17": + version "1.4.17" + resolved "https://registry.yarnpkg.com/@swc/core-linux-arm64-musl/-/core-linux-arm64-musl-1.4.17.tgz#5bfe81eb23c905f04b669a7d2b060a147a263483" + integrity sha512-qhH4gr9gAlVk8MBtzXbzTP3BJyqbAfUOATGkyUtohh85fPXQYuzVlbExix3FZXTwFHNidGHY8C+ocscI7uDaYw== + +"@swc/core-linux-x64-gnu@1.4.17": + version "1.4.17" + resolved "https://registry.yarnpkg.com/@swc/core-linux-x64-gnu/-/core-linux-x64-gnu-1.4.17.tgz#a0c19bc9635e86ebd1c7f8e9e026503d1a1bf83d" + integrity sha512-vRDFATL1oN5oZMImkwbgSHEkp8xG1ofEASBypze01W1Tqto8t+yo6gsp69wzCZBlxldsvPpvFZW55Jq0Rn+UnA== + +"@swc/core-linux-x64-musl@1.4.17": + version "1.4.17" + resolved 
"https://registry.yarnpkg.com/@swc/core-linux-x64-musl/-/core-linux-x64-musl-1.4.17.tgz#2179b9536235a3b02a46997ddb1c178dfadf1667" + integrity sha512-zQNPXAXn3nmPqv54JVEN8k2JMEcMTQ6veVuU0p5O+A7KscJq+AGle/7ZQXzpXSfUCXlLMX4wvd+rwfGhh3J4cw== + +"@swc/core-win32-arm64-msvc@1.4.17": + version "1.4.17" + resolved "https://registry.yarnpkg.com/@swc/core-win32-arm64-msvc/-/core-win32-arm64-msvc-1.4.17.tgz#3004a431c836c6b16b4660ea2425dde467a8ee36" + integrity sha512-z86n7EhOwyzxwm+DLE5NoLkxCTme2lq7QZlDjbQyfCxOt6isWz8rkW5QowTX8w9Rdmk34ncrjSLvnHOeLY17+w== + +"@swc/core-win32-ia32-msvc@1.4.17": + version "1.4.17" + resolved "https://registry.yarnpkg.com/@swc/core-win32-ia32-msvc/-/core-win32-ia32-msvc-1.4.17.tgz#59155485d5307fb2a267e5acb215e0f440b6f48f" + integrity sha512-JBwuSTJIgiJJX6wtr4wmXbfvOswHFj223AumUrK544QV69k60FJ9q2adPW9Csk+a8wm1hLxq4HKa2K334UHJ/g== + +"@swc/core-win32-x64-msvc@1.4.17": + version "1.4.17" + resolved "https://registry.yarnpkg.com/@swc/core-win32-x64-msvc/-/core-win32-x64-msvc-1.4.17.tgz#b98f25fc277fb0e319f25f9fd00a82023662716b" + integrity sha512-jFkOnGQamtVDBm3MF5Kq1lgW8vx4Rm1UvJWRUfg+0gx7Uc3Jp3QMFeMNw/rDNQYRDYPG3yunCC+2463ycd5+dg== + +"@swc/core@1.4.17": + version "1.4.17" + resolved "https://registry.yarnpkg.com/@swc/core/-/core-1.4.17.tgz#3ea4180fa5c54282b284006a6de1263ef1cf887f" + integrity sha512-tq+mdWvodMBNBBZbwFIMTVGYHe9N7zvEaycVVjfvAx20k1XozHbHhRv+9pEVFJjwRxLdXmtvFZd3QZHRAOpoNQ== dependencies: "@swc/counter" "^0.1.2" "@swc/types" "^0.1.5" optionalDependencies: - "@swc/core-darwin-arm64" "1.4.14" - "@swc/core-darwin-x64" "1.4.14" - "@swc/core-linux-arm-gnueabihf" "1.4.14" - "@swc/core-linux-arm64-gnu" "1.4.14" - "@swc/core-linux-arm64-musl" "1.4.14" - "@swc/core-linux-x64-gnu" "1.4.14" - "@swc/core-linux-x64-musl" "1.4.14" - "@swc/core-win32-arm64-msvc" "1.4.14" - "@swc/core-win32-ia32-msvc" "1.4.14" - "@swc/core-win32-x64-msvc" "1.4.14" + "@swc/core-darwin-arm64" "1.4.17" + "@swc/core-darwin-x64" "1.4.17" + "@swc/core-linux-arm-gnueabihf" "1.4.17" + "@swc/core-linux-arm64-gnu" "1.4.17" + "@swc/core-linux-arm64-musl" "1.4.17" + "@swc/core-linux-x64-gnu" "1.4.17" + "@swc/core-linux-x64-musl" "1.4.17" + "@swc/core-win32-arm64-msvc" "1.4.17" + "@swc/core-win32-ia32-msvc" "1.4.17" + "@swc/core-win32-x64-msvc" "1.4.17" "@swc/counter@^0.1.2", "@swc/counter@^0.1.3": version "0.1.3" @@ -1179,9 +1199,9 @@ integrity sha512-e2BR4lsJkkRlKZ/qCHPw9ZaSxc0MVUd7gtbtaB7aMvHeJVYe8sOB8DBZkP2DtISHGSku9sCK6T6cnY0CtXrOCQ== "@swc/helpers@^0.5.10": - version "0.5.10" - resolved "https://registry.yarnpkg.com/@swc/helpers/-/helpers-0.5.10.tgz#5720082d007197cd85743dd599198097126a3f6e" - integrity sha512-CU+RF9FySljn7HVSkkjiB84hWkvTaI3rtLvF433+jRSBL2hMu3zX5bGhHS8C80SM++h4xy8hBSnUHFQHmRXSBw== + version "0.5.11" + resolved "https://registry.yarnpkg.com/@swc/helpers/-/helpers-0.5.11.tgz#5bab8c660a6e23c13b2d23fcd1ee44a2db1b0cb7" + integrity sha512-YNlnKRWF2sVojTpIyzwou9XoTNbzbzONwRhOoniEioF1AtaitTvVZblaQRrAzChWQ1bLYyYSWzM18y4WwgzJ+A== dependencies: tslib "^2.4.0" @@ -1328,7 +1348,7 @@ expect "^29.0.0" pretty-format "^29.0.0" -"@types/json-schema@*", "@types/json-schema@^7.0.15", "@types/json-schema@^7.0.8", "@types/json-schema@^7.0.9": +"@types/json-schema@*", "@types/json-schema@^7.0.12", "@types/json-schema@^7.0.15", "@types/json-schema@^7.0.8": version "7.0.15" resolved "https://registry.yarnpkg.com/@types/json-schema/-/json-schema-7.0.15.tgz#596a1747233694d50f6ad8a7869fcb6f56cf5841" integrity sha512-5+fP8P8MFNC+AyZCDxrB2pkZFPGzqQWUzpSeuuVLvm8VMcorNYavBqoFcxK8bQz4Qsbn4oUEEem4wDLfcysGHA== @@ 
-1350,10 +1370,10 @@ dependencies: "@types/node" "*" -"@types/node@*": - version "20.12.3" - resolved "https://registry.yarnpkg.com/@types/node/-/node-20.12.3.tgz#d6658c2c7776c1cad93534bb45428195ed840c65" - integrity sha512-sD+ia2ubTeWrOu+YMF+MTAB7E+O7qsMqAbMfW7DG3K1URwhZ5hN1pLlRVGbf4wDFzSfikL05M17EyorS86jShw== +"@types/node@*", "@types/node@^20.12.7": + version "20.12.8" + resolved "https://registry.yarnpkg.com/@types/node/-/node-20.12.8.tgz#35897bf2bfe3469847ab04634636de09552e8256" + integrity sha512-NU0rJLJnshZWdE/097cdCBbyW1h4hEg0xpovcoAQYHl8dnEyp/NAOiE45pvc+Bd1Dt+2r94v2eGFpQJ4R7g+2w== dependencies: undici-types "~5.26.4" @@ -1362,13 +1382,6 @@ resolved "https://registry.yarnpkg.com/@types/node/-/node-13.13.52.tgz#03c13be70b9031baaed79481c0c0cfb0045e53f7" integrity sha512-s3nugnZumCC//n4moGGe6tkNMyYEdaDBitVjwPxXmR5lnMG5dHePinH2EdxkG3Rh1ghFHHixAG4NJhpJW1rthQ== -"@types/node@^20.12.7": - version "20.12.7" - resolved "https://registry.yarnpkg.com/@types/node/-/node-20.12.7.tgz#04080362fa3dd6c5822061aa3124f5c152cff384" - integrity sha512-wq0cICSkRLVaf3UGLMGItu/PtdY7oaXaI/RVU+xliKVOtRna3PRY57ZDfztpDL0n11vfymMUnXv8QwYCO7L1wg== - dependencies: - undici-types "~5.26.4" - "@types/normalize-package-data@^2.4.0": version "2.4.4" resolved "https://registry.yarnpkg.com/@types/normalize-package-data/-/normalize-package-data-2.4.4.tgz#56e2cc26c397c038fab0e3a917a12d5c5909e901" @@ -1379,7 +1392,7 @@ resolved "https://registry.yarnpkg.com/@types/resolve/-/resolve-1.20.2.tgz#97d26e00cd4a0423b4af620abecf3e6f442b7975" integrity sha512-60BCwRFOZCQhDncwQdxxeOEEkbc5dIMccYLwbxsS4TUNeVECQ/pBJ0j09mrHOl/JJvpRPGwO9SvE4nR2Nb/a4Q== -"@types/semver@^7.3.12", "@types/semver@^7.5.8": +"@types/semver@^7.5.0", "@types/semver@^7.5.8": version "7.5.8" resolved "https://registry.yarnpkg.com/@types/semver/-/semver-7.5.8.tgz#8268a8c57a3e4abd25c165ecd36237db7948a55e" integrity sha512-I8EUhyrgfLrcTkzV3TSsGyl1tSuPrEDzr0yd5m90UgNxQkyDXULk3b6MlQqTCpZpNtWe1K0hzclnZkTcLBe2UQ== @@ -1416,16 +1429,16 @@ dependencies: "@types/yargs-parser" "*" -"@typescript-eslint/eslint-plugin@7.7.0": - version "7.7.0" - resolved "https://registry.yarnpkg.com/@typescript-eslint/eslint-plugin/-/eslint-plugin-7.7.0.tgz#bf34a02f221811505b8bf2f31060c8560c1bb0a3" - integrity sha512-GJWR0YnfrKnsRoluVO3PRb9r5aMZriiMMM/RHj5nnTrBy1/wIgk76XCtCKcnXGjpZQJQRFtGV9/0JJ6n30uwpQ== +"@typescript-eslint/eslint-plugin@7.8.0": + version "7.8.0" + resolved "https://registry.yarnpkg.com/@typescript-eslint/eslint-plugin/-/eslint-plugin-7.8.0.tgz#c78e309fe967cb4de05b85cdc876fb95f8e01b6f" + integrity sha512-gFTT+ezJmkwutUPmB0skOj3GZJtlEGnlssems4AjkVweUPGj7jRwwqg0Hhg7++kPGJqKtTYx+R05Ftww372aIg== dependencies: "@eslint-community/regexpp" "^4.10.0" - "@typescript-eslint/scope-manager" "7.7.0" - "@typescript-eslint/type-utils" "7.7.0" - "@typescript-eslint/utils" "7.7.0" - "@typescript-eslint/visitor-keys" "7.7.0" + "@typescript-eslint/scope-manager" "7.8.0" + "@typescript-eslint/type-utils" "7.8.0" + "@typescript-eslint/utils" "7.8.0" + "@typescript-eslint/visitor-keys" "7.8.0" debug "^4.3.4" graphemer "^1.4.0" ignore "^5.3.1" @@ -1433,73 +1446,74 @@ semver "^7.6.0" ts-api-utils "^1.3.0" -"@typescript-eslint/parser@7.7.0": - version "7.7.0" - resolved "https://registry.yarnpkg.com/@typescript-eslint/parser/-/parser-7.7.0.tgz#6b1b3ce76c5de002c43af8ae933613b0f2b4bcc6" - integrity sha512-fNcDm3wSwVM8QYL4HKVBggdIPAy9Q41vcvC/GtDobw3c4ndVT3K6cqudUmjHPw8EAp4ufax0o58/xvWaP2FmTg== +"@typescript-eslint/parser@7.8.0": + version "7.8.0" + resolved 
"https://registry.yarnpkg.com/@typescript-eslint/parser/-/parser-7.8.0.tgz#1e1db30c8ab832caffee5f37e677dbcb9357ddc8" + integrity sha512-KgKQly1pv0l4ltcftP59uQZCi4HUYswCLbTqVZEJu7uLX8CTLyswqMLqLN+2QFz4jCptqWVV4SB7vdxcH2+0kQ== dependencies: - "@typescript-eslint/scope-manager" "7.7.0" - "@typescript-eslint/types" "7.7.0" - "@typescript-eslint/typescript-estree" "7.7.0" - "@typescript-eslint/visitor-keys" "7.7.0" + "@typescript-eslint/scope-manager" "7.8.0" + "@typescript-eslint/types" "7.8.0" + "@typescript-eslint/typescript-estree" "7.8.0" + "@typescript-eslint/visitor-keys" "7.8.0" debug "^4.3.4" -"@typescript-eslint/scope-manager@5.62.0": - version "5.62.0" - resolved "https://registry.yarnpkg.com/@typescript-eslint/scope-manager/-/scope-manager-5.62.0.tgz#d9457ccc6a0b8d6b37d0eb252a23022478c5460c" - integrity sha512-VXuvVvZeQCQb5Zgf4HAxc04q5j+WrNAtNh9OwCsCgpKqESMTu3tF/jhZ3xG6T4NZwWl65Bg8KuS2uEvhSfLl0w== +"@typescript-eslint/scope-manager@6.21.0": + version "6.21.0" + resolved "https://registry.yarnpkg.com/@typescript-eslint/scope-manager/-/scope-manager-6.21.0.tgz#ea8a9bfc8f1504a6ac5d59a6df308d3a0630a2b1" + integrity sha512-OwLUIWZJry80O99zvqXVEioyniJMa+d2GrqpUTqi5/v5D5rOrppJVBPa0yKCblcigC0/aYAzxxqQ1B+DS2RYsg== dependencies: - "@typescript-eslint/types" "5.62.0" - "@typescript-eslint/visitor-keys" "5.62.0" + "@typescript-eslint/types" "6.21.0" + "@typescript-eslint/visitor-keys" "6.21.0" -"@typescript-eslint/scope-manager@7.7.0": - version "7.7.0" - resolved "https://registry.yarnpkg.com/@typescript-eslint/scope-manager/-/scope-manager-7.7.0.tgz#3f0db079b275bb8b0cb5be7613fb3130cfb5de77" - integrity sha512-/8INDn0YLInbe9Wt7dK4cXLDYp0fNHP5xKLHvZl3mOT5X17rK/YShXaiNmorl+/U4VKCVIjJnx4Ri5b0y+HClw== +"@typescript-eslint/scope-manager@7.8.0": + version "7.8.0" + resolved "https://registry.yarnpkg.com/@typescript-eslint/scope-manager/-/scope-manager-7.8.0.tgz#bb19096d11ec6b87fb6640d921df19b813e02047" + integrity sha512-viEmZ1LmwsGcnr85gIq+FCYI7nO90DVbE37/ll51hjv9aG+YZMb4WDE2fyWpUR4O/UrhGRpYXK/XajcGTk2B8g== dependencies: - "@typescript-eslint/types" "7.7.0" - "@typescript-eslint/visitor-keys" "7.7.0" + "@typescript-eslint/types" "7.8.0" + "@typescript-eslint/visitor-keys" "7.8.0" -"@typescript-eslint/type-utils@7.7.0": - version "7.7.0" - resolved "https://registry.yarnpkg.com/@typescript-eslint/type-utils/-/type-utils-7.7.0.tgz#36792ff4209a781b058de61631a48df17bdefbc5" - integrity sha512-bOp3ejoRYrhAlnT/bozNQi3nio9tIgv3U5C0mVDdZC7cpcQEDZXvq8inrHYghLVwuNABRqrMW5tzAv88Vy77Sg== +"@typescript-eslint/type-utils@7.8.0": + version "7.8.0" + resolved "https://registry.yarnpkg.com/@typescript-eslint/type-utils/-/type-utils-7.8.0.tgz#9de166f182a6e4d1c5da76e94880e91831e3e26f" + integrity sha512-H70R3AefQDQpz9mGv13Uhi121FNMh+WEaRqcXTX09YEDky21km4dV1ZXJIp8QjXc4ZaVkXVdohvWDzbnbHDS+A== dependencies: - "@typescript-eslint/typescript-estree" "7.7.0" - "@typescript-eslint/utils" "7.7.0" + "@typescript-eslint/typescript-estree" "7.8.0" + "@typescript-eslint/utils" "7.8.0" debug "^4.3.4" ts-api-utils "^1.3.0" -"@typescript-eslint/types@5.62.0": - version "5.62.0" - resolved "https://registry.yarnpkg.com/@typescript-eslint/types/-/types-5.62.0.tgz#258607e60effa309f067608931c3df6fed41fd2f" - integrity sha512-87NVngcbVXUahrRTqIK27gD2t5Cu1yuCXxbLcFtCzZGlfyVWWh8mLHkoxzjsB6DDNnvdL+fW8MiwPEJyGJQDgQ== +"@typescript-eslint/types@6.21.0": + version "6.21.0" + resolved "https://registry.yarnpkg.com/@typescript-eslint/types/-/types-6.21.0.tgz#205724c5123a8fef7ecd195075fa6e85bac3436d" + integrity 
sha512-1kFmZ1rOm5epu9NZEZm1kckCDGj5UJEf7P1kliH4LKu/RkwpsfqqGmY2OOcUs18lSlQBKLDYBOGxRVtrMN5lpg== -"@typescript-eslint/types@7.7.0": - version "7.7.0" - resolved "https://registry.yarnpkg.com/@typescript-eslint/types/-/types-7.7.0.tgz#23af4d24bf9ce15d8d301236e3e3014143604f27" - integrity sha512-G01YPZ1Bd2hn+KPpIbrAhEWOn5lQBrjxkzHkWvP6NucMXFtfXoevK82hzQdpfuQYuhkvFDeQYbzXCjR1z9Z03w== +"@typescript-eslint/types@7.8.0": + version "7.8.0" + resolved "https://registry.yarnpkg.com/@typescript-eslint/types/-/types-7.8.0.tgz#1fd2577b3ad883b769546e2d1ef379f929a7091d" + integrity sha512-wf0peJ+ZGlcH+2ZS23aJbOv+ztjeeP8uQ9GgwMJGVLx/Nj9CJt17GWgWWoSmoRVKAX2X+7fzEnAjxdvK2gqCLw== -"@typescript-eslint/typescript-estree@5.62.0": - version "5.62.0" - resolved "https://registry.yarnpkg.com/@typescript-eslint/typescript-estree/-/typescript-estree-5.62.0.tgz#7d17794b77fabcac615d6a48fb143330d962eb9b" - integrity sha512-CmcQ6uY7b9y694lKdRB8FEel7JbU/40iSAPomu++SjLMntB+2Leay2LO6i8VnJk58MtE9/nQSFIH6jpyRWyYzA== +"@typescript-eslint/typescript-estree@6.21.0": + version "6.21.0" + resolved "https://registry.yarnpkg.com/@typescript-eslint/typescript-estree/-/typescript-estree-6.21.0.tgz#c47ae7901db3b8bddc3ecd73daff2d0895688c46" + integrity sha512-6npJTkZcO+y2/kr+z0hc4HwNfrrP4kNYh57ek7yCNlrBjWQ1Y0OS7jiZTkgumrvkX5HkEKXFZkkdFNkaW2wmUQ== dependencies: - "@typescript-eslint/types" "5.62.0" - "@typescript-eslint/visitor-keys" "5.62.0" + "@typescript-eslint/types" "6.21.0" + "@typescript-eslint/visitor-keys" "6.21.0" debug "^4.3.4" globby "^11.1.0" is-glob "^4.0.3" - semver "^7.3.7" - tsutils "^3.21.0" + minimatch "9.0.3" + semver "^7.5.4" + ts-api-utils "^1.0.1" -"@typescript-eslint/typescript-estree@7.7.0": - version "7.7.0" - resolved "https://registry.yarnpkg.com/@typescript-eslint/typescript-estree/-/typescript-estree-7.7.0.tgz#b5dd6383b4c6a852d7b256a37af971e8982be97f" - integrity sha512-8p71HQPE6CbxIBy2kWHqM1KGrC07pk6RJn40n0DSc6bMOBBREZxSDJ+BmRzc8B5OdaMh1ty3mkuWRg4sCFiDQQ== +"@typescript-eslint/typescript-estree@7.8.0": + version "7.8.0" + resolved "https://registry.yarnpkg.com/@typescript-eslint/typescript-estree/-/typescript-estree-7.8.0.tgz#b028a9226860b66e623c1ee55cc2464b95d2987c" + integrity sha512-5pfUCOwK5yjPaJQNy44prjCwtr981dO8Qo9J9PwYXZ0MosgAbfEMB008dJ5sNo3+/BN6ytBPuSvXUg9SAqB0dg== dependencies: - "@typescript-eslint/types" "7.7.0" - "@typescript-eslint/visitor-keys" "7.7.0" + "@typescript-eslint/types" "7.8.0" + "@typescript-eslint/visitor-keys" "7.8.0" debug "^4.3.4" globby "^11.1.0" is-glob "^4.0.3" @@ -1507,47 +1521,46 @@ semver "^7.6.0" ts-api-utils "^1.3.0" -"@typescript-eslint/utils@7.7.0": - version "7.7.0" - resolved "https://registry.yarnpkg.com/@typescript-eslint/utils/-/utils-7.7.0.tgz#3d2b6606a60ac34f3c625facfb3b3ab7e126f58d" - integrity sha512-LKGAXMPQs8U/zMRFXDZOzmMKgFv3COlxUQ+2NMPhbqgVm6R1w+nU1i4836Pmxu9jZAuIeyySNrN/6Rc657ggig== +"@typescript-eslint/utils@7.8.0": + version "7.8.0" + resolved "https://registry.yarnpkg.com/@typescript-eslint/utils/-/utils-7.8.0.tgz#57a79f9c0c0740ead2f622e444cfaeeb9fd047cd" + integrity sha512-L0yFqOCflVqXxiZyXrDr80lnahQfSOfc9ELAAZ75sqicqp2i36kEZZGuUymHNFoYOqxRT05up760b4iGsl02nQ== dependencies: "@eslint-community/eslint-utils" "^4.4.0" "@types/json-schema" "^7.0.15" "@types/semver" "^7.5.8" - "@typescript-eslint/scope-manager" "7.7.0" - "@typescript-eslint/types" "7.7.0" - "@typescript-eslint/typescript-estree" "7.7.0" + "@typescript-eslint/scope-manager" "7.8.0" + "@typescript-eslint/types" "7.8.0" + "@typescript-eslint/typescript-estree" "7.8.0" semver "^7.6.0" 
-"@typescript-eslint/utils@^5.10.0": - version "5.62.0" - resolved "https://registry.yarnpkg.com/@typescript-eslint/utils/-/utils-5.62.0.tgz#141e809c71636e4a75daa39faed2fb5f4b10df86" - integrity sha512-n8oxjeb5aIbPFEtmQxQYOLI0i9n5ySBEY/ZEHHZqKQSFnxio1rv6dthascc9dLuwrL0RC5mPCxB7vnAVGAYWAQ== +"@typescript-eslint/utils@^6.0.0": + version "6.21.0" + resolved "https://registry.yarnpkg.com/@typescript-eslint/utils/-/utils-6.21.0.tgz#4714e7a6b39e773c1c8e97ec587f520840cd8134" + integrity sha512-NfWVaC8HP9T8cbKQxHcsJBY5YE1O33+jpMwN45qzWWaPDZgLIbo12toGMWnmhvCpd3sIxkpDw3Wv1B3dYrbDQQ== dependencies: - "@eslint-community/eslint-utils" "^4.2.0" - "@types/json-schema" "^7.0.9" - "@types/semver" "^7.3.12" - "@typescript-eslint/scope-manager" "5.62.0" - "@typescript-eslint/types" "5.62.0" - "@typescript-eslint/typescript-estree" "5.62.0" - eslint-scope "^5.1.1" - semver "^7.3.7" - -"@typescript-eslint/visitor-keys@5.62.0": - version "5.62.0" - resolved "https://registry.yarnpkg.com/@typescript-eslint/visitor-keys/-/visitor-keys-5.62.0.tgz#2174011917ce582875954ffe2f6912d5931e353e" - integrity sha512-07ny+LHRzQXepkGg6w0mFY41fVUNBrL2Roj/++7V1txKugfjm/Ci/qSND03r2RhlJhJYMcTn9AhhSSqQp0Ysyw== - dependencies: - "@typescript-eslint/types" "5.62.0" - eslint-visitor-keys "^3.3.0" + "@eslint-community/eslint-utils" "^4.4.0" + "@types/json-schema" "^7.0.12" + "@types/semver" "^7.5.0" + "@typescript-eslint/scope-manager" "6.21.0" + "@typescript-eslint/types" "6.21.0" + "@typescript-eslint/typescript-estree" "6.21.0" + semver "^7.5.4" + +"@typescript-eslint/visitor-keys@6.21.0": + version "6.21.0" + resolved "https://registry.yarnpkg.com/@typescript-eslint/visitor-keys/-/visitor-keys-6.21.0.tgz#87a99d077aa507e20e238b11d56cc26ade45fe47" + integrity sha512-JJtkDduxLi9bivAB+cYOVMtbkqdPOhZ+ZI5LC47MIRrDV4Yn2o+ZnW10Nkmr28xRpSpdJ6Sm42Hjf2+REYXm0A== + dependencies: + "@typescript-eslint/types" "6.21.0" + eslint-visitor-keys "^3.4.1" -"@typescript-eslint/visitor-keys@7.7.0": - version "7.7.0" - resolved "https://registry.yarnpkg.com/@typescript-eslint/visitor-keys/-/visitor-keys-7.7.0.tgz#950148cf1ac11562a2d903fdf7acf76714a2dc9e" - integrity sha512-h0WHOj8MhdhY8YWkzIF30R379y0NqyOHExI9N9KCzvmu05EgG4FumeYa3ccfKUSphyWkWQE1ybVrgz/Pbam6YA== +"@typescript-eslint/visitor-keys@7.8.0": + version "7.8.0" + resolved "https://registry.yarnpkg.com/@typescript-eslint/visitor-keys/-/visitor-keys-7.8.0.tgz#7285aab991da8bee411a42edbd5db760d22fdd91" + integrity sha512-q4/gibTNBQNA0lGyYQCmWRS5D15n8rXh4QjK3KV+MBPlTYHpfBUT3D3PaPR/HeNiI9W6R7FvlkcGhNyAoP+caA== dependencies: - "@typescript-eslint/types" "7.7.0" + "@typescript-eslint/types" "7.8.0" eslint-visitor-keys "^3.4.3" "@ungap/structured-clone@^1.2.0": @@ -3030,12 +3043,12 @@ escape-string-regexp@^4.0.0: resolved "https://registry.yarnpkg.com/escape-string-regexp/-/escape-string-regexp-4.0.0.tgz#14ba83a5d373e3d311e5afca29cf5bfad965bf34" integrity sha512-TtpcNJ3XAzx3Gq8sWRzJaVajRs0uVxA2YAkdb1jm2YkPz4G6egUFAyA3n5vtEIZefPk5Wa4UXbKuS5fKkJWdgA== -eslint-plugin-jest@27.9.0: - version "27.9.0" - resolved "https://registry.yarnpkg.com/eslint-plugin-jest/-/eslint-plugin-jest-27.9.0.tgz#7c98a33605e1d8b8442ace092b60e9919730000b" - integrity sha512-QIT7FH7fNmd9n4se7FFKHbsLKGQiw885Ds6Y/sxKgCZ6natwCsXdgPOADnYVxN2QrRweF0FZWbJ6S7Rsn7llug== +eslint-plugin-jest@28.4.0: + version "28.4.0" + resolved "https://registry.yarnpkg.com/eslint-plugin-jest/-/eslint-plugin-jest-28.4.0.tgz#213be88f799a35ca9d63ce1a30081bb32b8da765" + integrity 
sha512-ORVHiFPC8RQxHLyQJ37MxNilK9k+cPzjHz65T8gAbpYZunGutXvKqwfM3WXBCvFDF1QBeYJJu9LB/i5cuXBs+g== dependencies: - "@typescript-eslint/utils" "^5.10.0" + "@typescript-eslint/utils" "^6.0.0" eslint-plugin-unicorn@52.0.0: version "52.0.0" @@ -3059,7 +3072,7 @@ eslint-plugin-unicorn@52.0.0: semver "^7.5.4" strip-indent "^3.0.0" -eslint-scope@5.1.1, eslint-scope@^5.1.1: +eslint-scope@5.1.1: version "5.1.1" resolved "https://registry.yarnpkg.com/eslint-scope/-/eslint-scope-5.1.1.tgz#e786e59a66cb92b3f6c1fb0d508aab174848f48c" integrity sha512-2NxwbF/hZ0KpepYN0cNbo+FN6XoK7GaHlQhgx/hIZl6Va0bF45RQOOwhLIy8lQDbuCiadSLCBnH2CFYquit5bw== @@ -4018,6 +4031,11 @@ human-signals@^2.1.0: resolved "https://registry.yarnpkg.com/human-signals/-/human-signals-2.1.0.tgz#dc91fcba42e4d06e4abaed33b3e7a3c02f514ea0" integrity sha512-B4FFZ6q/T2jhhksgkbEW3HBvWIfDW85snkQgawt07S7J5QXTk6BkNV+0yAeZrM5QpMAdYlocGoljn0sJ/WQkFw== +hyperdyperid@^1.2.0: + version "1.2.0" + resolved "https://registry.yarnpkg.com/hyperdyperid/-/hyperdyperid-1.2.0.tgz#59668d323ada92228d2a869d3e474d5a33b69e6b" + integrity sha512-Y93lCzHYgGWdrJ66yIktxiaGULYc6oGiABxhcO5AufBeOyoIdZF7bIfLaOrbM0iGIOXQQgxxRrFEnb+Y6w1n4A== + ignore@^5.2.0, ignore@^5.2.4, ignore@^5.3.1: version "5.3.1" resolved "https://registry.yarnpkg.com/ignore/-/ignore-5.3.1.tgz#5073e554cd42c5b33b394375f538b8593e34d4ef" @@ -5165,11 +5183,14 @@ matchdep@^2.0.0: resolve "^1.4.0" stack-trace "0.0.10" -memfs@4.8.2: - version "4.8.2" - resolved "https://registry.yarnpkg.com/memfs/-/memfs-4.8.2.tgz#9bb7c3e43647348451082557f05fb170b7442949" - integrity sha512-j4WKth315edViMBGkHW6NTF0QBjsTrcRDmYNcGsPq+ozMEyCCCIlX2d2mJ5wuh6iHvJ3FevUrr48v58YRqVdYg== +memfs@4.9.2: + version "4.9.2" + resolved "https://registry.yarnpkg.com/memfs/-/memfs-4.9.2.tgz#42e7b48207268dad8c9c48ea5d4952c5d3840433" + integrity sha512-f16coDZlTG1jskq3mxarwB+fGRrd0uXWt+o1WIhRfOwbXQZqUDsTVxQBFK9JjRQHblg8eAG2JSbprDXKjc7ijQ== dependencies: + "@jsonjoy.com/json-pack" "^1.0.3" + "@jsonjoy.com/util" "^1.1.2" + sonic-forest "^1.0.0" tslib "^2.0.0" memoizee@0.4.X: @@ -5271,6 +5292,13 @@ min-indent@^1.0.0, min-indent@^1.0.1: resolved "https://registry.yarnpkg.com/min-indent/-/min-indent-1.0.1.tgz#a63f681673b30571fbe8bc25686ae746eefa9869" integrity sha512-I9jwMn07Sy/IwOj3zVkVik2JTvgpaykDZEigL6Rx6N9LbMywwUSMtxET+7lVoDLLd3O3IXwJwvuuns8UB/HeAg== +minimatch@9.0.3: + version "9.0.3" + resolved "https://registry.yarnpkg.com/minimatch/-/minimatch-9.0.3.tgz#a6e00c3de44c3a542bfaae70abfc22420a6da825" + integrity sha512-RHiac9mvaRw0x3AYRgDC1CxAP7HTcNrrECeA8YYJeWnpo+2Q5CegtZjaotWTWxDG3UeGA1coE05iH1mPjT/2mg== + dependencies: + brace-expansion "^2.0.1" + minimatch@^3.0.4, minimatch@^3.0.5, minimatch@^3.1.1, minimatch@^3.1.2: version "3.1.2" resolved "https://registry.yarnpkg.com/minimatch/-/minimatch-3.1.2.tgz#19cd194bfd3e428f049a70817c038d89ab4be35b" @@ -6163,29 +6191,29 @@ rimraf@^3.0.2: dependencies: glob "^7.1.3" -rollup@4.14.3: - version "4.14.3" - resolved "https://registry.yarnpkg.com/rollup/-/rollup-4.14.3.tgz#bcbb7784b35826d3164346fa6d5aac95190d8ba9" - integrity sha512-ag5tTQKYsj1bhrFC9+OEWqb5O6VYgtQDO9hPDBMmIbePwhfSr+ExlcU741t8Dhw5DkPCQf6noz0jb36D6W9/hw== +rollup@4.17.2: + version "4.17.2" + resolved "https://registry.yarnpkg.com/rollup/-/rollup-4.17.2.tgz#26d1785d0144122277fdb20ab3a24729ae68301f" + integrity sha512-/9ClTJPByC0U4zNLowV1tMBe8yMEAxewtR3cUNX5BoEpGH3dQEWpJLr6CLp0fPdYRF/fzVOgvDb1zXuakwF5kQ== dependencies: "@types/estree" "1.0.5" optionalDependencies: - "@rollup/rollup-android-arm-eabi" "4.14.3" - "@rollup/rollup-android-arm64" "4.14.3" 
- "@rollup/rollup-darwin-arm64" "4.14.3" - "@rollup/rollup-darwin-x64" "4.14.3" - "@rollup/rollup-linux-arm-gnueabihf" "4.14.3" - "@rollup/rollup-linux-arm-musleabihf" "4.14.3" - "@rollup/rollup-linux-arm64-gnu" "4.14.3" - "@rollup/rollup-linux-arm64-musl" "4.14.3" - "@rollup/rollup-linux-powerpc64le-gnu" "4.14.3" - "@rollup/rollup-linux-riscv64-gnu" "4.14.3" - "@rollup/rollup-linux-s390x-gnu" "4.14.3" - "@rollup/rollup-linux-x64-gnu" "4.14.3" - "@rollup/rollup-linux-x64-musl" "4.14.3" - "@rollup/rollup-win32-arm64-msvc" "4.14.3" - "@rollup/rollup-win32-ia32-msvc" "4.14.3" - "@rollup/rollup-win32-x64-msvc" "4.14.3" + "@rollup/rollup-android-arm-eabi" "4.17.2" + "@rollup/rollup-android-arm64" "4.17.2" + "@rollup/rollup-darwin-arm64" "4.17.2" + "@rollup/rollup-darwin-x64" "4.17.2" + "@rollup/rollup-linux-arm-gnueabihf" "4.17.2" + "@rollup/rollup-linux-arm-musleabihf" "4.17.2" + "@rollup/rollup-linux-arm64-gnu" "4.17.2" + "@rollup/rollup-linux-arm64-musl" "4.17.2" + "@rollup/rollup-linux-powerpc64le-gnu" "4.17.2" + "@rollup/rollup-linux-riscv64-gnu" "4.17.2" + "@rollup/rollup-linux-s390x-gnu" "4.17.2" + "@rollup/rollup-linux-x64-gnu" "4.17.2" + "@rollup/rollup-linux-x64-musl" "4.17.2" + "@rollup/rollup-win32-arm64-msvc" "4.17.2" + "@rollup/rollup-win32-ia32-msvc" "4.17.2" + "@rollup/rollup-win32-x64-msvc" "4.17.2" fsevents "~2.3.2" run-parallel@^1.1.9: @@ -6250,7 +6278,7 @@ semver@^6.3.0, semver@^6.3.1: resolved "https://registry.yarnpkg.com/semver/-/semver-6.3.1.tgz#556d2ef8689146e46dcea4bfdd095f3434dffcb4" integrity sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA== -semver@^7.3.4, semver@^7.3.7, semver@^7.5.3, semver@^7.5.4, semver@^7.6.0: +semver@^7.3.4, semver@^7.5.3, semver@^7.5.4, semver@^7.6.0: version "7.6.0" resolved "https://registry.yarnpkg.com/semver/-/semver-7.6.0.tgz#1a46a4db4bffcccd97b743b5005c8325f23d4e2d" integrity sha512-EnwXhrlwXMk9gKu5/flx5sv/an57AkRplG3hTK68W7FRDN+k+OWBj65M7719OkA82XLBxrcX0KSHj+X5COhOVg== @@ -6386,6 +6414,13 @@ snapdragon@^0.8.1: source-map-resolve "^0.5.0" use "^3.1.0" +sonic-forest@^1.0.0: + version "1.0.2" + resolved "https://registry.yarnpkg.com/sonic-forest/-/sonic-forest-1.0.2.tgz#d80aa621d1cffe75a606ca44789ccff30f5b9ce6" + integrity sha512-2rICdwIJi5kVlehMUVtJeHn3ohh5YZV4pDv0P0c1M11cRz/gXNViItpM94HQwfvnXuzybpqK0LZJgTa3lEwtAw== + dependencies: + tree-dump "^1.0.0" + source-map-resolve@^0.5.0: version "0.5.3" resolved "https://registry.yarnpkg.com/source-map-resolve/-/source-map-resolve-0.5.3.tgz#190866bece7553e1f8f267a2ee82c606b5509a1a" @@ -6755,6 +6790,11 @@ textextensions@^3.2.0: resolved "https://registry.yarnpkg.com/textextensions/-/textextensions-3.3.0.tgz#03530d5287b86773c08b77458589148870cc71d3" integrity sha512-mk82dS8eRABNbeVJrEiN5/UMSCliINAuz8mkUwH4SwslkNP//gbEzlWNS5au0z5Dpx40SQxzqZevZkn+WYJ9Dw== +thingies@^1.20.0: + version "1.21.0" + resolved "https://registry.yarnpkg.com/thingies/-/thingies-1.21.0.tgz#e80fbe58fd6fdaaab8fad9b67bd0a5c943c445c1" + integrity sha512-hsqsJsFMsV+aD4s3CWKk85ep/3I9XzYV/IXaSouJMYIoDlgyi11cBhsqYe9/geRfB0YIikBQg6raRaM+nIMP9g== + through2-filter@^3.0.0: version "3.0.0" resolved "https://registry.yarnpkg.com/through2-filter/-/through2-filter-3.0.0.tgz#700e786df2367c2c88cd8aa5be4cf9c1e7831254" @@ -6866,12 +6906,17 @@ totalist@^3.0.0: resolved "https://registry.yarnpkg.com/totalist/-/totalist-3.0.1.tgz#ba3a3d600c915b1a97872348f79c127475f6acf8" integrity sha512-sf4i37nQ2LBx4m3wB74y+ubopq6W/dIzXg0FDGjsYnZHVa1Da8FH853wlL2gtUhg+xJXjfk3kUZS3BRoQeoQBQ== +tree-dump@^1.0.0: + 
version "1.0.1" + resolved "https://registry.yarnpkg.com/tree-dump/-/tree-dump-1.0.1.tgz#b448758da7495580e6b7830d6b7834fca4c45b96" + integrity sha512-WCkcRBVPSlHHq1dc/px9iOfqklvzCbdRwvlNfxGZsrHqf6aZttfPrd7DJTt6oR10dwUfpFFQeVTkPbBIZxX/YA== + trim-newlines@^4.0.2: version "4.1.1" resolved "https://registry.yarnpkg.com/trim-newlines/-/trim-newlines-4.1.1.tgz#28c88deb50ed10c7ba6dc2474421904a00139125" integrity sha512-jRKj0n0jXWo6kh62nA5TEh3+4igKDXLvzBJcPpiizP7oOolUrYIxmVBG9TOtHYFHoddUk6YvAkGeGoSVTXfQXQ== -ts-api-utils@^1.3.0: +ts-api-utils@^1.0.1, ts-api-utils@^1.3.0: version "1.3.0" resolved "https://registry.yarnpkg.com/ts-api-utils/-/ts-api-utils-1.3.0.tgz#4b490e27129f1e8e686b45cc4ab63714dc60eea1" integrity sha512-UQMIo7pb8WRomKR1/+MFVLTroIvDVtMX3K6OUir8ynLyzB8Jeriont2bTAtmNPa1ekAgN7YPDyf6V+ygrdU+eQ== @@ -6909,23 +6954,11 @@ ts-node@10.9.2: v8-compile-cache-lib "^3.0.1" yn "3.1.1" -tslib@^1.8.1: - version "1.14.1" - resolved "https://registry.yarnpkg.com/tslib/-/tslib-1.14.1.tgz#cf2d38bdc34a134bcaf1091c41f6619e2f672d00" - integrity sha512-Xni35NKzjgMrwevysHTCArtLDpPvye8zV/0E4EyYn43P7/7qvQwPh9BGkHewbMulVntbigmcT7rdX3BNo9wRJg== - tslib@^2.0.0, tslib@^2.1.0, tslib@^2.3.0, tslib@^2.4.0, tslib@^2.6.2: version "2.6.2" resolved "https://registry.yarnpkg.com/tslib/-/tslib-2.6.2.tgz#703ac29425e7b37cd6fd456e92404d46d1f3e4ae" integrity sha512-AEYxH93jGFPn/a2iVAwW87VuUIkR1FVUKB77NwMF7nBTDkDrrT/Hpt/IrCJ0QXhW27jTBDcf5ZY7w6RiqTMw2Q== -tsutils@^3.21.0: - version "3.21.0" - resolved "https://registry.yarnpkg.com/tsutils/-/tsutils-3.21.0.tgz#b48717d394cea6c1e096983eed58e9d61715b623" - integrity sha512-mHKK3iUXL+3UF6xL5k0PEhKRUBKPBCv/+RkEOpjRWxxx27KKRBmmA60A9pgOUvMi8GKhRMPEmjBRPzs2W7O1OA== - dependencies: - tslib "^1.8.1" - type-check@^0.4.0, type-check@~0.4.0: version "0.4.0" resolved "https://registry.yarnpkg.com/type-check/-/type-check-0.4.0.tgz#07b8203bfa7056c0657050e3ccd2c37730bab8f1" diff --git a/matlab/src/cpp/arrow/matlab/c/proxy/array.cc b/matlab/src/cpp/arrow/matlab/c/proxy/array.cc new file mode 100644 index 0000000000000..a5f3418f1bcfa --- /dev/null +++ b/matlab/src/cpp/arrow/matlab/c/proxy/array.cc @@ -0,0 +1,49 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include <cstdint> +#include "arrow/c/abi.h" + +#include "arrow/matlab/c/proxy/array.h" + +#include "libmexclass/proxy/Proxy.h" + +namespace arrow::matlab::c::proxy { + +Array::Array() : arrowArray{} { REGISTER_METHOD(Array, getAddress); } + +Array::~Array() { + if (arrowArray.release != NULL) { + arrowArray.release(&arrowArray); + arrowArray.release = NULL; + } +} + +libmexclass::proxy::MakeResult Array::make( + const libmexclass::proxy::FunctionArguments& constructor_arguments) { + return std::make_shared<Array>(); +} + +void Array::getAddress(libmexclass::proxy::method::Context& context) { + namespace mda = ::matlab::data; + + mda::ArrayFactory factory; + auto address = reinterpret_cast<uint64_t>(&arrowArray); + context.outputs[0] = factory.createScalar(address); +} + +} // namespace arrow::matlab::c::proxy \ No newline at end of file diff --git a/matlab/src/cpp/arrow/matlab/c/proxy/array.h b/matlab/src/cpp/arrow/matlab/c/proxy/array.h new file mode 100644 index 0000000000000..bb35807fcd015 --- /dev/null +++ b/matlab/src/cpp/arrow/matlab/c/proxy/array.h @@ -0,0 +1,39 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/c/abi.h" + +#include "libmexclass/proxy/Proxy.h" + +namespace arrow::matlab::c::proxy { + +class Array : public libmexclass::proxy::Proxy { + public: + Array(); + + ~Array(); + + static libmexclass::proxy::MakeResult make( + const libmexclass::proxy::FunctionArguments& constructor_arguments); + + protected: + void getAddress(libmexclass::proxy::method::Context& context); + + struct ArrowArray arrowArray; +}; + +} // namespace arrow::matlab::c::proxy diff --git a/matlab/src/cpp/arrow/matlab/c/proxy/schema.cc b/matlab/src/cpp/arrow/matlab/c/proxy/schema.cc new file mode 100644 index 0000000000000..7f239f5628720 --- /dev/null +++ b/matlab/src/cpp/arrow/matlab/c/proxy/schema.cc @@ -0,0 +1,49 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License.
+ +#include <cstdint> +#include "arrow/c/abi.h" + +#include "arrow/matlab/c/proxy/schema.h" + +#include "libmexclass/proxy/Proxy.h" + +namespace arrow::matlab::c::proxy { + +Schema::Schema() : arrowSchema{} { REGISTER_METHOD(Schema, getAddress); } + +Schema::~Schema() { + if (arrowSchema.release != NULL) { + arrowSchema.release(&arrowSchema); + arrowSchema.release = NULL; + } +} + +libmexclass::proxy::MakeResult Schema::make( + const libmexclass::proxy::FunctionArguments& constructor_arguments) { + return std::make_shared<Schema>(); +} + +void Schema::getAddress(libmexclass::proxy::method::Context& context) { + namespace mda = ::matlab::data; + + mda::ArrayFactory factory; + auto address = reinterpret_cast<uint64_t>(&arrowSchema); + context.outputs[0] = factory.createScalar(address); +} + +} // namespace arrow::matlab::c::proxy diff --git a/matlab/src/cpp/arrow/matlab/c/proxy/schema.h b/matlab/src/cpp/arrow/matlab/c/proxy/schema.h new file mode 100644 index 0000000000000..8f781ea9c7341 --- /dev/null +++ b/matlab/src/cpp/arrow/matlab/c/proxy/schema.h @@ -0,0 +1,39 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License.
+ +#include "arrow/c/abi.h" + +#include "libmexclass/proxy/Proxy.h" + +namespace arrow::matlab::c::proxy { + +class Schema : public libmexclass::proxy::Proxy { + public: + Schema(); + + ~Schema(); + + static libmexclass::proxy::MakeResult make( + const libmexclass::proxy::FunctionArguments& constructor_arguments); + + protected: + void getAddress(libmexclass::proxy::method::Context& context); + + struct ArrowSchema arrowSchema; +}; + +} // namespace arrow::matlab::c::proxy diff --git a/matlab/src/cpp/arrow/matlab/proxy/factory.cc b/matlab/src/cpp/arrow/matlab/proxy/factory.cc index 23492f75deacc..d7a8fa9ac2e74 100644 --- a/matlab/src/cpp/arrow/matlab/proxy/factory.cc +++ b/matlab/src/cpp/arrow/matlab/proxy/factory.cc @@ -25,6 +25,8 @@ #include "arrow/matlab/array/proxy/time64_array.h" #include "arrow/matlab/array/proxy/timestamp_array.h" #include "arrow/matlab/buffer/proxy/buffer.h" +#include "arrow/matlab/c/proxy/array.h" +#include "arrow/matlab/c/proxy/schema.h" #include "arrow/matlab/error/error.h" #include "arrow/matlab/io/csv/proxy/table_reader.h" #include "arrow/matlab/io/csv/proxy/table_writer.h" @@ -99,6 +101,8 @@ libmexclass::proxy::MakeResult Factory::make_proxy( REGISTER_PROXY(arrow.io.feather.proxy.Reader , arrow::matlab::io::feather::proxy::Reader); REGISTER_PROXY(arrow.io.csv.proxy.TableWriter , arrow::matlab::io::csv::proxy::TableWriter); REGISTER_PROXY(arrow.io.csv.proxy.TableReader , arrow::matlab::io::csv::proxy::TableReader); + REGISTER_PROXY(arrow.c.proxy.Array , arrow::matlab::c::proxy::Array); + REGISTER_PROXY(arrow.c.proxy.Schema , arrow::matlab::c::proxy::Schema); // clang-format on return libmexclass::error::Error{error::UNKNOWN_PROXY_ERROR_ID, diff --git a/matlab/src/matlab/+arrow/+c/Array.m b/matlab/src/matlab/+arrow/+c/Array.m new file mode 100644 index 0000000000000..574fca9afebd8 --- /dev/null +++ b/matlab/src/matlab/+arrow/+c/Array.m @@ -0,0 +1,37 @@ +%ARRAY Wrapper for an Arrow C Data Interface format ArrowArray C struct pointer. + +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. +classdef Array < matlab.mixin.Scalar + + properties (Hidden, SetAccess=private, GetAccess=public) + Proxy + end + + properties(Dependent, GetAccess=public, SetAccess=private) + Address(1, 1) uint64 + end + + methods + function obj = Array() + proxyName = "arrow.c.proxy.Array"; + obj.Proxy = arrow.internal.proxy.create(proxyName); + end + + function address = get.Address(obj) + address = obj.Proxy.getAddress(); + end + end +end \ No newline at end of file diff --git a/matlab/src/matlab/+arrow/+c/Schema.m b/matlab/src/matlab/+arrow/+c/Schema.m new file mode 100644 index 0000000000000..29eba59016044 --- /dev/null +++ b/matlab/src/matlab/+arrow/+c/Schema.m @@ -0,0 +1,37 @@ +%SCHEMA Wrapper for an Arrow C Data Interface format ArrowSchema C struct pointer. 
+ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. +classdef Schema < matlab.mixin.Scalar + + properties (Hidden, SetAccess=private, GetAccess=public) + Proxy + end + + properties(Dependent, GetAccess=public, SetAccess=private) + Address(1, 1) uint64 + end + + methods + function obj = Schema() + proxyName = "arrow.c.proxy.Schema"; + obj.Proxy = arrow.internal.proxy.create(proxyName); + end + + function address = get.Address(obj) + address = obj.Proxy.getAddress(); + end + end +end \ No newline at end of file diff --git a/matlab/src/matlab/+arrow/+internal/+test/+display/makeLinkString.m b/matlab/src/matlab/+arrow/+internal/+test/+display/makeLinkString.m index 79065ba1c8cfd..e99dd7d78488d 100644 --- a/matlab/src/matlab/+arrow/+internal/+test/+display/makeLinkString.m +++ b/matlab/src/matlab/+arrow/+internal/+test/+display/makeLinkString.m @@ -26,11 +26,11 @@ end if opts.BoldFont - link = compose("%s", ... opts.FullClassName, opts.ClassName); else - link = compose("%s", ... + link = compose("%s", ... opts.FullClassName, opts.ClassName); end end \ No newline at end of file diff --git a/matlab/src/matlab/+arrow/+tabular/+internal/+display/getSchemaString.m b/matlab/src/matlab/+arrow/+tabular/+internal/+display/getSchemaString.m index 7da945ca993ef..724b4873c92e1 100644 --- a/matlab/src/matlab/+arrow/+tabular/+internal/+display/getSchemaString.m +++ b/matlab/src/matlab/+arrow/+tabular/+internal/+display/getSchemaString.m @@ -43,7 +43,7 @@ classNameAndIDs = strings([1 numel(typeIDs) * 2]); classNameAndIDs(1:2:end-1) = classNames; classNameAndIDs(2:2:end) = typeIDs; - typeIDs = compose("%s", classNameAndIDs); + typeIDs = compose("%s", classNameAndIDs); end text = names + ": " + typeIDs; diff --git a/matlab/test/arrow/c/tArray.m b/matlab/test/arrow/c/tArray.m new file mode 100644 index 0000000000000..f8caf48065114 --- /dev/null +++ b/matlab/test/arrow/c/tArray.m @@ -0,0 +1,48 @@ +%TARRAY Defines unit tests for arrow.c.Array. + +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. 
+classdef tArray < matlab.unittest.TestCase + + methods (Test) + function TestClassStructure(testCase) + array = arrow.c.Array(); + + % Verify array is an instance of arrow.c.Array. + testCase.verifyInstanceOf(array, "arrow.c.Array"); + + % Verify array has one public property named Address. + props = properties(array); + testCase.verifyEqual(props, {'Address'}); + end + + function TestAddressProperty(testCase) + array = arrow.c.Array(); + + % It's impossible to know what the value of Address will be. + % Just verify Address is a scalar uint64. + address = array.Address; + testCase.verifyInstanceOf(address, "uint64"); + testCase.verifyTrue(isscalar(address)); + end + + function TestAddressNoSetter(testCase) + % Verify the Address property is read-only. + array = arrow.c.Array(); + fcn = @() setfield(array, "Address", uint64(10)); + testCase.verifyError(fcn, "MATLAB:class:SetProhibited"); + end + end +end \ No newline at end of file diff --git a/matlab/test/arrow/c/tSchema.m b/matlab/test/arrow/c/tSchema.m new file mode 100644 index 0000000000000..16dcf1965b463 --- /dev/null +++ b/matlab/test/arrow/c/tSchema.m @@ -0,0 +1,48 @@ +%TSCHEMA Defines unit tests for arrow.c.Schema. + +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. +classdef tSchema < matlab.unittest.TestCase + + methods (Test) + function TestClassStructure(testCase) + schema = arrow.c.Schema(); + + % Verify schema is an instance of arrow.c.Schema. + testCase.verifyInstanceOf(schema, "arrow.c.Schema"); + + % Verify schema has one public property named Address. + props = properties(schema); + testCase.verifyEqual(props, {'Address'}); + end + + function TestAddressProperty(testCase) + schema = arrow.c.Schema(); + + % It's impossible to know what the value of Address will be. + % Just verify Address is a scalar uint64. + address = schema.Address; + testCase.verifyInstanceOf(address, "uint64"); + testCase.verifyTrue(isscalar(address)); + end + + function TestAddressNoSetter(testCase) + % Verify the Address property is read-only. + schema = arrow.c.Schema(); + fcn = @() setfield(schema, "Address", uint64(10)); + testCase.verifyError(fcn, "MATLAB:class:SetProhibited"); + end + end +end \ No newline at end of file diff --git a/matlab/tools/cmake/BuildMatlabArrowInterface.cmake b/matlab/tools/cmake/BuildMatlabArrowInterface.cmake index cb746e08b1f8e..8f37bef77b859 100644 --- a/matlab/tools/cmake/BuildMatlabArrowInterface.cmake +++ b/matlab/tools/cmake/BuildMatlabArrowInterface.cmake @@ -24,8 +24,7 @@ set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_FETCH_CONTENT_NAME libmexclass) # libmexclass is accessible for CI without permission issues. 
set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_FETCH_CONTENT_GIT_REPOSITORY "https://github.com/mathworks/libmexclass.git") # Use a specific Git commit hash to avoid libmexclass version changing unexpectedly. -set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_FETCH_CONTENT_GIT_TAG "d04f88d") - +set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_FETCH_CONTENT_GIT_TAG "ca3cea6") set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_FETCH_CONTENT_SOURCE_SUBDIR "libmexclass/cpp") # ------------------------------------------ @@ -76,7 +75,9 @@ set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_SOURCES "${CMAKE_SOURCE_DIR}/src/cpp/a "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/io/csv/proxy/table_writer.cc" "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/io/csv/proxy/table_reader.cc" "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/index/validate.cc" - "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/buffer/proxy/buffer.cc") + "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/buffer/proxy/buffer.cc" + "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/c/proxy/array.cc" + "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/c/proxy/schema.cc") set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_FACTORY_INCLUDE_DIR "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/proxy") diff --git a/matlab/tools/packageMatlabInterface.m b/matlab/tools/packageMatlabInterface.m index 55b4d4241a569..3d970002614ab 100644 --- a/matlab/tools/packageMatlabInterface.m +++ b/matlab/tools/packageMatlabInterface.m @@ -55,9 +55,18 @@ opts.SupportedPlatforms.Glnxa64 = true; opts.SupportedPlatforms.MatlabOnline = true; -% Interface is only qualified against R2023a at the moment -opts.MinimumMatlabRelease = "R2023a"; -opts.MaximumMatlabRelease = "R2023a"; +% MEX files use run-time libraries shipped with MATLAB (e.g. libmx, libmex, +% etc.). MEX files linked against earlier versions of MATLAB run-time libraries +% will most likely work on newer versions of MATLAB. However, this may not +% always be the case. +% +% For now, set the earliest and latest compatible releases of MATLAB to +% the release of MATLAB used to build and package the MATLAB Arrow Interface.
+% +% See: https://www.mathworks.com/help/matlab/matlab_external/version-compatibility.html +currentRelease = matlabRelease.Release; +opts.MinimumMatlabRelease = currentRelease; +opts.MaximumMatlabRelease = currentRelease; opts.OutputFile = fullfile(outputFolder, compose("matlab-arrow-%s.mltbx", toolboxVersionRaw)); disp("Output File: " + opts.OutputFile); diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 212862357ace2..07acb9e31a731 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -108,25 +108,6 @@ if(UNIX) endif() endif() -# Top level cmake dir -if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") - option(PYARROW_BUILD_ACERO "Build the PyArrow Acero integration" OFF) - option(PYARROW_BUILD_CUDA "Build the PyArrow CUDA support" OFF) - option(PYARROW_BUILD_DATASET "Build the PyArrow Dataset integration" OFF) - option(PYARROW_BUILD_FLIGHT "Build the PyArrow Flight integration" OFF) - option(PYARROW_BUILD_GANDIVA "Build the PyArrow Gandiva integration" OFF) - option(PYARROW_BUILD_ORC "Build the PyArrow ORC integration" OFF) - option(PYARROW_BUILD_PARQUET "Build the PyArrow Parquet integration" OFF) - option(PYARROW_BUILD_PARQUET_ENCRYPTION - "Build the PyArrow Parquet encryption integration" OFF) - option(PYARROW_BUNDLE_ARROW_CPP "Bundle the Arrow C++ libraries" OFF) - option(PYARROW_BUNDLE_CYTHON_CPP "Bundle the C++ files generated by Cython" OFF) - option(PYARROW_GENERATE_COVERAGE "Build with Cython code coverage enabled" OFF) - set(PYARROW_CXXFLAGS - "" - CACHE STRING "Compiler flags to append when compiling Arrow") -endif() - find_program(CCACHE_FOUND ccache) if(CCACHE_FOUND AND NOT CMAKE_C_COMPILER_LAUNCHER @@ -265,11 +246,70 @@ message(STATUS "NumPy include dir: ${NUMPY_INCLUDE_DIRS}") include(UseCython) -# PyArrow C++ +# Arrow C++ and set default PyArrow build options include(GNUInstallDirs) - find_package(Arrow REQUIRED) +macro(define_option name description arrow_option) + set("PYARROW_${name}" + "AUTO" + CACHE STRING ${description}) + + if("${PYARROW_${name}}" STREQUAL "AUTO") + # by default, first check if env variable exists, otherwise use Arrow C++ config + set(env_variable "PYARROW_WITH_${name}") + if(DEFINED ENV{${env_variable}}) + if($ENV{${env_variable}}) + set("PYARROW_BUILD_${name}" ON) + else() + set("PYARROW_BUILD_${name}" OFF) + endif() + else() + if(${arrow_option}) + set("PYARROW_BUILD_${name}" ON) + else() + set("PYARROW_BUILD_${name}" OFF) + endif() + endif() + else() + if("${PYARROW_${name}}") + set("PYARROW_BUILD_${name}" ON) + else() + set("PYARROW_BUILD_${name}" OFF) + endif() + endif() +endmacro() + +define_option(ACERO "Build the PyArrow Acero integration" ARROW_ACERO) +define_option(CUDA "Build the PyArrow CUDA support" ARROW_CUDA) +define_option(DATASET "Build the PyArrow Dataset integration" ARROW_DATASET) +define_option(FLIGHT "Build the PyArrow Flight integration" ARROW_FLIGHT) +define_option(GANDIVA "Build the PyArrow Gandiva integration" ARROW_GANDIVA) +define_option(ORC "Build the PyArrow ORC integration" ARROW_ORC) +define_option(PARQUET "Build the PyArrow Parquet integration" ARROW_PARQUET) +define_option(PARQUET_ENCRYPTION "Build the PyArrow Parquet encryption integration" + PARQUET_REQUIRE_ENCRYPTION) +define_option(SUBSTRAIT "Build the PyArrow Substrait integration" ARROW_SUBSTRAIT) +define_option(AZURE "Build the PyArrow Azure integration" ARROW_AZURE) +define_option(GCS "Build the PyArrow GCS integration" ARROW_GCS) +define_option(S3 "Build the PyArrow S3 integration" ARROW_S3) +define_option(HDFS 
"Build the PyArrow HDFS integration" ARROW_HDFS) +option(PYARROW_BUNDLE_ARROW_CPP "Bundle the Arrow C++ libraries" OFF) +option(PYARROW_BUNDLE_CYTHON_CPP "Bundle the C++ files generated by Cython" OFF) +option(PYARROW_GENERATE_COVERAGE "Build with Cython code coverage enabled" OFF) +set(PYARROW_CXXFLAGS + "" + CACHE STRING "Compiler flags to append when compiling PyArrow C++") + +# enforce module dependencies +if(PYARROW_BUILD_SUBSTRAIT) + set(PYARROW_BUILD_DATASET ON) +endif() +if(PYARROW_BUILD_DATASET) + set(PYARROW_BUILD_ACERO ON) +endif() + +# PyArrow C++ set(PYARROW_CPP_ROOT_DIR pyarrow/src) set(PYARROW_CPP_SOURCE_DIR ${PYARROW_CPP_ROOT_DIR}/arrow/python) set(PYARROW_CPP_SRCS @@ -305,6 +345,7 @@ set(PYARROW_CPP_LINK_LIBS "") # Check all the options from Arrow and PyArrow C++ to be in line if(PYARROW_BUILD_DATASET) + message(STATUS "Building PyArrow with Dataset") if(NOT ARROW_DATASET) message(FATAL_ERROR "You must build Arrow C++ with ARROW_DATASET=ON") endif() @@ -317,6 +358,7 @@ if(PYARROW_BUILD_DATASET) endif() if(PYARROW_BUILD_ACERO) + message(STATUS "Building PyArrow with Acero") if(NOT ARROW_ACERO) message(FATAL_ERROR "You must build Arrow C++ with ARROW_ACERO=ON") endif() @@ -329,18 +371,13 @@ if(PYARROW_BUILD_ACERO) endif() if(PYARROW_BUILD_PARQUET OR PYARROW_BUILD_PARQUET_ENCRYPTION) + message(STATUS "Building PyArrow with Parquet") if(NOT ARROW_PARQUET) message(FATAL_ERROR "You must build Arrow C++ with ARROW_PARQUET=ON") endif() find_package(Parquet REQUIRED) endif() -if(PYARROW_BUILD_HDFS) - if(NOT ARROW_HDFS) - message(FATAL_ERROR "You must build Arrow C++ with ARROW_HDFS=ON") - endif() -endif() - # Check for only Arrow C++ options if(ARROW_CSV) list(APPEND PYARROW_CPP_SRCS ${PYARROW_CPP_SOURCE_DIR}/csv.cc) @@ -400,6 +437,7 @@ endif() set(PYARROW_CPP_FLIGHT_SRCS ${PYARROW_CPP_SOURCE_DIR}/flight.cc) if(PYARROW_BUILD_FLIGHT) + message(STATUS "Building PyArrow with Flight") if(NOT ARROW_FLIGHT) message(FATAL_ERROR "You must build Arrow C++ with ARROW_FLIGHT=ON") endif() @@ -555,23 +593,39 @@ set_source_files_properties(pyarrow/lib.pyx PROPERTIES CYTHON_API TRUE) set(LINK_LIBS arrow_python) if(PYARROW_BUILD_AZURE) + message(STATUS "Building PyArrow with Azure") + if(NOT ARROW_AZURE) + message(FATAL_ERROR "You must build Arrow C++ with ARROW_AZURE=ON") + endif() list(APPEND CYTHON_EXTENSIONS _azurefs) endif() if(PYARROW_BUILD_GCS) + message(STATUS "Building PyArrow with GCS") + if(NOT ARROW_GCS) + message(FATAL_ERROR "You must build Arrow C++ with ARROW_GCS=ON") + endif() list(APPEND CYTHON_EXTENSIONS _gcsfs) endif() if(PYARROW_BUILD_S3) + message(STATUS "Building PyArrow with S3") + if(NOT ARROW_S3) + message(FATAL_ERROR "You must build Arrow C++ with ARROW_S3=ON") + endif() list(APPEND CYTHON_EXTENSIONS _s3fs) endif() if(PYARROW_BUILD_HDFS) + message(STATUS "Building PyArrow with HDFS") + if(NOT ARROW_HDFS) + message(FATAL_ERROR "You must build Arrow C++ with ARROW_HDFS=ON") + endif() list(APPEND CYTHON_EXTENSIONS _hdfs) endif() if(PYARROW_BUILD_CUDA) - # Arrow CUDA + message(STATUS "Building PyArrow with CUDA") if(NOT ARROW_CUDA) message(FATAL_ERROR "You must build Arrow C++ with ARROW_CUDA=ON") endif() @@ -646,8 +700,9 @@ if(PYARROW_BUILD_PARQUET) endif() endif() +# ORC if(PYARROW_BUILD_ORC) - # ORC + message(STATUS "Building PyArrow with ORC") if(NOT ARROW_ORC) message(FATAL_ERROR "You must build Arrow C++ with ARROW_ORC=ON") endif() @@ -679,6 +734,7 @@ endif() # Substrait if(PYARROW_BUILD_SUBSTRAIT) + message(STATUS "Building PyArrow with Substrait") if(NOT 
ARROW_SUBSTRAIT) message(FATAL_ERROR "You must build Arrow C++ with ARROW_SUBSTRAIT=ON") endif() @@ -696,6 +752,7 @@ endif() # Gandiva if(PYARROW_BUILD_GANDIVA) + message(STATUS "Building PyArrow with Gandiva") if(NOT ARROW_GANDIVA) message(FATAL_ERROR "You must build Arrow C++ with ARROW_GANDIVA=ON") endif() diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index b36264a755b8c..15a26a1705329 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -2035,6 +2035,26 @@ class PairwiseOptions(_PairwiseOptions): self._set_options(period) +cdef class _ListFlattenOptions(FunctionOptions): + def _set_options(self, recursive): + self.wrapped.reset(new CListFlattenOptions(recursive)) + + +class ListFlattenOptions(_ListFlattenOptions): + """ + Options for `list_flatten` function + + Parameters + ---------- + recursive : bool, default False + When True, the list array is flattened recursively until an array + of non-list values is formed. + """ + + def __init__(self, recursive=False): + self._set_options(recursive) + + cdef class _ArraySortOptions(FunctionOptions): def _set_options(self, order, null_placement): self.wrapped.reset(new CArraySortOptions( diff --git a/python/pyarrow/_dataset_parquet.pxd b/python/pyarrow/_dataset_parquet.pxd index d5bc172d324d5..0a3a2ff526ea4 100644 --- a/python/pyarrow/_dataset_parquet.pxd +++ b/python/pyarrow/_dataset_parquet.pxd @@ -29,6 +29,7 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): cdef: CParquetFragmentScanOptions* parquet_options object _parquet_decryption_config + object _decryption_properties cdef void init(self, const shared_ptr[CFragmentScanOptions]& sp) cdef CReaderProperties* reader_properties(self) diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx index a55e889ba8246..4942336a12666 100644 --- a/python/pyarrow/_dataset_parquet.pyx +++ b/python/pyarrow/_dataset_parquet.pyx @@ -56,7 +56,7 @@ from pyarrow._parquet cimport ( try: from pyarrow._dataset_parquet_encryption import ( - set_encryption_config, set_decryption_config + set_encryption_config, set_decryption_config, set_decryption_properties ) parquet_encryption_enabled = True except ImportError: @@ -127,8 +127,7 @@ cdef class ParquetFileFormat(FileFormat): 'instance of ParquetReadOptions') if default_fragment_scan_options is None: - default_fragment_scan_options = ParquetFragmentScanOptions( - **scan_args) + default_fragment_scan_options = ParquetFragmentScanOptions(**scan_args) elif isinstance(default_fragment_scan_options, dict): default_fragment_scan_options = ParquetFragmentScanOptions( **default_fragment_scan_options) @@ -715,6 +714,9 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): decryption_config : pyarrow.dataset.ParquetDecryptionConfig, default None If not None, use the provided ParquetDecryptionConfig to decrypt the Parquet file. + decryption_properties : pyarrow.parquet.FileDecryptionProperties, default None + If not None, use the provided FileDecryptionProperties to decrypt encrypted + Parquet file. page_checksum_verification : bool, default False If True, verify the page checksum for each page read from the file. 
""" @@ -729,6 +731,7 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): thrift_string_size_limit=None, thrift_container_size_limit=None, decryption_config=None, + decryption_properties=None, bint page_checksum_verification=False): self.init(shared_ptr[CFragmentScanOptions]( new CParquetFragmentScanOptions())) @@ -743,6 +746,8 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): self.thrift_container_size_limit = thrift_container_size_limit if decryption_config is not None: self.parquet_decryption_config = decryption_config + if decryption_properties is not None: + self.decryption_properties = decryption_properties self.page_checksum_verification = page_checksum_verification cdef void init(self, const shared_ptr[CFragmentScanOptions]& sp): @@ -812,6 +817,25 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): raise ValueError("size must be larger than zero") self.reader_properties().set_thrift_container_size_limit(size) + @property + def decryption_properties(self): + if not parquet_encryption_enabled: + raise NotImplementedError( + "Unable to access encryption features. " + "Encryption is not enabled in your installation of pyarrow." + ) + return self._decryption_properties + + @decryption_properties.setter + def decryption_properties(self, config): + if not parquet_encryption_enabled: + raise NotImplementedError( + "Encryption is not enabled in your installation of pyarrow, but " + "decryption_properties were provided." + ) + set_decryption_properties(self, config) + self._decryption_properties = config + @property def parquet_decryption_config(self): if not parquet_encryption_enabled: diff --git a/python/pyarrow/_dataset_parquet_encryption.pyx b/python/pyarrow/_dataset_parquet_encryption.pyx index 11a7174eb3c9d..c8f5e5b01bf81 100644 --- a/python/pyarrow/_dataset_parquet_encryption.pyx +++ b/python/pyarrow/_dataset_parquet_encryption.pyx @@ -162,6 +162,14 @@ def set_encryption_config( opts.parquet_options.parquet_encryption_config = c_config +def set_decryption_properties( + ParquetFragmentScanOptions opts not None, + FileDecryptionProperties config not None +): + cdef CReaderProperties* reader_props = opts.reader_properties() + reader_props.file_decryption_properties(config.unwrap()) + + def set_decryption_config( ParquetFragmentScanOptions opts not None, ParquetDecryptionConfig config not None diff --git a/python/pyarrow/_fs.pyx b/python/pyarrow/_fs.pyx index 0e635b2c8a28a..dbfb6ed114553 100644 --- a/python/pyarrow/_fs.pyx +++ b/python/pyarrow/_fs.pyx @@ -18,6 +18,7 @@ # cython: language_level = 3 from cpython.datetime cimport datetime, PyDateTime_DateTime +from cython cimport binding from pyarrow.includes.common cimport * from pyarrow.includes.libarrow_python cimport PyDateTime_to_TimePoint @@ -421,6 +422,7 @@ cdef class FileSystem(_Weakrefable): "SubTreeFileSystem") @staticmethod + @binding(True) # Required for cython < 3 def _from_uri(uri): fs, _path = FileSystem.from_uri(uri) return fs diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 60fc09ea861b6..406830ad4dd69 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -2141,22 +2141,99 @@ cdef class Decimal256Array(FixedSizeBinaryArray): cdef class BaseListArray(Array): - def flatten(self): + def flatten(self, recursive=False): """ - Unnest this ListArray/LargeListArray by one level. - - The returned Array is logically a concatenation of all the sub-lists - in this Array. 
+ Unnest this [Large]ListArray/[Large]ListViewArray/FixedSizeListArray + according to 'recursive'. Note that this method is different from ``self.values`` in that it takes care of the slicing offset as well as null elements backed by non-empty sub-lists. + Parameters + ---------- + recursive : bool, default False, optional + When True, flatten this logical list-array recursively until an + array of non-list values is formed. + + When False, flatten only the top level. + Returns ------- result : Array + + Examples + -------- + + Basic logical list-array's flatten + >>> import pyarrow as pa + >>> values = [1, 2, 3, 4] + >>> offsets = [2, 1, 0] + >>> sizes = [2, 2, 2] + >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) + >>> array + + [ + [ + 3, + 4 + ], + [ + 2, + 3 + ], + [ + 1, + 2 + ] + ] + >>> array.flatten() + + [ + 3, + 4, + 2, + 3, + 1, + 2 + ] + + When recursive=True, nested list arrays are flattened recursively + until an array of non-list values is formed. + + >>> array = pa.array([ + ... None, + ... [ + ... [1, None, 2], + ... None, + ... [3, 4] + ... ], + ... [], + ... [ + ... [], + ... [5, 6], + ... None + ... ], + ... [ + ... [7, 8] + ... ] + ... ], type=pa.list_(pa.list_(pa.int64()))) + >>> array.flatten(True) + + [ + 1, + null, + 2, + 3, + 4, + 5, + 6, + 7, + 8 + ] """ - return _pc().list_flatten(self) + options = _pc().ListFlattenOptions(recursive) + return _pc().list_flatten(self, options=options) def value_parent_indices(self): """ @@ -2527,7 +2604,7 @@ cdef class LargeListArray(BaseListArray): return pyarrow_wrap_array(( self.ap).offsets()) -cdef class ListViewArray(Array): +cdef class ListViewArray(BaseListArray): """ Concrete class for Arrow arrays of a list view data type. """ @@ -2747,69 +2824,8 @@ cdef class ListViewArray(Array): """ return pyarrow_wrap_array(( self.ap).sizes()) - def flatten(self, memory_pool=None): - """ - Unnest this ListViewArray by one level. - - The returned Array is logically a concatenation of all the sub-lists - in this Array. - - Note that this method is different from ``self.values`` in that - it takes care of the slicing offset as well as null elements backed - by non-empty sub-lists. - - Parameters - ---------- - memory_pool : MemoryPool, optional - - Returns - ------- - result : Array - - Examples - -------- - - >>> import pyarrow as pa - >>> values = [1, 2, 3, 4] - >>> offsets = [2, 1, 0] - >>> sizes = [2, 2, 2] - >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) - >>> array - - [ - [ - 3, - 4 - ], - [ - 2, - 3 - ], - [ - 1, - 2 - ] - ] - >>> array.flatten() - - [ - 3, - 4, - 2, - 3, - 1, - 2 - ] - """ - cdef CMemoryPool* cpool = maybe_unbox_memory_pool(memory_pool) - with nogil: - out = GetResultValue(( self.ap).Flatten(cpool)) - cdef Array result = pyarrow_wrap_array(out) - result.validate() - return result - -cdef class LargeListViewArray(Array): +cdef class LargeListViewArray(BaseListArray): """ Concrete class for Arrow arrays of a large list view data type. @@ -3037,67 +3053,6 @@ cdef class LargeListViewArray(Array): """ return pyarrow_wrap_array(( self.ap).sizes()) - def flatten(self, memory_pool=None): - """ - Unnest this LargeListViewArray by one level. - - The returned Array is logically a concatenation of all the sub-lists - in this Array. - - Note that this method is different from ``self.values`` in that - it takes care of the slicing offset as well as null elements backed - by non-empty sub-lists. 
- - Parameters - ---------- - memory_pool : MemoryPool, optional - - Returns - ------- - result : Array - - Examples - -------- - - >>> import pyarrow as pa - >>> values = [1, 2, 3, 4] - >>> offsets = [2, 1, 0] - >>> sizes = [2, 2, 2] - >>> array = pa.LargeListViewArray.from_arrays(offsets, sizes, values) - >>> array - - [ - [ - 3, - 4 - ], - [ - 2, - 3 - ], - [ - 1, - 2 - ] - ] - >>> array.flatten() - - [ - 3, - 4, - 2, - 3, - 1, - 2 - ] - """ - cdef CMemoryPool* cpool = maybe_unbox_memory_pool(memory_pool) - with nogil: - out = GetResultValue(( self.ap).Flatten(cpool)) - cdef Array result = pyarrow_wrap_array(out) - result.validate() - return result - cdef class MapArray(ListArray): """ @@ -3965,12 +3920,11 @@ cdef class StructArray(Array): result : StructArray """ if by is not None: - tosort = self._flattened_field(by) + tosort, sort_keys = self._flattened_field(by), [("", order)] else: - tosort = self + tosort, sort_keys = self, [(field.name, order) for field in self.type] indices = _pc().sort_indices( - tosort, - options=_pc().SortOptions(sort_keys=[("", order)], **kwargs) + tosort, options=_pc().SortOptions(sort_keys=sort_keys, **kwargs) ) return self.take(indices) @@ -4029,7 +3983,7 @@ cdef class RunEndEncodedArray(Array): ------- RunEndEncodedArray """ - logical_length = run_ends[-1] if len(run_ends) > 0 else 0 + logical_length = scalar(run_ends[-1]).as_py() if len(run_ends) > 0 else 0 return RunEndEncodedArray._from_arrays(type, True, logical_length, run_ends, values, 0) diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index 205ab393b8b09..83612f66d21e2 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -44,6 +44,7 @@ IndexOptions, JoinOptions, ListSliceOptions, + ListFlattenOptions, MakeStructOptions, MapLookupOptions, MatchSubstringOptions, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 1da0a9f8858d6..cb91d2bd3d1b1 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2590,6 +2590,11 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil: CPairwiseOptions(int64_t period) int64_t period + cdef cppclass CListFlattenOptions\ + "arrow::compute::ListFlattenOptions"(CFunctionOptions): + CListFlattenOptions(c_bool recursive) + c_bool recursive + cdef cppclass CArraySortOptions \ "arrow::compute::ArraySortOptions"(CFunctionOptions): CArraySortOptions(CSortOrder, CNullPlacement) diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi index 7890bf4b2dd76..9e8026deb435c 100644 --- a/python/pyarrow/io.pxi +++ b/python/pyarrow/io.pxi @@ -1446,27 +1446,6 @@ cdef class Buffer(_Weakrefable): buffer.strides = self.strides buffer.suboffsets = NULL - def __getsegcount__(self, Py_ssize_t *len_out): - if len_out != NULL: - len_out[0] = self.size - return 1 - - def __getreadbuffer__(self, Py_ssize_t idx, void **p): - if idx != 0: - raise SystemError("accessing nonexistent buffer segment") - if p != NULL: - p[0] = self.buffer.get().data() - return self.size - - def __getwritebuffer__(self, Py_ssize_t idx, void **p): - if not self.buffer.get().is_mutable(): - raise SystemError("trying to write an immutable buffer") - if idx != 0: - raise SystemError("accessing nonexistent buffer segment") - if p != NULL: - p[0] = self.buffer.get().data() - return self.size - cdef class ResizableBuffer(Buffer): """ @@ -2142,21 +2121,21 @@ cdef class CacheOptions(_Weakrefable): Parameters ---------- hole_size_limit : int, default 8KiB - The maximum distance in bytes 
between two consecutive ranges; beyond + The maximum distance in bytes between two consecutive ranges; beyond this value, ranges are not combined. range_size_limit : int, default 32MiB - The maximum size in bytes of a combined range; if combining two - consecutive ranges would produce a range of a size greater than this, + The maximum size in bytes of a combined range; if combining two + consecutive ranges would produce a range of a size greater than this, they are not combined lazy : bool, default True lazy = false: request all byte ranges when PreBuffer or WillNeed is called. - lazy = True, prefetch_limit = 0: request merged byte ranges only after the reader - needs them. - lazy = True, prefetch_limit = k: prefetch up to k merged byte ranges ahead of the + lazy = True, prefetch_limit = 0: request merged byte ranges only after the reader + needs them. + lazy = True, prefetch_limit = k: prefetch up to k merged byte ranges ahead of the range that is currently being read. prefetch_limit : int, default 0 - The maximum number of ranges to be prefetched. This is only used for - lazy cache to asynchronously read some ranges after reading the target + The maximum number of ranges to be prefetched. This is only used for + lazy cache to asynchronously read some ranges after reading the target range. """ @@ -2227,19 +2206,19 @@ cdef class CacheOptions(_Weakrefable): """ Create suiteable CacheOptions based on provided network metrics. - Typically this will be used with object storage solutions like Amazon S3, + Typically this will be used with object storage solutions like Amazon S3, Google Cloud Storage and Azure Blob Storage. Parameters ---------- time_to_first_byte_millis : int - Seek-time or Time-To-First-Byte (TTFB) in milliseconds, also called call - setup latency of a new read request. The value is a positive integer. + Seek-time or Time-To-First-Byte (TTFB) in milliseconds, also called call + setup latency of a new read request. The value is a positive integer. transfer_bandwidth_mib_per_sec : int - Data transfer Bandwidth (BW) in MiB/sec (per connection). The value is a positive + Data transfer Bandwidth (BW) in MiB/sec (per connection). The value is a positive integer. ideal_bandwidth_utilization_frac : int, default 0.9 - Transfer bandwidth utilization fraction (per connection) to maximize the net + Transfer bandwidth utilization fraction (per connection) to maximize the net data load. The value is a positive float less than 1. max_ideal_request_size_mib : int, default 64 The maximum single data request size (in MiB) to maximize the net data load. 
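A usage sketch of the CacheOptions.from_network_metrics API documented in the io.pxi hunk above (illustrative only, not part of the patch): the 50 ms latency and 100 MiB/s bandwidth figures are made-up placeholders, and the snippet assumes a pyarrow build that exposes pyarrow.CacheOptions.

    import pyarrow as pa

    # Hypothetical object-store metrics: ~50 ms time-to-first-byte and
    # ~100 MiB/s of per-connection transfer bandwidth.
    cache_options = pa.CacheOptions.from_network_metrics(
        time_to_first_byte_millis=50,
        transfer_bandwidth_mib_per_sec=100,
    )

    # The derived coalescing limits can be inspected, or the options handed to
    # readers that accept CacheOptions (e.g. Parquet fragment scan options).
    print(cache_options.hole_size_limit, cache_options.range_size_limit)
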
diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index b1187a77c2a6e..bfd266a807c40 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -437,11 +437,11 @@ cdef class LargeListArray(BaseListArray): pass -cdef class ListViewArray(Array): +cdef class ListViewArray(BaseListArray): pass -cdef class LargeListViewArray(Array): +cdef class LargeListViewArray(BaseListArray): pass diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index 69a1c9d19aae2..f54a203c8794c 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -1299,7 +1299,7 @@ def __init__(self, path_or_paths, filesystem=None, schema=None, *, filters=None, f"local file systems, not {type(filesystem)}" ) - # check for single fragment dataset + # check for single fragment dataset or dataset directory single_file = None self._base_dir = None if not isinstance(path_or_paths, list): @@ -1313,8 +1313,6 @@ def __init__(self, path_or_paths, filesystem=None, schema=None, *, filters=None, except ValueError: filesystem = LocalFileSystem(use_mmap=memory_map) finfo = filesystem.get_file_info(path_or_paths) - if finfo.is_file: - single_file = path_or_paths if finfo.type == FileType.Directory: self._base_dir = path_or_paths else: @@ -1771,6 +1769,7 @@ def read_table(source, *, columns=None, use_threads=True, ignore_prefixes=ignore_prefixes, pre_buffer=pre_buffer, coerce_int96_timestamp_unit=coerce_int96_timestamp_unit, + decryption_properties=decryption_properties, thrift_string_size_limit=thrift_string_size_limit, thrift_container_size_limit=thrift_container_size_limit, page_checksum_verification=page_checksum_verification, diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc b/python/pyarrow/src/arrow/python/python_to_arrow.cc index 79da47567bf24..a2a325fde8dbd 100644 --- a/python/pyarrow/src/arrow/python/python_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc @@ -874,6 +874,10 @@ class PyListConverter : public ListConverter { if (PyArray_NDIM(ndarray) != 1) { return Status::Invalid("Can only convert 1-dimensional array values"); } + if (PyArray_ISBYTESWAPPED(ndarray)) { + // TODO + return Status::NotImplemented("Byte-swapped arrays not supported"); + } const int64_t size = PyArray_SIZE(ndarray); RETURN_NOT_OK(AppendTo(this->list_type_, size)); RETURN_NOT_OK(this->list_builder_->ValidateOverflow(size)); diff --git a/python/pyarrow/tests/parquet/test_encryption.py b/python/pyarrow/tests/parquet/test_encryption.py index edb6410d2fa0d..ff388ef506997 100644 --- a/python/pyarrow/tests/parquet/test_encryption.py +++ b/python/pyarrow/tests/parquet/test_encryption.py @@ -65,6 +65,44 @@ def basic_encryption_config(): return basic_encryption_config +def setup_encryption_environment(custom_kms_conf): + """ + Sets up and returns the KMS connection configuration and crypto factory + based on provided KMS configuration parameters. + """ + kms_connection_config = pe.KmsConnectionConfig(custom_kms_conf=custom_kms_conf) + + def kms_factory(kms_connection_configuration): + return InMemoryKmsClient(kms_connection_configuration) + + # Create our CryptoFactory + crypto_factory = pe.CryptoFactory(kms_factory) + + return kms_connection_config, crypto_factory + + +def write_encrypted_file(path, data_table, footer_key_name, col_key_name, + footer_key, col_key, encryption_config): + """ + Writes an encrypted parquet file based on the provided parameters. 
+ """ + # Setup the custom KMS configuration with provided keys + custom_kms_conf = { + footer_key_name: footer_key.decode("UTF-8"), + col_key_name: col_key.decode("UTF-8"), + } + + # Setup encryption environment + kms_connection_config, crypto_factory = setup_encryption_environment( + custom_kms_conf) + + # Write the encrypted parquet file + write_encrypted_parquet(path, data_table, encryption_config, + kms_connection_config, crypto_factory) + + return kms_connection_config, crypto_factory + + def test_encrypted_parquet_write_read(tempdir, data_table): """Write an encrypted parquet, verify it's encrypted, and then read it.""" path = tempdir / PARQUET_NAME @@ -81,20 +119,10 @@ def test_encrypted_parquet_write_read(tempdir, data_table): cache_lifetime=timedelta(minutes=5.0), data_key_length_bits=256) - kms_connection_config = pe.KmsConnectionConfig( - custom_kms_conf={ - FOOTER_KEY_NAME: FOOTER_KEY.decode("UTF-8"), - COL_KEY_NAME: COL_KEY.decode("UTF-8"), - } - ) - - def kms_factory(kms_connection_configuration): - return InMemoryKmsClient(kms_connection_configuration) + kms_connection_config, crypto_factory = write_encrypted_file( + path, data_table, FOOTER_KEY_NAME, COL_KEY_NAME, FOOTER_KEY, COL_KEY, + encryption_config) - crypto_factory = pe.CryptoFactory(kms_factory) - # Write with encryption properties - write_encrypted_parquet(path, data_table, encryption_config, - kms_connection_config, crypto_factory) verify_file_encrypted(path) # Read with decryption properties @@ -150,36 +178,22 @@ def test_encrypted_parquet_write_read_wrong_key(tempdir, data_table): cache_lifetime=timedelta(minutes=5.0), data_key_length_bits=256) - kms_connection_config = pe.KmsConnectionConfig( - custom_kms_conf={ - FOOTER_KEY_NAME: FOOTER_KEY.decode("UTF-8"), - COL_KEY_NAME: COL_KEY.decode("UTF-8"), - } - ) - - def kms_factory(kms_connection_configuration): - return InMemoryKmsClient(kms_connection_configuration) + write_encrypted_file(path, data_table, FOOTER_KEY_NAME, COL_KEY_NAME, + FOOTER_KEY, COL_KEY, encryption_config) - crypto_factory = pe.CryptoFactory(kms_factory) - # Write with encryption properties - write_encrypted_parquet(path, data_table, encryption_config, - kms_connection_config, crypto_factory) verify_file_encrypted(path) - # Read with decryption properties - wrong_kms_connection_config = pe.KmsConnectionConfig( - custom_kms_conf={ - # Wrong keys - mixup in names - FOOTER_KEY_NAME: COL_KEY.decode("UTF-8"), - COL_KEY_NAME: FOOTER_KEY.decode("UTF-8"), - } - ) + wrong_kms_connection_config, wrong_crypto_factory = setup_encryption_environment({ + FOOTER_KEY_NAME: COL_KEY.decode("UTF-8"), # Intentionally wrong + COL_KEY_NAME: FOOTER_KEY.decode("UTF-8"), # Intentionally wrong + }) + decryption_config = pe.DecryptionConfiguration( cache_lifetime=timedelta(minutes=5.0)) with pytest.raises(ValueError, match=r"Incorrect master key used"): read_encrypted_parquet( path, decryption_config, wrong_kms_connection_config, - crypto_factory) + wrong_crypto_factory) def test_encrypted_parquet_read_no_decryption_config(tempdir, data_table): @@ -219,23 +233,12 @@ def test_encrypted_parquet_write_no_col_key(tempdir, data_table): encryption_config = pe.EncryptionConfiguration( footer_key=FOOTER_KEY_NAME) - kms_connection_config = pe.KmsConnectionConfig( - custom_kms_conf={ - FOOTER_KEY_NAME: FOOTER_KEY.decode("UTF-8"), - COL_KEY_NAME: COL_KEY.decode("UTF-8"), - } - ) - - def kms_factory(kms_connection_configuration): - return InMemoryKmsClient(kms_connection_configuration) - - crypto_factory = 
pe.CryptoFactory(kms_factory) with pytest.raises(OSError, match="Either column_keys or uniform_encryption " "must be set"): # Write with encryption properties - write_encrypted_parquet(path, data_table, encryption_config, - kms_connection_config, crypto_factory) + write_encrypted_file(path, data_table, FOOTER_KEY_NAME, COL_KEY_NAME, + FOOTER_KEY, b"", encryption_config) def test_encrypted_parquet_write_kms_error(tempdir, data_table, @@ -497,24 +500,11 @@ def test_encrypted_parquet_loop(tempdir, data_table, basic_encryption_config): # Encrypt the footer with the footer key, # encrypt column `a` and column `b` with another key, - # keep `c` plaintext - encryption_config = basic_encryption_config + # keep `c` plaintext, defined in basic_encryption_config + kms_connection_config, crypto_factory = write_encrypted_file( + path, data_table, FOOTER_KEY_NAME, COL_KEY_NAME, FOOTER_KEY, COL_KEY, + basic_encryption_config) - kms_connection_config = pe.KmsConnectionConfig( - custom_kms_conf={ - FOOTER_KEY_NAME: FOOTER_KEY.decode("UTF-8"), - COL_KEY_NAME: COL_KEY.decode("UTF-8"), - } - ) - - def kms_factory(kms_connection_configuration): - return InMemoryKmsClient(kms_connection_configuration) - - crypto_factory = pe.CryptoFactory(kms_factory) - - # Write with encryption properties - write_encrypted_parquet(path, data_table, encryption_config, - kms_connection_config, crypto_factory) verify_file_encrypted(path) decryption_config = pe.DecryptionConfiguration( @@ -537,32 +527,46 @@ def test_read_with_deleted_crypto_factory(tempdir, data_table, basic_encryption_ Test that decryption properties can be used if the crypto factory is no longer alive """ path = tempdir / PARQUET_NAME - encryption_config = basic_encryption_config - kms_connection_config = pe.KmsConnectionConfig( - custom_kms_conf={ - FOOTER_KEY_NAME: FOOTER_KEY.decode("UTF-8"), - COL_KEY_NAME: COL_KEY.decode("UTF-8"), - } - ) - - def kms_factory(kms_connection_configuration): - return InMemoryKmsClient(kms_connection_configuration) - - encryption_crypto_factory = pe.CryptoFactory(kms_factory) - write_encrypted_parquet(path, data_table, encryption_config, - kms_connection_config, encryption_crypto_factory) + kms_connection_config, crypto_factory = write_encrypted_file( + path, data_table, FOOTER_KEY_NAME, COL_KEY_NAME, FOOTER_KEY, COL_KEY, + basic_encryption_config) verify_file_encrypted(path) - # Use a local function to get decryption properties, so the crypto factory that - # creates the properties will be deleted after it returns. - def get_decryption_properties(): - decryption_crypto_factory = pe.CryptoFactory(kms_factory) - decryption_config = pe.DecryptionConfiguration( - cache_lifetime=timedelta(minutes=5.0)) - return decryption_crypto_factory.file_decryption_properties( - kms_connection_config, decryption_config) + # Create decryption properties and delete the crypto factory that created + # the properties afterwards. 
+ decryption_config = pe.DecryptionConfiguration( + cache_lifetime=timedelta(minutes=5.0)) + file_decryption_properties = crypto_factory.file_decryption_properties( + kms_connection_config, decryption_config) + del crypto_factory result = pq.ParquetFile( - path, decryption_properties=get_decryption_properties()) + path, decryption_properties=file_decryption_properties) result_table = result.read(use_threads=True) assert data_table.equals(result_table) + + +def test_encrypted_parquet_read_table(tempdir, data_table, basic_encryption_config): + """Write an encrypted parquet then read it back using read_table.""" + path = tempdir / PARQUET_NAME + + # Write the encrypted parquet file using the utility function + kms_connection_config, crypto_factory = write_encrypted_file( + path, data_table, FOOTER_KEY_NAME, COL_KEY_NAME, FOOTER_KEY, COL_KEY, + basic_encryption_config) + + decryption_config = pe.DecryptionConfiguration( + cache_lifetime=timedelta(minutes=5.0)) + file_decryption_properties = crypto_factory.file_decryption_properties( + kms_connection_config, decryption_config) + + # Read the encrypted parquet file using read_table + result_table = pq.read_table(path, decryption_properties=file_decryption_properties) + + # Assert that the read table matches the original data + assert data_table.equals(result_table) + + # Read the encrypted parquet folder using read_table + result_table = pq.read_table( + tempdir, decryption_properties=file_decryption_properties) + assert data_table.equals(result_table) diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 156d58326b961..b89e0ace157af 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -1099,6 +1099,30 @@ def test_map_from_arrays(): with pytest.raises(ValueError): pa.MapArray.from_arrays(offsets, keys_with_null, items) + # Check if offset in offsets > 0 + offsets = pa.array(offsets, pa.int32()) + result = pa.MapArray.from_arrays(offsets.slice(1), keys, items) + expected = pa.MapArray.from_arrays([1, 3, 5], keys, items) + + assert result.equals(expected) + assert result.offset == 1 + assert expected.offset == 0 + + offsets = pa.array([0, 0, 0, 0, 0, 0], pa.int32()) + result = pa.MapArray.from_arrays( + offsets.slice(1), + pa.array([], pa.string()), + pa.array([], pa.string()), + ) + expected = pa.MapArray.from_arrays( + [0, 0, 0, 0, 0], + pa.array([], pa.string()), + pa.array([], pa.string()), + ) + assert result.equals(expected) + assert result.offset == 1 + assert expected.offset == 0 + def test_fixed_size_list_from_arrays(): values = pa.array(range(12), pa.int64()) @@ -2757,6 +2781,7 @@ def test_list_array_flatten(offset_type, list_type_factory): assert arr1.values.equals(arr0) assert arr2.flatten().flatten().equals(arr0) assert arr2.values.values.equals(arr0) + assert arr2.flatten(True).equals(arr0) @pytest.mark.parametrize('list_type', [ @@ -2778,7 +2803,9 @@ def test_list_value_parent_indices(list_type): @pytest.mark.parametrize(('offset_type', 'list_type'), [(pa.int32(), pa.list_(pa.int32())), (pa.int32(), pa.list_(pa.int32(), list_size=2)), - (pa.int64(), pa.large_list(pa.int32()))]) + (pa.int64(), pa.large_list(pa.int32())), + (pa.int32(), pa.list_view(pa.int32())), + (pa.int64(), pa.large_list_view(pa.int32()))]) def test_list_value_lengths(offset_type, list_type): # FixedSizeListArray needs fixed list sizes @@ -2876,6 +2903,8 @@ def test_fixed_size_list_array_flatten(): assert arr0.type.equals(typ0) assert arr1.flatten().equals(arr0) assert 
arr2.flatten().flatten().equals(arr0) + assert arr2.flatten().equals(arr1) + assert arr2.flatten(True).equals(arr0) def test_fixed_size_list_array_flatten_with_slice(): @@ -3507,6 +3536,14 @@ def test_struct_array_sort(): {"a": 5, "b": "foo"}, ] + sorted_arr = arr.sort() + assert sorted_arr.to_pylist() == [ + {"a": 5, "b": "foo"}, + {"a": 7, "b": "bar"}, + {"a": 7, "b": "car"}, + {"a": 35, "b": "foobar"}, + ] + arr_with_nulls = pa.StructArray.from_arrays([ pa.array([5, 7, 7, 35], type=pa.int64()), pa.array(["foo", "car", "bar", "foobar"]) @@ -3573,12 +3610,23 @@ def check_run_end_encoded_from_arrays_with_type(ree_type=None): check_run_end_encoded(ree_array, run_ends, values, 19, 4, 0) +def check_run_end_encoded_from_typed_arrays(ree_type): + run_ends = [3, 5, 10, 19] + values = [1, 2, 1, 3] + typed_run_ends = pa.array(run_ends, ree_type.run_end_type) + typed_values = pa.array(values, ree_type.value_type) + ree_array = pa.RunEndEncodedArray.from_arrays(typed_run_ends, typed_values) + assert ree_array.type == ree_type + check_run_end_encoded(ree_array, run_ends, values, 19, 4, 0) + + def test_run_end_encoded_from_arrays(): check_run_end_encoded_from_arrays_with_type() for run_end_type in [pa.int16(), pa.int32(), pa.int64()]: for value_type in [pa.uint32(), pa.int32(), pa.uint64(), pa.int64()]: ree_type = pa.run_end_encoded(run_end_type, value_type) check_run_end_encoded_from_arrays_with_type(ree_type) + check_run_end_encoded_from_typed_arrays(ree_type) def test_run_end_encoded_from_buffers(): @@ -3844,6 +3892,7 @@ def test_list_view_flatten(list_array_type, list_type_factory, offset_type): assert arr2.values.equals(arr1) assert arr2.flatten().flatten().equals(arr0) assert arr2.values.values.equals(arr0) + assert arr2.flatten(True).equals(arr0) # test out of order offsets values = [1, 2, 3, 4] @@ -3879,3 +3928,27 @@ def test_list_view_slice(list_view_type): j = sliced_array.offsets[1].as_py() assert sliced_array[0].as_py() == sliced_array.values[i:j].to_pylist() == [4] + + +@pytest.mark.parametrize('numpy_native_dtype', ['u2', 'i4', 'f8']) +def test_swapped_byte_order_fails(numpy_native_dtype): + # ARROW-39129 + + numpy_swapped_dtype = np.dtype(numpy_native_dtype).newbyteorder() + np_arr = np.arange(10, dtype=numpy_swapped_dtype) + + # Primitive type array, type is inferred from the numpy array + with pytest.raises(pa.ArrowNotImplementedError): + pa.array(np_arr) + + # Primitive type array, type is explicitly provided + with pytest.raises(pa.ArrowNotImplementedError): + pa.array(np_arr, type=pa.float64()) + + # List type array + with pytest.raises(pa.ArrowNotImplementedError): + pa.array([np_arr]) + + # Struct type array + with pytest.raises(pa.ArrowNotImplementedError): + pa.StructArray.from_arrays([np_arr], names=['a']) diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index 98cbd920b509b..d7dee1ad05e93 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -152,6 +152,7 @@ def test_option_class_equality(): pc.IndexOptions(pa.scalar(1)), pc.JoinOptions(), pc.ListSliceOptions(0, -1, 1, True), + pc.ListFlattenOptions(recursive=False), pc.MakeStructOptions(["field", "names"], field_nullability=[True, True], field_metadata=[pa.KeyValueMetadata({"a": "1"}), @@ -1344,6 +1345,11 @@ def test_filter_record_batch(): expected = pa.record_batch([pa.array(["a", "e"])], names=["a'"]) assert result.equals(expected) + # GH-38770: mask is chunked array + chunked_mask = pa.chunked_array([[True, False], [None], [False, True]]) + 
result = batch.filter(chunked_mask) + assert result.equals(expected) + result = batch.filter(mask, null_selection_behavior="emit_null") expected = pa.record_batch([pa.array(["a", None, "e"])], names=["a'"]) assert result.equals(expected) diff --git a/python/pyarrow/tests/test_dataset_encryption.py b/python/pyarrow/tests/test_dataset_encryption.py index 2a631db9fc0fa..0d8b4a152ab9f 100644 --- a/python/pyarrow/tests/test_dataset_encryption.py +++ b/python/pyarrow/tests/test_dataset_encryption.py @@ -142,6 +142,18 @@ def test_dataset_encryption_decryption(): assert table.equals(dataset.to_table()) + # set decryption properties for parquet fragment scan options + decryption_properties = crypto_factory.file_decryption_properties( + kms_connection_config, decryption_config) + pq_scan_opts = ds.ParquetFragmentScanOptions( + decryption_properties=decryption_properties + ) + + pformat = pa.dataset.ParquetFileFormat(default_fragment_scan_options=pq_scan_opts) + dataset = ds.dataset("sample_dataset", format=pformat, filesystem=mockfs) + + assert table.equals(dataset.to_table()) + @pytest.mark.skipif( not encryption_unavailable, reason="Parquet Encryption is currently enabled" diff --git a/python/setup.py b/python/setup.py index 6f3dddb29d248..ed2b7961e5fbb 100755 --- a/python/setup.py +++ b/python/setup.py @@ -152,32 +152,20 @@ def initialize_options(self): if not hasattr(sys, 'gettotalrefcount'): self.build_type = 'release' - self.with_azure = strtobool( - os.environ.get('PYARROW_WITH_AZURE', '0')) - self.with_gcs = strtobool( - os.environ.get('PYARROW_WITH_GCS', '0')) - self.with_s3 = strtobool( - os.environ.get('PYARROW_WITH_S3', '0')) - self.with_hdfs = strtobool( - os.environ.get('PYARROW_WITH_HDFS', '0')) - self.with_cuda = strtobool( - os.environ.get('PYARROW_WITH_CUDA', '0')) - self.with_substrait = strtobool( - os.environ.get('PYARROW_WITH_SUBSTRAIT', '0')) - self.with_flight = strtobool( - os.environ.get('PYARROW_WITH_FLIGHT', '0')) - self.with_acero = strtobool( - os.environ.get('PYARROW_WITH_ACERO', '0')) - self.with_dataset = strtobool( - os.environ.get('PYARROW_WITH_DATASET', '0')) - self.with_parquet = strtobool( - os.environ.get('PYARROW_WITH_PARQUET', '0')) - self.with_parquet_encryption = strtobool( - os.environ.get('PYARROW_WITH_PARQUET_ENCRYPTION', '0')) - self.with_orc = strtobool( - os.environ.get('PYARROW_WITH_ORC', '0')) - self.with_gandiva = strtobool( - os.environ.get('PYARROW_WITH_GANDIVA', '0')) + self.with_azure = None + self.with_gcs = None + self.with_s3 = None + self.with_hdfs = None + self.with_cuda = None + self.with_substrait = None + self.with_flight = None + self.with_acero = None + self.with_dataset = None + self.with_parquet = None + self.with_parquet_encryption = None + self.with_orc = None + self.with_gandiva = None + self.generate_coverage = strtobool( os.environ.get('PYARROW_GENERATE_COVERAGE', '0')) self.bundle_arrow_cpp = strtobool( @@ -185,15 +173,6 @@ def initialize_options(self): self.bundle_cython_cpp = strtobool( os.environ.get('PYARROW_BUNDLE_CYTHON_CPP', '0')) - self.with_parquet_encryption = (self.with_parquet_encryption and - self.with_parquet) - - # enforce module dependencies - if self.with_substrait: - self.with_dataset = True - if self.with_dataset: - self.with_acero = True - CYTHON_MODULE_NAMES = [ 'lib', '_fs', @@ -270,23 +249,30 @@ def append_cmake_bool(value, varname): cmake_options.append('-D{0}={1}'.format( varname, 'on' if value else 'off')) + def append_cmake_component(flag, varname): + # only pass this to cmake if the user passes the
--with-component + # flag to setup.py build_ext + if flag is not None: + append_cmake_bool(flag, varname) + if self.cmake_generator: cmake_options += ['-G', self.cmake_generator] - append_cmake_bool(self.with_cuda, 'PYARROW_BUILD_CUDA') - append_cmake_bool(self.with_substrait, 'PYARROW_BUILD_SUBSTRAIT') - append_cmake_bool(self.with_flight, 'PYARROW_BUILD_FLIGHT') - append_cmake_bool(self.with_gandiva, 'PYARROW_BUILD_GANDIVA') - append_cmake_bool(self.with_acero, 'PYARROW_BUILD_ACERO') - append_cmake_bool(self.with_dataset, 'PYARROW_BUILD_DATASET') - append_cmake_bool(self.with_orc, 'PYARROW_BUILD_ORC') - append_cmake_bool(self.with_parquet, 'PYARROW_BUILD_PARQUET') - append_cmake_bool(self.with_parquet_encryption, - 'PYARROW_BUILD_PARQUET_ENCRYPTION') - append_cmake_bool(self.with_azure, 'PYARROW_BUILD_AZURE') - append_cmake_bool(self.with_gcs, 'PYARROW_BUILD_GCS') - append_cmake_bool(self.with_s3, 'PYARROW_BUILD_S3') - append_cmake_bool(self.with_hdfs, 'PYARROW_BUILD_HDFS') + append_cmake_component(self.with_cuda, 'PYARROW_CUDA') + append_cmake_component(self.with_substrait, 'PYARROW_SUBSTRAIT') + append_cmake_component(self.with_flight, 'PYARROW_FLIGHT') + append_cmake_component(self.with_gandiva, 'PYARROW_GANDIVA') + append_cmake_component(self.with_acero, 'PYARROW_ACERO') + append_cmake_component(self.with_dataset, 'PYARROW_DATASET') + append_cmake_component(self.with_orc, 'PYARROW_ORC') + append_cmake_component(self.with_parquet, 'PYARROW_PARQUET') + append_cmake_component(self.with_parquet_encryption, + 'PYARROW_PARQUET_ENCRYPTION') + append_cmake_component(self.with_azure, 'PYARROW_AZURE') + append_cmake_component(self.with_gcs, 'PYARROW_GCS') + append_cmake_component(self.with_s3, 'PYARROW_S3') + append_cmake_component(self.with_hdfs, 'PYARROW_HDFS') + append_cmake_bool(self.bundle_arrow_cpp, 'PYARROW_BUNDLE_ARROW_CPP') append_cmake_bool(self.bundle_cython_cpp, @@ -329,54 +315,8 @@ def append_cmake_bool(value, varname): self._found_names = [] for name in self.CYTHON_MODULE_NAMES: built_path = pjoin(install_prefix, name + ext_suffix) - if not os.path.exists(built_path): - print(f'Did not find {built_path}') - if self._failure_permitted(name): - print(f'Cython module {name} failure permitted') - continue - raise RuntimeError('PyArrow C-extension failed to build:', - os.path.abspath(built_path)) - - self._found_names.append(name) - - def _failure_permitted(self, name): - if name == '_parquet' and not self.with_parquet: - return True - if name == '_parquet_encryption' and not self.with_parquet_encryption: - return True - if name == '_orc' and not self.with_orc: - return True - if name == '_flight' and not self.with_flight: - return True - if name == '_substrait' and not self.with_substrait: - return True - if name == '_azurefs' and not self.with_azure: - return True - if name == '_gcsfs' and not self.with_gcs: - return True - if name == '_s3fs' and not self.with_s3: - return True - if name == '_hdfs' and not self.with_hdfs: - return True - if name == '_dataset' and not self.with_dataset: - return True - if name == '_acero' and not self.with_acero: - return True - if name == '_exec_plan' and not self.with_acero: - return True - if name == '_dataset_orc' and not ( - self.with_orc and self.with_dataset - ): - return True - if name == '_dataset_parquet' and not ( - self.with_parquet and self.with_dataset - ): - return True - if name == '_cuda' and not self.with_cuda: - return True - if name == 'gandiva' and not self.with_gandiva: - return True - return False + if 
os.path.exists(built_path): + self._found_names.append(name) def _get_build_dir(self): # Get the package directory from build_py diff --git a/r/DESCRIPTION b/r/DESCRIPTION index 38cbaa94a3c25..bb4470e29037d 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -1,6 +1,6 @@ Package: arrow Title: Integration to 'Apache' 'Arrow' -Version: 16.0.0.9000 +Version: 16.1.0.9000 Authors@R: c( person("Neal", "Richardson", email = "neal.p.richardson@gmail.com", role = c("aut")), person("Ian", "Cook", email = "ianmcook@gmail.com", role = c("aut")), diff --git a/r/NEWS.md b/r/NEWS.md index 05f934dac68f3..47c4ac1571dad 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -17,7 +17,9 @@ under the License. --> -# arrow 16.0.0.9000 +# arrow 16.1.0.9000 + +# arrow 16.1.0 * R functions that users write that use functions that Arrow supports in dataset queries now can be used in queries too. Previously, only functions that used arithmetic operators worked. For example, `time_hours <- function(mins) mins / 60` worked, but `time_hours_rounded <- function(mins) round(mins / 60)` did not; now both work. These are automatic translations rather than true user-defined functions (UDFs); for UDFs, see `register_scalar_function()`. (#41223) * `summarize()` supports more complex expressions, and correctly handles cases where column names are reused in expressions. diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index 7087a40c4903a..44dfbbcd5c7e7 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -48,10 +48,7 @@ supported_dplyr_methods <- list( group_vars = NULL, group_by_drop_default = NULL, ungroup = NULL, - mutate = c( - "window functions (e.g. things that require aggregation within groups)", - "not currently supported" - ), + mutate = NULL, transmute = NULL, arrange = NULL, rename = NULL, diff --git a/r/R/dplyr-arrange.R b/r/R/dplyr-arrange.R index f91cd14211e0f..c8594c77df000 100644 --- a/r/R/dplyr-arrange.R +++ b/r/R/dplyr-arrange.R @@ -47,6 +47,14 @@ arrange.arrow_dplyr_query <- function(.data, ..., .by_group = FALSE) { msg <- paste("Expression", names(sorts)[i], "not supported in Arrow") return(abandon_ship(call, .data, msg)) } + if (length(mask$.aggregations)) { + # dplyr lets you arrange on e.g. x < mean(x), but we haven't implemented it. + # But we could, the same way it works in mutate() via join, if someone asks. + # Until then, just error. + # TODO: add a test for this + msg <- paste("Expression", format_expr(expr), "not supported in arrange() in Arrow") + return(abandon_ship(call, .data, msg)) + } descs[i] <- x[["desc"]] } .data$arrange_vars <- c(sorts, .data$arrange_vars) diff --git a/r/R/dplyr-eval.R b/r/R/dplyr-eval.R index 3aaa29696b8c8..211c26cecce8c 100644 --- a/r/R/dplyr-eval.R +++ b/r/R/dplyr-eval.R @@ -121,28 +121,9 @@ arrow_not_supported <- function(msg) { } # Create a data mask for evaluating a dplyr expression -arrow_mask <- function(.data, aggregation = FALSE) { +arrow_mask <- function(.data) { f_env <- new_environment(.cache$functions) - if (aggregation) { - # Add the aggregation functions to the environment, and set the enclosing - # environment to the parent frame so that, when called from summarize_eval(), - # they can reference and assign into `..aggregations` defined there. - pf <- parent.frame() - for (f in names(agg_funcs)) { - f_env[[f]] <- agg_funcs[[f]] - environment(f_env[[f]]) <- pf - } - } else { - # Add functions that need to error hard and clear. 
- # Some R functions will still try to evaluate on an Expression - # and return NA with a warning :exploding_head: - fail <- function(...) stop("Not implemented") - for (f in c("mean", "sd")) { - f_env[[f]] <- fail - } - } - # Assign the schema to the expressions schema <- .data$.data$schema walk(.data$selected_columns, ~ (.$schema <- schema)) @@ -156,6 +137,8 @@ arrow_mask <- function(.data, aggregation = FALSE) { # TODO: figure out what rlang::as_data_pronoun does/why we should use it # (because if we do we get `Error: Can't modify the data pronoun` in mutate()) out$.data <- .data$selected_columns + # Add the aggregations list to collect any that get pulled out when evaluating + out$.aggregations <- empty_named_list() out } diff --git a/r/R/dplyr-filter.R b/r/R/dplyr-filter.R index d85fa16af2e71..69decbd76655f 100644 --- a/r/R/dplyr-filter.R +++ b/r/R/dplyr-filter.R @@ -35,48 +35,24 @@ filter.arrow_dplyr_query <- function(.data, ..., .by = NULL, .preserve = FALSE) } # tidy-eval the filter expressions inside an Arrow data_mask - filters <- lapply(expanded_filters, arrow_eval, arrow_mask(out)) - bad_filters <- map_lgl(filters, ~ inherits(., "try-error")) - if (any(bad_filters)) { - # This is similar to abandon_ship() except that the filter eval is - # vectorized, and we apply filters that _did_ work before abandoning ship - # with the rest - expr_labs <- map_chr(expanded_filters[bad_filters], format_expr) - if (query_on_dataset(out)) { - # Abort. We don't want to auto-collect if this is a Dataset because that - # could blow up, too big. - stop( - "Filter expression not supported for Arrow Datasets: ", - oxford_paste(expr_labs, quote = FALSE), - "\nCall collect() first to pull data into R.", - call. = FALSE - ) - } else { - arrow_errors <- map2_chr( - filters[bad_filters], expr_labs, - handle_arrow_not_supported - ) - if (length(arrow_errors) == 1) { - msg <- paste0(arrow_errors, "; ") - } else { - msg <- paste0("* ", arrow_errors, "\n", collapse = "") - } - warning( - msg, "pulling data into R", - immediate. = TRUE, - call. = FALSE - ) - # Set any valid filters first, then collect and then apply the invalid ones in R - out <- dplyr::collect(set_filters(out, filters[!bad_filters])) - if (by$from_by) { - out <- dplyr::ungroup(out) - } - return(dplyr::filter(out, !!!expanded_filters[bad_filters], .by = {{ .by }})) + mask <- arrow_mask(out) + for (expr in expanded_filters) { + filt <- arrow_eval(expr, mask) + if (inherits(filt, "try-error")) { + msg <- handle_arrow_not_supported(filt, format_expr(expr)) + return(abandon_ship(match.call(), .data, msg)) + } + if (length(mask$.aggregations)) { + # dplyr lets you filter on e.g. x < mean(x), but we haven't implemented it. + # But we could, the same way it works in mutate() via join, if someone asks. + # Until then, just error. 
+ # TODO: add a test for this + msg <- paste("Expression", format_expr(expr), "not supported in filter() in Arrow") + return(abandon_ship(match.call(), .data, msg)) } + out <- set_filters(out, filt) } - out <- set_filters(out, filters) - if (by$from_by) { out$group_by_vars <- character() } diff --git a/r/R/dplyr-funcs-agg.R b/r/R/dplyr-funcs-agg.R index ab1df1d2f15a5..c0c4eb3089425 100644 --- a/r/R/dplyr-funcs-agg.R +++ b/r/R/dplyr-funcs-agg.R @@ -17,7 +17,7 @@ # Aggregation functions # -# These all insert into an ..aggregations list (in a parent frame) a list containing: +# These all insert into an .aggregations list in the mask, a list containing: # @param fun string function name # @param data list of 0 or more Expressions # @param options list of function options, as passed to call_function @@ -29,56 +29,56 @@ # you can use list_compute_functions("^hash_") register_bindings_aggregate <- function() { - register_binding_agg("base::sum", function(..., na.rm = FALSE) { + register_binding("base::sum", function(..., na.rm = FALSE) { set_agg( fun = "sum", data = ensure_one_arg(list2(...), "sum"), options = list(skip_nulls = na.rm, min_count = 0L) ) }) - register_binding_agg("base::prod", function(..., na.rm = FALSE) { + register_binding("base::prod", function(..., na.rm = FALSE) { set_agg( fun = "product", data = ensure_one_arg(list2(...), "prod"), options = list(skip_nulls = na.rm, min_count = 0L) ) }) - register_binding_agg("base::any", function(..., na.rm = FALSE) { + register_binding("base::any", function(..., na.rm = FALSE) { set_agg( fun = "any", data = ensure_one_arg(list2(...), "any"), options = list(skip_nulls = na.rm, min_count = 0L) ) }) - register_binding_agg("base::all", function(..., na.rm = FALSE) { + register_binding("base::all", function(..., na.rm = FALSE) { set_agg( fun = "all", data = ensure_one_arg(list2(...), "all"), options = list(skip_nulls = na.rm, min_count = 0L) ) }) - register_binding_agg("base::mean", function(x, na.rm = FALSE) { + register_binding("base::mean", function(x, na.rm = FALSE) { set_agg( fun = "mean", data = list(x), options = list(skip_nulls = na.rm, min_count = 0L) ) }) - register_binding_agg("stats::sd", function(x, na.rm = FALSE, ddof = 1) { + register_binding("stats::sd", function(x, na.rm = FALSE, ddof = 1) { set_agg( fun = "stddev", data = list(x), options = list(skip_nulls = na.rm, min_count = 0L, ddof = ddof) ) }) - register_binding_agg("stats::var", function(x, na.rm = FALSE, ddof = 1) { + register_binding("stats::var", function(x, na.rm = FALSE, ddof = 1) { set_agg( fun = "variance", data = list(x), options = list(skip_nulls = na.rm, min_count = 0L, ddof = ddof) ) }) - register_binding_agg( + register_binding( "stats::quantile", function(x, probs, na.rm = FALSE) { if (length(probs) != 1) { @@ -103,7 +103,7 @@ register_bindings_aggregate <- function() { "approximate quantile (t-digest) is computed" ) ) - register_binding_agg( + register_binding( "stats::median", function(x, na.rm = FALSE) { # TODO: Bind to the Arrow function that returns an exact median and remove @@ -122,28 +122,28 @@ register_bindings_aggregate <- function() { }, notes = "approximate median (t-digest) is computed" ) - register_binding_agg("dplyr::n_distinct", function(..., na.rm = FALSE) { + register_binding("dplyr::n_distinct", function(..., na.rm = FALSE) { set_agg( fun = "count_distinct", data = ensure_one_arg(list2(...), "n_distinct"), options = list(na.rm = na.rm) ) }) - register_binding_agg("dplyr::n", function() { + register_binding("dplyr::n", function() { set_agg( 
fun = "count_all", data = list(), options = list() ) }) - register_binding_agg("base::min", function(..., na.rm = FALSE) { + register_binding("base::min", function(..., na.rm = FALSE) { set_agg( fun = "min", data = ensure_one_arg(list2(...), "min"), options = list(skip_nulls = na.rm, min_count = 0L) ) }) - register_binding_agg("base::max", function(..., na.rm = FALSE) { + register_binding("base::max", function(..., na.rm = FALSE) { set_agg( fun = "max", data = ensure_one_arg(list2(...), "max"), @@ -154,38 +154,38 @@ register_bindings_aggregate <- function() { set_agg <- function(...) { agg_data <- list2(...) - # Find the environment where ..aggregations is stored + # Find the environment where .aggregations is stored target <- find_aggregations_env() - aggs <- get("..aggregations", target) + aggs <- get(".aggregations", target) lapply(agg_data[["data"]], function(expr) { - # If any of the fields referenced in the expression are in ..aggregations, + # If any of the fields referenced in the expression are in .aggregations, # then we can't aggregate over them. # This is mainly for combinations of dataset columns and aggregations, # like sum(x - mean(x)), i.e. window functions. # This will reject (sum(sum(x)) as well, but that's not a useful operation. if (any(expr$field_names_in_expression() %in% names(aggs))) { - # TODO: support in ARROW-13926 arrow_not_supported("aggregate within aggregate expression") } }) - # Record the (fun, data, options) in ..aggregations + # Record the (fun, data, options) in .aggregations # and return a FieldRef pointing to it tmpname <- paste0("..temp", length(aggs)) aggs[[tmpname]] <- agg_data - assign("..aggregations", aggs, envir = target) + assign(".aggregations", aggs, envir = target) Expression$field_ref(tmpname) } find_aggregations_env <- function() { - # Find the environment where ..aggregations is stored, + # Find the environment where .aggregations is stored, # it's in parent.env of something in the call stack - for (f in sys.frames()) { - if (exists("..aggregations", envir = f)) { - return(f) + n <- 1 + while (TRUE) { + if (exists(".aggregations", envir = caller_env(n))) { + return(caller_env(n)) } + n <- n + 1 } - stop("Could not find ..aggregations") } ensure_one_arg <- function(args, fun) { diff --git a/r/R/dplyr-funcs-doc.R b/r/R/dplyr-funcs-doc.R index fda77bca83fc2..7f0627c33d010 100644 --- a/r/R/dplyr-funcs-doc.R +++ b/r/R/dplyr-funcs-doc.R @@ -53,7 +53,7 @@ #' * [`groups()`][dplyr::groups()] #' * [`inner_join()`][dplyr::inner_join()]: the `copy` argument is ignored #' * [`left_join()`][dplyr::left_join()]: the `copy` argument is ignored -#' * [`mutate()`][dplyr::mutate()]: window functions (e.g. things that require aggregation within groups) not currently supported +#' * [`mutate()`][dplyr::mutate()] #' * [`pull()`][dplyr::pull()]: the `name` argument is not supported; returns an R vector by default but this behavior is deprecated and will return an Arrow [ChunkedArray] in a future release. Provide `as_vector = TRUE/FALSE` to control this behavior, or set `options(arrow.pull_as_vector)` globally. 
#' * [`relocate()`][dplyr::relocate()] #' * [`rename()`][dplyr::rename()] diff --git a/r/R/dplyr-funcs.R b/r/R/dplyr-funcs.R index abf2362d0107f..c0eb47e428b7f 100644 --- a/r/R/dplyr-funcs.R +++ b/r/R/dplyr-funcs.R @@ -22,8 +22,8 @@ NULL #' Register compute bindings #' -#' The `register_binding()` and `register_binding_agg()` functions -#' are used to populate a list of functions that operate on (and return) +#' `register_binding()` is used to populate a list of functions that operate on +#' (and return) #' Expressions. These are the basis for the `.data` mask inside dplyr methods. #' #' @section Writing bindings: @@ -40,26 +40,10 @@ NULL #' * Inside your function, you can call any other binding with `call_binding()`. #' #' @param fun_name A string containing a function name in the form `"function"` or -#' `"package::function"`. The package name is currently not used but -#' may be used in the future to allow these types of function calls. -#' @param fun A function or `NULL` to un-register a previous function. +#' `"package::function"`. +#' @param fun A function, or `NULL` to un-register a previous function. #' This function must accept `Expression` objects as arguments and return #' `Expression` objects instead of regular R objects. -#' @param agg_fun An aggregate function or `NULL` to un-register a previous -#' aggregate function. This function must accept `Expression` objects as -#' arguments and return a `list()` with components: -#' - `fun`: string function name -#' - `data`: list of 0 or more `Expression`s -#' - `options`: list of function options, as passed to call_function -#' @param update_cache Update .cache$functions at the time of registration. -#' the default is FALSE because the majority of usage is to register -#' bindings at package load, after which we create the cache once. The -#' reason why .cache$functions is needed in addition to nse_funcs for -#' non-aggregate functions could be revisited...it is currently used -#' as the data mask in mutate, filter, and aggregate (but not -#' summarise) because the data mask has to be a list. -#' @param registry An environment in which the functions should be -#' assigned. #' @param notes string for the docs: note any limitations or differences in #' behavior between the Arrow version and the R function. 
#' @return The previously registered binding or `NULL` if no previously @@ -67,12 +51,10 @@ NULL #' @keywords internal register_binding <- function(fun_name, fun, - registry = nse_funcs, - update_cache = FALSE, notes = character(0)) { unqualified_name <- sub("^.*?:{+}", "", fun_name) - previous_fun <- registry[[unqualified_name]] + previous_fun <- .cache$functions[[unqualified_name]] # if the unqualified name exists in the registry, warn if (!is.null(previous_fun) && !identical(fun, previous_fun)) { @@ -87,58 +69,25 @@ register_binding <- function(fun_name, # register both as `pkg::fun` and as `fun` if `qualified_name` is prefixed # unqualified_name and fun_name will be the same if not prefixed - registry[[unqualified_name]] <- fun - registry[[fun_name]] <- fun - + .cache$functions[[unqualified_name]] <- fun + .cache$functions[[fun_name]] <- fun .cache$docs[[fun_name]] <- notes - - if (update_cache) { - fun_cache <- .cache$functions - fun_cache[[unqualified_name]] <- fun - fun_cache[[fun_name]] <- fun - .cache$functions <- fun_cache - } - invisible(previous_fun) } -unregister_binding <- function(fun_name, registry = nse_funcs, - update_cache = FALSE) { +unregister_binding <- function(fun_name) { unqualified_name <- sub("^.*?:{+}", "", fun_name) - previous_fun <- registry[[unqualified_name]] + previous_fun <- .cache$functions[[unqualified_name]] - rm( - list = unique(c(fun_name, unqualified_name)), - envir = registry, - inherits = FALSE - ) - - if (update_cache) { - fun_cache <- .cache$functions - fun_cache[[unqualified_name]] <- NULL - fun_cache[[fun_name]] <- NULL - .cache$functions <- fun_cache - } + .cache$functions[[unqualified_name]] <- NULL + .cache$functions[[fun_name]] <- NULL invisible(previous_fun) } -#' @rdname register_binding -#' @keywords internal -register_binding_agg <- function(fun_name, - agg_fun, - registry = agg_funcs, - notes = character(0)) { - register_binding(fun_name, agg_fun, registry = registry, notes = notes) -} - # Supports functions and tests that call previously-defined bindings call_binding <- function(fun_name, ...) { - nse_funcs[[fun_name]](...) -} - -call_binding_agg <- function(fun_name, ...) { - agg_funcs[[fun_name]](...) + .cache$functions[[fun_name]](...) } create_binding_cache <- function() { @@ -147,7 +96,7 @@ create_binding_cache <- function() { # Register all available Arrow Compute functions, namespaced as arrow_fun. all_arrow_funs <- list_compute_functions() - arrow_funcs <- set_names( + .cache$functions <- set_names( lapply(all_arrow_funs, function(fun) { force(fun) function(...) Expression$create(fun, ...) 
@@ -155,7 +104,7 @@ create_binding_cache <- function() { paste0("arrow_", all_arrow_funs) ) - # Register bindings into nse_funcs and agg_funcs + # Register bindings into the cache register_bindings_array_function_map() register_bindings_aggregate() register_bindings_conditional() @@ -165,37 +114,17 @@ create_binding_cache <- function() { register_bindings_type() register_bindings_augmented() - # We only create the cache for nse_funcs and not agg_funcs - .cache$functions <- c(as.list(nse_funcs), arrow_funcs) -} - -# environments in the arrow namespace used in the above functions -nse_funcs <- new.env(parent = emptyenv()) -agg_funcs <- new.env(parent = emptyenv()) -.cache <- new.env(parent = emptyenv()) - -# we register 2 versions of the "::" binding - one for use with nse_funcs -# and another one for use with agg_funcs (registered in dplyr-funcs-agg.R) -nse_funcs[["::"]] <- function(lhs, rhs) { - lhs_name <- as.character(substitute(lhs)) - rhs_name <- as.character(substitute(rhs)) + .cache$functions[["::"]] <- function(lhs, rhs) { + lhs_name <- as.character(substitute(lhs)) + rhs_name <- as.character(substitute(rhs)) - fun_name <- paste0(lhs_name, "::", rhs_name) + fun_name <- paste0(lhs_name, "::", rhs_name) - # if we do not have a binding for pkg::fun, then fall back on to the - # regular pkg::fun function - nse_funcs[[fun_name]] %||% asNamespace(lhs_name)[[rhs_name]] + # if we do not have a binding for pkg::fun, then fall back on to the + # regular pkg::fun function + .cache$functions[[fun_name]] %||% asNamespace(lhs_name)[[rhs_name]] + } } -agg_funcs[["::"]] <- function(lhs, rhs) { - lhs_name <- as.character(substitute(lhs)) - rhs_name <- as.character(substitute(rhs)) - - fun_name <- paste0(lhs_name, "::", rhs_name) - - # if we do not have a binding for pkg::fun, then fall back on to the - # nse_funcs (useful when we have a regular function inside an aggregating one) - # and then, if searching nse_funcs fails too, fall back to the - # regular `pkg::fun()` function - agg_funcs[[fun_name]] %||% nse_funcs[[fun_name]] %||% asNamespace(lhs_name)[[rhs_name]] -} +# environment in the arrow namespace used in the above functions +.cache <- new.env(parent = emptyenv()) diff --git a/r/R/dplyr-mutate.R b/r/R/dplyr-mutate.R index 287532dee08a9..f0a8c005676df 100644 --- a/r/R/dplyr-mutate.R +++ b/r/R/dplyr-mutate.R @@ -45,17 +45,11 @@ mutate.arrow_dplyr_query <- function(.data, return(out) } - # Restrict the cases we support for now - has_aggregations <- any(unlist(lapply(exprs, all_funs)) %in% names(agg_funcs)) - if (has_aggregations) { - # ARROW-13926 - # mutate() on a grouped dataset does calculations within groups - # This doesn't matter on scalar ops (arithmetic etc.) but it does - # for things with aggregations (e.g. subtracting the mean) - return(abandon_ship(call, .data, "window functions not currently supported in Arrow")) - } - + # Create a mask with aggregation functions in it + # If there are any aggregations, we will need to compute them and + # join the results back in, for "window functions" like x - mean(x) mask <- arrow_mask(out) + # Evaluate the mutate expressions results <- list() for (i in seq_along(exprs)) { # Iterate over the indices and not the names because names may be repeated @@ -81,6 +75,24 @@ mutate.arrow_dplyr_query <- function(.data, mask[[new_var]] <- mask$.data[[new_var]] <- results[[new_var]] } + if (length(mask$.aggregations)) { + # Make a copy of .data, do the aggregations on it, and then left_join on + # the group_by variables.
+ agg_query <- as_adq(.data) + # These may be computed by .by, make sure they're set + agg_query$group_by_vars <- grv + agg_query$aggregations <- mask$.aggregations + agg_query <- collapse.arrow_dplyr_query(agg_query) + if (length(grv)) { + out <- dplyr::left_join(out, agg_query, by = grv) + } else { + # If there are no group_by vars, add a scalar column to both and join on that + agg_query$selected_columns[["..tempjoin"]] <- Expression$scalar(1L) + out$selected_columns[["..tempjoin"]] <- Expression$scalar(1L) + out <- dplyr::left_join(out, agg_query, by = "..tempjoin") + } + } + old_vars <- names(out$selected_columns) # Note that this is names(exprs) not names(results): # if results$new_var is NULL, that means we are supposed to remove it @@ -91,6 +103,11 @@ mutate.arrow_dplyr_query <- function(.data, out$selected_columns[[new_var]] <- results[[new_var]] } + # Prune any ..temp columns from the result, which would have come from + # .aggregations + temps <- grepl("^\\.\\.temp", names(out$selected_columns)) + out$selected_columns <- out$selected_columns[!temps] + # Deduplicate new_vars and remove NULL columns from new_vars new_vars <- intersect(union(new_vars, grv), names(out$selected_columns)) diff --git a/r/R/dplyr-summarize.R b/r/R/dplyr-summarize.R index 5bb81dc2b34fc..58ca849152a75 100644 --- a/r/R/dplyr-summarize.R +++ b/r/R/dplyr-summarize.R @@ -80,34 +80,32 @@ do_arrow_summarize <- function(.data, ..., .groups = NULL) { # ExecNode), and in the expressions, replace them with FieldRefs so that # further operations can happen (in what will become a ProjectNode that works # on the result of the Aggregate). - # To do this, we create a list in this function scope, and in arrow_mask(), - # and we make sure this environment here is the parent env of the binding - # functions, so that when they receive an expression, they can pull out - # aggregations and insert them into the list, which they can find because it - # is in the parent env. + # To do this, arrow_mask() includes a list called .aggregations, + # and the aggregation functions will pull out those terms and insert into + # that list. # nolint end - ..aggregations <- empty_named_list() - - # We'll collect any transformations after the aggregation here - ..post_mutate <- empty_named_list() - mask <- arrow_mask(.data, aggregation = TRUE) + mask <- arrow_mask(.data) + # We'll collect any transformations after the aggregation here. + # summarize_eval() returns NULL when the outer expression is an aggregation, + # i.e. there is no projection to do after + post_mutate <- empty_named_list() for (i in seq_along(exprs)) { # Iterate over the indices and not the names because names may be repeated # (which overwrites the previous name) name <- names(exprs)[i] - ..post_mutate[[name]] <- summarize_eval(name, exprs[[i]], mask) + post_mutate[[name]] <- summarize_eval(name, exprs[[i]], mask) } # Apply the results to the .data object. # First, the aggregations - .data$aggregations <- ..aggregations + .data$aggregations <- mask$.aggregations # Then collapse the query so that the resulting query object can have # additional operations applied to it out <- collapse.arrow_dplyr_query(.data) - # Now, add the projections in ..post_mutate (if any) - for (post in names(..post_mutate)) { + # Now, add the projections in post_mutate (if any) + for (post in names(post_mutate)) { # One last check: it's possible that an expression like y - mean(y) would # successfully evaluate, but it's not supported. 
It gets transformed to: # nolint start @@ -121,7 +119,7 @@ do_arrow_summarize <- function(.data, ..., .groups = NULL) { # We can tell the expression is invalid if it references fields not in # the schema of the data after summarize(). Evaulating its type will # throw an error if it's invalid. - tryCatch(..post_mutate[[post]]$type(out$.data$schema), error = function(e) { + tryCatch(post_mutate[[post]]$type(out$.data$schema), error = function(e) { msg <- paste( "Expression", as_label(exprs[[post]]), "is not a valid aggregation expression or is" @@ -129,7 +127,7 @@ do_arrow_summarize <- function(.data, ..., .groups = NULL) { arrow_not_supported(msg) }) # If it's valid, add it to the .data object - out$selected_columns[[post]] <- ..post_mutate[[post]] + out$selected_columns[[post]] <- post_mutate[[post]] } # Make sure column order is correct (and also drop ..temp columns) @@ -266,10 +264,10 @@ format_aggregation <- function(x) { # This function evaluates an expression and returns the post-summarize # projection that results, or NULL if there is none because the top-level # expression was an aggregation. Any aggregations are pulled out and collected -# in the ..aggregations list outside this function. +# in the .aggregations list outside this function. summarize_eval <- function(name, quosure, mask) { # Add previous aggregations to the mask, so they can be referenced - for (n in names(get("..aggregations", parent.frame()))) { + for (n in names(mask$.aggregations)) { mask[[n]] <- mask$.data[[n]] <- Expression$field_ref(n) } # Evaluate: @@ -286,14 +284,11 @@ summarize_eval <- function(name, quosure, mask) { # Handle case where outer expr is ..temp field ref. This came from an # aggregation at the top level. So the resulting name should be `name`. # not `..tempN`. Rename the corresponding aggregation. - post_aggs <- get("..aggregations", parent.frame()) result_field_name <- value$field_name - if (result_field_name %in% names(post_aggs)) { + if (result_field_name %in% names(mask$.aggregations)) { # Do this by assigning over `name` in case something else was in `name` - post_aggs[[name]] <- post_aggs[[result_field_name]] - post_aggs[[result_field_name]] <- NULL - # Assign back into the parent environment - assign("..aggregations", post_aggs, parent.frame()) + mask$.aggregations[[name]] <- mask$.aggregations[[result_field_name]] + mask$.aggregations[[result_field_name]] <- NULL # Return NULL because there is no post-mutate projection, it's just # the aggregation return(NULL) diff --git a/r/R/udf.R b/r/R/udf.R index 922095cceba6a..0415fbac3c9fc 100644 --- a/r/R/udf.R +++ b/r/R/udf.R @@ -95,12 +95,7 @@ register_scalar_function <- function(name, fun, in_type, out_type, body(binding_fun) <- expr_substitute(body(binding_fun), sym("name"), name) environment(binding_fun) <- asNamespace("arrow") - register_binding( - name, - binding_fun, - update_cache = TRUE - ) - + register_binding(name, binding_fun) invisible(NULL) } diff --git a/r/man/acero.Rd b/r/man/acero.Rd index ca51ef56334eb..9ef9cd7dda6fb 100644 --- a/r/man/acero.Rd +++ b/r/man/acero.Rd @@ -40,7 +40,7 @@ Table into an R \code{tibble}. \item \code{\link[dplyr:group_data]{groups()}} \item \code{\link[dplyr:mutate-joins]{inner_join()}}: the \code{copy} argument is ignored \item \code{\link[dplyr:mutate-joins]{left_join()}}: the \code{copy} argument is ignored -\item \code{\link[dplyr:mutate]{mutate()}}: window functions (e.g. 
things that require aggregation within groups) not currently supported +\item \code{\link[dplyr:mutate]{mutate()}} \item \code{\link[dplyr:pull]{pull()}}: the \code{name} argument is not supported; returns an R vector by default but this behavior is deprecated and will return an Arrow \link{ChunkedArray} in a future release. Provide \code{as_vector = TRUE/FALSE} to control this behavior, or set \code{options(arrow.pull_as_vector)} globally. \item \code{\link[dplyr:relocate]{relocate()}} \item \code{\link[dplyr:rename]{rename()}} diff --git a/r/man/register_binding.Rd b/r/man/register_binding.Rd index d10cd733bbe9d..b84cde3b8993a 100644 --- a/r/man/register_binding.Rd +++ b/r/man/register_binding.Rd @@ -2,63 +2,28 @@ % Please edit documentation in R/dplyr-funcs.R \name{register_binding} \alias{register_binding} -\alias{register_binding_agg} \title{Register compute bindings} \usage{ -register_binding( - fun_name, - fun, - registry = nse_funcs, - update_cache = FALSE, - notes = character(0) -) - -register_binding_agg( - fun_name, - agg_fun, - registry = agg_funcs, - notes = character(0) -) +register_binding(fun_name, fun, notes = character(0)) } \arguments{ \item{fun_name}{A string containing a function name in the form \code{"function"} or -\code{"package::function"}. The package name is currently not used but -may be used in the future to allow these types of function calls.} +\code{"package::function"}.} -\item{fun}{A function or \code{NULL} to un-register a previous function. +\item{fun}{A function, or \code{NULL} to un-register a previous function. This function must accept \code{Expression} objects as arguments and return \code{Expression} objects instead of regular R objects.} -\item{registry}{An environment in which the functions should be -assigned.} - -\item{update_cache}{Update .cache$functions at the time of registration. -the default is FALSE because the majority of usage is to register -bindings at package load, after which we create the cache once. The -reason why .cache$functions is needed in addition to nse_funcs for -non-aggregate functions could be revisited...it is currently used -as the data mask in mutate, filter, and aggregate (but not -summarise) because the data mask has to be a list.} - \item{notes}{string for the docs: note any limitations or differences in behavior between the Arrow version and the R function.} - -\item{agg_fun}{An aggregate function or \code{NULL} to un-register a previous -aggregate function. This function must accept \code{Expression} objects as -arguments and return a \code{list()} with components: -\itemize{ -\item \code{fun}: string function name -\item \code{data}: list of 0 or more \code{Expression}s -\item \code{options}: list of function options, as passed to call_function -}} } \value{ The previously registered binding or \code{NULL} if no previously registered function existed. } \description{ -The \code{register_binding()} and \code{register_binding_agg()} functions -are used to populate a list of functions that operate on (and return) +\code{register_binding()} is used to populate a list of functions that operate on +(and return) Expressions. These are the basis for the \code{.data} mask inside dplyr methods. 
} \section{Writing bindings}{ diff --git a/r/pkgdown/assets/versions.json b/r/pkgdown/assets/versions.json index 75d179f240515..43f0b3fac62a1 100644 --- a/r/pkgdown/assets/versions.json +++ b/r/pkgdown/assets/versions.json @@ -1,10 +1,10 @@ [ { - "name": "16.0.0.9000 (dev)", + "name": "16.1.0.9000 (dev)", "version": "dev/" }, { - "name": "16.0.0 (release)", + "name": "16.1.0 (release)", "version": "" }, { diff --git a/r/src/extension-impl.cpp b/r/src/extension-impl.cpp index a13b252b2832f..14c771cc98e4f 100644 --- a/r/src/extension-impl.cpp +++ b/r/src/extension-impl.cpp @@ -87,7 +87,9 @@ arrow::Result> RExtensionType::Deserialize( return std::shared_ptr(cloned.release()); } -std::string RExtensionType::ToString() const { +std::string RExtensionType::ToString() const { return ToString(false); } + +std::string RExtensionType::ToString(bool show_metadata) const { arrow::Result result = SafeCallIntoR([&]() { cpp11::environment instance = r6_instance(); cpp11::function instance_ToString(instance["ToString"]); @@ -98,7 +100,11 @@ std::string RExtensionType::ToString() const { // In the event of an error (e.g., we are not on the main thread // and we are not inside RunWithCapturedR()), just call the default method if (!result.ok()) { +#if ARROW_VERSION_MAJOR >= 16 + return ExtensionType::ToString(show_metadata); +#else return ExtensionType::ToString(); +#endif } else { return result.ValueUnsafe(); } diff --git a/r/src/extension.h b/r/src/extension.h index fbd3ad484691a..6e6c6f7c29761 100644 --- a/r/src/extension.h +++ b/r/src/extension.h @@ -52,6 +52,8 @@ class RExtensionType : public arrow::ExtensionType { std::string Serialize() const { return extension_metadata_; } + std::string ToString(bool show_metadata = false) const; + // wrapper for libarrow < 16 std::string ToString() const; cpp11::sexp Convert(const std::shared_ptr& array) const; diff --git a/r/tests/testthat/test-dataset-dplyr.R b/r/tests/testthat/test-dataset-dplyr.R index b8d93841921d7..493eac328e5cd 100644 --- a/r/tests/testthat/test-dataset-dplyr.R +++ b/r/tests/testthat/test-dataset-dplyr.R @@ -163,17 +163,6 @@ See $.data for the source Arrow object", ) }) -test_that("mutate() features not yet implemented", { - ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8())) - expect_error( - ds %>% - group_by(int) %>% - mutate(avg = mean(int)), - "window functions not currently supported in Arrow\nCall collect() first to pull data into R.", - fixed = TRUE - ) -}) - test_that("filter scalar validation doesn't crash (ARROW-7772)", { ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8())) expect_error( @@ -336,7 +325,7 @@ test_that("dplyr method not implemented messages", { # This one is more nuanced expect_error( ds %>% filter(int > 6, dbl > max(dbl)), - "Filter expression not supported for Arrow Datasets: dbl > max(dbl)\nCall collect() first to pull data into R.", + "Expression dbl > max(dbl) not supported in filter() in Arrow\nCall collect() first to pull data into R.", fixed = TRUE ) }) diff --git a/r/tests/testthat/test-dplyr-filter.R b/r/tests/testthat/test-dplyr-filter.R index bf23685362a82..535bcb70c4cab 100644 --- a/r/tests/testthat/test-dplyr-filter.R +++ b/r/tests/testthat/test-dplyr-filter.R @@ -324,13 +324,14 @@ test_that("Filtering with unsupported functions", { filter( nchar(chr, type = "bytes", allowNA = TRUE) == 1, # bad, Arrow msg int > 2, # good - pnorm(dbl) > .99 # bad, opaque + pnorm(dbl) > .99 # bad, opaque, but we'll error on the first one before we get here ) %>% collect(), tbl, - 
warning = '\\* In nchar\\(chr, type = "bytes", allowNA = TRUE\\) == 1, allowNA = TRUE not supported in Arrow -\\* Expression pnorm\\(dbl\\) > 0.99 not supported in Arrow -pulling data into R' + warning = paste( + 'In nchar\\(chr, type = "bytes", allowNA = TRUE\\) == 1,', + "allowNA = TRUE not supported in Arrow; pulling data into R" + ) ) }) diff --git a/r/tests/testthat/test-dplyr-funcs.R b/r/tests/testthat/test-dplyr-funcs.R index 039604a85ee0c..48c5d730f8493 100644 --- a/r/tests/testthat/test-dplyr-funcs.R +++ b/r/tests/testthat/test-dplyr-funcs.R @@ -19,35 +19,25 @@ skip_on_cran() test_that("register_binding()/unregister_binding() works", { - fake_registry <- new.env(parent = emptyenv()) fun1 <- function() NULL fun2 <- function() "Hello" - expect_null(register_binding("some.pkg::some_fun", fun1, fake_registry)) - expect_identical(fake_registry$some_fun, fun1) - expect_identical(fake_registry$`some.pkg::some_fun`, fun1) + expect_null(register_binding("some.pkg::some_fun", fun1)) + expect_identical(.cache$functions$some_fun, fun1) + expect_identical(.cache$functions$`some.pkg::some_fun`, fun1) - expect_identical(unregister_binding("some.pkg::some_fun", fake_registry), fun1) - expect_false("some.pkg::some_fun" %in% names(fake_registry)) - expect_false("some_fun" %in% names(fake_registry)) + expect_identical(unregister_binding("some.pkg::some_fun"), fun1) + expect_false("some.pkg::some_fun" %in% names(.cache$functions)) + expect_false("some_fun" %in% names(.cache$functions)) - expect_null(register_binding("somePkg::some_fun", fun1, fake_registry)) - expect_identical(fake_registry$some_fun, fun1) + expect_null(register_binding("somePkg::some_fun", fun1)) + expect_identical(.cache$functions$some_fun, fun1) expect_warning( - register_binding("some.pkg2::some_fun", fun2, fake_registry), + register_binding("some.pkg2::some_fun", fun2), "A \"some_fun\" binding already exists in the registry and will be overwritten." ) # No warning when an identical function is re-registered - expect_silent(register_binding("some.pkg2::some_fun", fun2, fake_registry)) -}) - -test_that("register_binding_agg() works", { - fake_registry <- new.env(parent = emptyenv()) - fun1 <- function() NULL - - expect_null(register_binding_agg("somePkg::some_fun", fun1, fake_registry)) - expect_identical(fake_registry$some_fun, fun1) - expect_identical(fake_registry$`somePkg::some_fun`, fun1) + expect_silent(register_binding("some.pkg2::some_fun", fun2)) }) diff --git a/r/tests/testthat/test-dplyr-mutate.R b/r/tests/testthat/test-dplyr-mutate.R index 0889fffedd508..71c1e52d33c1d 100644 --- a/r/tests/testthat/test-dplyr-mutate.R +++ b/r/tests/testthat/test-dplyr-mutate.R @@ -378,18 +378,16 @@ test_that("dplyr::mutate's examples", { # The mutate operation may yield different results on grouped # tibbles because the expressions are computed within groups. 
# The following normalises `mass` by the global average: - # TODO(ARROW-13926): support window functions compare_dplyr_binding( .input %>% select(name, mass, species) %>% mutate(mass_norm = mass / mean(mass, na.rm = TRUE)) %>% collect(), - starwars, - warning = "window function" + starwars ) }) -test_that("Can mutate after group_by as long as there are no aggregations", { +test_that("Can mutate after group_by, including with some aggregations", { compare_dplyr_binding( .input %>% select(int, chr) %>% @@ -417,31 +415,31 @@ test_that("Can mutate after group_by as long as there are no aggregations", { collect(), tbl ) - expect_warning( - tbl %>% - Table$create() %>% + compare_dplyr_binding( + .input %>% select(int, chr) %>% group_by(chr) %>% mutate(avg_int = mean(int)) %>% + # Because this silently does a join, the rows can get unsorted + arrange(chr) %>% collect(), - "window functions not currently supported in Arrow; pulling data into R", - fixed = TRUE + tbl ) - expect_warning( - tbl %>% - Table$create() %>% + compare_dplyr_binding( + .input %>% select(mean = int, chr) %>% # rename `int` to `mean` and use `mean(mean)` in `mutate()` to test that # `all_funs()` detects `mean()` despite the collision with a column name group_by(chr) %>% mutate(avg_int = mean(mean)) %>% + # Because this silently does a join, the rows can get unsorted + arrange(chr) %>% collect(), - "window functions not currently supported in Arrow; pulling data into R", - fixed = TRUE + tbl ) }) -test_that("Can mutate with .by argument as long as there are no aggregations", { +test_that("Can mutate with .by argument, even with some aggregations", { compare_dplyr_binding( .input %>% select(int, chr) %>% @@ -479,25 +477,25 @@ test_that("Can mutate with .by argument as long as there are no aggregations", { collect(), tbl ) - expect_warning( - tbl %>% - Table$create() %>% + compare_dplyr_binding( + .input %>% select(int, chr) %>% mutate(avg_int = mean(int), .by = chr) %>% + # Because this silently does a join, the rows can get unsorted + arrange(chr) %>% collect(), - "window functions not currently supported in Arrow; pulling data into R", - fixed = TRUE + tbl ) - expect_warning( - tbl %>% - Table$create() %>% + compare_dplyr_binding( + .input %>% select(mean = int, chr) %>% # rename `int` to `mean` and use `mean(mean)` in `mutate()` to test that # `all_funs()` detects `mean()` despite the collision with a column name mutate(avg_int = mean(mean), .by = chr) %>% + # Because this silently does a join, the rows can get unsorted + arrange(chr) %>% collect(), - "window functions not currently supported in Arrow; pulling data into R", - fixed = TRUE + tbl ) }) @@ -682,7 +680,6 @@ test_that("mutate() and transmute() with namespaced functions", { }) test_that("Can use across() within mutate()", { - # expressions work in the right order compare_dplyr_binding( .input %>% @@ -717,17 +714,15 @@ test_that("Can use across() within mutate()", { example_data ) - # gives the right error with window functions - expect_warning( - arrow_table(example_data) %>% + compare_dplyr_binding( + .input %>% mutate( x = int + 2, across(c("int", "dbl"), list(mean = mean, sd = sd, round)), exp(dbl2) ) %>% collect(), - "window functions not currently supported in Arrow; pulling data into R", - fixed = TRUE + example_data ) }) diff --git a/r/tests/testthat/test-dplyr-summarize.R b/r/tests/testthat/test-dplyr-summarize.R index 87bb5e5fac959..a61ef95bee73d 100644 --- a/r/tests/testthat/test-dplyr-summarize.R +++ b/r/tests/testthat/test-dplyr-summarize.R @@ -337,20 
+337,20 @@ test_that("Functions that take ... but we only accept a single arg", { ) # Now that we've demonstrated that the whole machinery works, let's test - # the agg_funcs directly - expect_error(call_binding_agg("n_distinct"), "n_distinct() with 0 arguments", fixed = TRUE) - expect_error(call_binding_agg("sum"), "sum() with 0 arguments", fixed = TRUE) - expect_error(call_binding_agg("prod"), "prod() with 0 arguments", fixed = TRUE) - expect_error(call_binding_agg("any"), "any() with 0 arguments", fixed = TRUE) - expect_error(call_binding_agg("all"), "all() with 0 arguments", fixed = TRUE) - expect_error(call_binding_agg("min"), "min() with 0 arguments", fixed = TRUE) - expect_error(call_binding_agg("max"), "max() with 0 arguments", fixed = TRUE) - expect_error(call_binding_agg("n_distinct", 1, 2), "Multiple arguments to n_distinct()") - expect_error(call_binding_agg("sum", 1, 2), "Multiple arguments to sum") - expect_error(call_binding_agg("any", 1, 2), "Multiple arguments to any()") - expect_error(call_binding_agg("all", 1, 2), "Multiple arguments to all()") - expect_error(call_binding_agg("min", 1, 2), "Multiple arguments to min()") - expect_error(call_binding_agg("max", 1, 2), "Multiple arguments to max()") + # the agg funcs directly + expect_error(call_binding("n_distinct"), "n_distinct() with 0 arguments", fixed = TRUE) + expect_error(call_binding("sum"), "sum() with 0 arguments", fixed = TRUE) + expect_error(call_binding("prod"), "prod() with 0 arguments", fixed = TRUE) + expect_error(call_binding("any"), "any() with 0 arguments", fixed = TRUE) + expect_error(call_binding("all"), "all() with 0 arguments", fixed = TRUE) + expect_error(call_binding("min"), "min() with 0 arguments", fixed = TRUE) + expect_error(call_binding("max"), "max() with 0 arguments", fixed = TRUE) + expect_error(call_binding("n_distinct", 1, 2), "Multiple arguments to n_distinct()") + expect_error(call_binding("sum", 1, 2), "Multiple arguments to sum") + expect_error(call_binding("any", 1, 2), "Multiple arguments to any()") + expect_error(call_binding("all", 1, 2), "Multiple arguments to all()") + expect_error(call_binding("min", 1, 2), "Multiple arguments to min()") + expect_error(call_binding("max", 1, 2), "Multiple arguments to max()") }) test_that("median()", { diff --git a/r/tests/testthat/test-udf.R b/r/tests/testthat/test-udf.R index 0eb75b1dde6e5..8604dc610a435 100644 --- a/r/tests/testthat/test-udf.R +++ b/r/tests/testthat/test-udf.R @@ -90,7 +90,7 @@ test_that("register_scalar_function() adds a compute function to the registry", int32(), float64(), auto_convert = TRUE ) - on.exit(unregister_binding("times_32", update_cache = TRUE)) + on.exit(unregister_binding("times_32")) expect_true("times_32" %in% names(asNamespace("arrow")$.cache$functions)) expect_true("times_32" %in% list_compute_functions()) @@ -124,7 +124,7 @@ test_that("arrow_scalar_function() with bad return type errors", { int32(), float64() ) - on.exit(unregister_binding("times_32_bad_return_type_array", update_cache = TRUE)) + on.exit(unregister_binding("times_32_bad_return_type_array")) expect_error( call_function("times_32_bad_return_type_array", Array$create(1L)), @@ -137,7 +137,7 @@ test_that("arrow_scalar_function() with bad return type errors", { int32(), float64() ) - on.exit(unregister_binding("times_32_bad_return_type_scalar", update_cache = TRUE)) + on.exit(unregister_binding("times_32_bad_return_type_scalar")) expect_error( call_function("times_32_bad_return_type_scalar", Array$create(1L)), @@ -155,7 +155,7 @@ 
test_that("register_scalar_function() can register multiple kernels", { out_type = function(in_types) in_types[[1]], auto_convert = TRUE ) - on.exit(unregister_binding("times_32", update_cache = TRUE)) + on.exit(unregister_binding("times_32")) expect_equal( call_function("times_32", Scalar$create(1L, int32())), @@ -238,7 +238,7 @@ test_that("user-defined functions work during multi-threaded execution", { float64(), auto_convert = TRUE ) - on.exit(unregister_binding("times_32", update_cache = TRUE)) + on.exit(unregister_binding("times_32")) # check a regular collect() result <- open_dataset(tf_dataset) %>% @@ -271,7 +271,7 @@ test_that("nested exec plans can contain user-defined functions", { float64(), auto_convert = TRUE ) - on.exit(unregister_binding("times_32", update_cache = TRUE)) + on.exit(unregister_binding("times_32")) stream_plan_with_udf <- function() { record_batch(a = 1:1000) %>% @@ -310,7 +310,7 @@ test_that("head() on exec plan containing user-defined functions", { float64(), auto_convert = TRUE ) - on.exit(unregister_binding("times_32", update_cache = TRUE)) + on.exit(unregister_binding("times_32")) result <- record_batch(a = 1:1000) %>% dplyr::mutate(b = times_32(a)) %>% diff --git a/r/tools/nixlibs.R b/r/tools/nixlibs.R index 0af41888b95b7..def4d35f825be 100644 --- a/r/tools/nixlibs.R +++ b/r/tools/nixlibs.R @@ -386,9 +386,7 @@ distro <- function() { out$id <- tolower(out$id) # debian unstable & testing lsb_release `version` don't include numbers but we can map from pretty name if (is.null(out$version) || out$version %in% c("testing", "unstable")) { - if (grepl("bullseye", out$codename)) { - out$short_version <- "11" - } else if (grepl("bookworm", out$codename)) { + if (grepl("bookworm", out$codename)) { out$short_version <- "12" } } else if (out$id == "ubuntu") { diff --git a/r/tools/test-nixlibs.R b/r/tools/test-nixlibs.R index 6996f234ced45..02e822c3420c8 100644 --- a/r/tools/test-nixlibs.R +++ b/r/tools/test-nixlibs.R @@ -158,10 +158,6 @@ test_that("check_allowlist", { }) test_that("find_latest_nightly()", { - skip_if( - getRversion() > "4.4.0", - "long last version components (>8) fail to max on r-devel" - ) tf <- tempfile() tf_uri <- paste0("file://", tf) on.exit(unlink(tf)) diff --git a/r/tools/ubsan.supp b/r/tools/ubsan.supp index ff88cf984136b..34854e79bcbf9 100644 --- a/r/tools/ubsan.supp +++ b/r/tools/ubsan.supp @@ -16,3 +16,4 @@ # under the License. 
vptr:include/c++/8/bits/shared_ptr_base.h +function:cleancall.c \ No newline at end of file diff --git a/r/vignettes/data_wrangling.Rmd b/r/vignettes/data_wrangling.Rmd index 305a91c156eb1..1d074ef0cfedb 100644 --- a/r/vignettes/data_wrangling.Rmd +++ b/r/vignettes/data_wrangling.Rmd @@ -165,33 +165,7 @@ sw2 %>% transmute(name, height, mass, res = residuals(lm(mass ~ height))) ``` -Because window functions are not supported, computing an aggregation like `mean()` on a grouped table or within a rowwise operation like `filter()` is not supported: - -```{r} -sw %>% - select(1:4) %>% - filter(!is.na(hair_color)) %>% - group_by(hair_color) %>% - filter(height < mean(height, na.rm = TRUE)) -``` - -This operation is sometimes referred to as a windowed aggregate and can be accomplished in Arrow by computing the aggregation separately, for example within a join operation: - -```{r} -sw %>% - select(1:4) %>% - filter(!is.na(hair_color)) %>% - left_join( - sw %>% - group_by(hair_color) %>% - summarize(mean_height = mean(height, na.rm = TRUE)) - ) %>% - filter(height < mean_height) %>% - select(!mean_height) %>% - collect() -``` - -Alternatively, [DuckDB](https://www.duckdb.org) supports Arrow natively, so you can pass the `Table` object to DuckDB without paying a performance penalty using the helper function `to_duckdb()` and pass the object back to Arrow with `to_arrow()`: +For some operations, you can use [DuckDB](https://www.duckdb.org). It supports Arrow natively, so you can pass the `Dataset` or query object to DuckDB without paying a performance penalty using the helper function `to_duckdb()` and pass the object back to Arrow with `to_arrow()`: ```{r} sw %>% diff --git a/r/vignettes/developers/writing_bindings.Rmd b/r/vignettes/developers/writing_bindings.Rmd index 443211b3c2b5e..e1ed92105dbc3 100644 --- a/r/vignettes/developers/writing_bindings.Rmd +++ b/r/vignettes/developers/writing_bindings.Rmd @@ -145,11 +145,10 @@ test_that("startsWith behaves identically in dplyr and Arrow", { df <- tibble(x = c("Foo", "bar", "baz", "qux")) compare_dplyr_binding( .input %>% - filter(startsWith(x, "b")) %>% - collect(), + filter(startsWith(x, "b")) %>% + collect(), df ) - }) ``` @@ -197,7 +196,7 @@ As `startsWith()` requires options, direct mapping is not appropriate. If the function cannot be mapped directly, some extra work may be needed to ensure that calling the arrow version of the function results in the same result as calling the R version of the function. In this case, the function will need -adding to the `nse_funcs` function registry. Here is how this might look for +adding to the `.cache$functions` function registry. Here is how this might look for `startsWith()`: ```{r, eval = FALSE} diff --git a/r/vignettes/install.Rmd b/r/vignettes/install.Rmd index cc90c5ff08c60..c7b8251ccc99b 100644 --- a/r/vignettes/install.Rmd +++ b/r/vignettes/install.Rmd @@ -28,35 +28,6 @@ For `gcc`, this generally means version 7 or newer. Most contemporary Linux distributions have a new enough compiler; however, CentOS 7 is a notable exception, as it ships with gcc 4.8. -If you are on CentOS 7, to build arrow you will need to install a newer `devtoolset`, and you'll need to update R's Makevars to define the `CXX17` variables.
This script installs `devtoolset-8` and configures R to be able to use C++17: - -``` -#!/usr/bin/env bash - -yum install -y centos-release-scl -yum install -y devtoolset-8 -# Optional: also install cloud storage dependencies, as described below -yum install -y libcurl-devel openssl-devel - -source /opt/rh/devtoolset-8/enable - -if [ ! `R CMD config CXX17` ]; then - mkdir -p ~/.R - echo "CC = $(which gcc) -fPIC" >> ~/.R/Makevars - echo "CXX17 = $(which g++) -fPIC" >> ~/.R/Makevars - echo "CXX17STD = -std=c++17" >> ~/.R/Makevars - echo "CXX17FLAGS = ${CXX11FLAGS}" >> ~/.R/Makevars -fi -``` - -Note that the C++17 compiler is only required at *build* time. You don't need -to enable the devtoolset every time you load the package. What's more, if you -install a binary package from RStudio Package Manager (see method 1a below), you -do not need to set up any of this. Likewise, if you `R CMD INSTALL --build` -arrow on a CentOS machine with the newer compilers, you can take the binary -package it produces and install it on any other CentOS machine without those -compilers. - ### Libraries Optional support for reading from cloud storage--AWS S3 and @@ -517,10 +488,6 @@ The install script should work everywhere, so if libarrow fails to compile, please [report an issue](https://issues.apache.org/jira/projects/ARROW/issues) so that we can improve the script. -### Known installation issues - -* On CentOS, building the package requires a more modern `devtoolset` than the default system compilers. See "System dependencies" above. - ## Contributing We are constantly working to make the installation process as painless as