diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index 9bd42dbaa0d6..9c4cda5d034d 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -7,3 +7,9 @@ updates:
     open-pull-requests-limit: 10
     target-branch: master
     labels: [auto-dependencies]
+  - package-ecosystem: "github-actions"
+    directory: "/"
+    schedule:
+      interval: "daily"
+    open-pull-requests-limit: 10
+    labels: [auto-dependencies]
diff --git a/.github/workflows/arrow.yml b/.github/workflows/arrow.yml
index bdca76f18bf6..fcc4d2c371b2 100644
--- a/.github/workflows/arrow.yml
+++ b/.github/workflows/arrow.yml
@@ -24,6 +24,9 @@ on:
     branches:
       - master
   pull_request:
+    paths:
+      - arrow/**
+      - .github/**
 
 jobs:
 
@@ -38,7 +41,7 @@ jobs:
       # "1" means line tables only, which is useful for panic tracebacks.
       RUSTFLAGS: "-C debuginfo=1"
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
         with:
           submodules: true
       - name: Setup Rust toolchain
@@ -48,9 +51,9 @@
       - name: Test
         run: |
           cargo test -p arrow
-      - name: Test --features=force_validate,prettyprint
+      - name: Test --features=force_validate,prettyprint,ipc_compression,ffi
         run: |
-          cargo test -p arrow --features=force_validate,prettyprint
+          cargo test -p arrow --features=force_validate,prettyprint,ipc_compression,ffi
       - name: Run examples
         run: |
           # Test arrow examples
@@ -70,7 +73,7 @@
       # "1" means line tables only, which is useful for panic tracebacks.
       RUSTFLAGS: "-C debuginfo=1"
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
         with:
           submodules: true
       - name: Setup Rust toolchain
@@ -104,7 +107,7 @@
       # "1" means line tables only, which is useful for panic tracebacks.
       RUSTFLAGS: "-C debuginfo=1"
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
         with:
           submodules: true
       - name: Setup Rust toolchain
@@ -133,7 +136,7 @@
       # "1" means line tables only, which is useful for panic tracebacks.
       RUSTFLAGS: "-C debuginfo=1"
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
         with:
           submodules: true
       - name: Cache Cargo
@@ -150,8 +153,8 @@
       - name: Build
         run: |
           cd arrow
-          cargo build --no-default-features --features=csv,ipc,simd --target wasm32-unknown-unknown
-          cargo build --no-default-features --features=csv,ipc,simd --target wasm32-wasi
+          cargo build --no-default-features --features=csv,ipc,simd,ffi --target wasm32-unknown-unknown
+          cargo build --no-default-features --features=csv,ipc,simd,ffi --target wasm32-wasi
 
   clippy:
     name: Clippy
@@ -159,7 +162,7 @@
     container:
       image: amd64/rust
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - name: Setup Rust toolchain
         uses: ./.github/actions/setup-builder
         with:
@@ -169,4 +172,4 @@
         rustup component add clippy
       - name: Run clippy
         run: |
-          cargo clippy -p arrow --features=prettyprint,csv,ipc,test_utils --all-targets -- -D warnings
+          cargo clippy -p arrow --features=prettyprint,csv,ipc,test_utils,ffi,ipc_compression --all-targets -- -D warnings
diff --git a/.github/workflows/arrow_flight.yml b/.github/workflows/arrow_flight.yml
index 5e5538121164..86a67ff9a6a4 100644
--- a/.github/workflows/arrow_flight.yml
+++ b/.github/workflows/arrow_flight.yml
@@ -43,7 +43,7 @@ jobs:
       # "1" means line tables only, which is useful for panic tracebacks.
       RUSTFLAGS: "-C debuginfo=1"
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
         with:
           submodules: true
       - name: Setup Rust toolchain
@@ -63,7 +63,7 @@
     container:
       image: amd64/rust
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - name: Setup Rust toolchain
         uses: ./.github/actions/setup-builder
         with:
diff --git a/.github/workflows/cancel.yml b/.github/workflows/cancel.yml
index 1cf7a1356037..a98c8ee5d225 100644
--- a/.github/workflows/cancel.yml
+++ b/.github/workflows/cancel.yml
@@ -16,7 +16,7 @@
 # under the License.
 
 # Attempt to cancel stale workflow runs to save github actions runner time
-name: Cancel stale runs
+name: cancel
 
 on:
   workflow_run:
diff --git a/.github/workflows/comment_bot.yml b/.github/workflows/comment_bot.yml
deleted file mode 100644
index 6ca095328af1..000000000000
--- a/.github/workflows/comment_bot.yml
+++ /dev/null
@@ -1,72 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-name: Comment Bot
-
-on:
-  # TODO(kszucs): support pull_request_review_comment
-  issue_comment:
-    types:
-      - created
-      - edited
-
-jobs:
-  crossbow:
-    name: Listen!
-    if: startsWith(github.event.comment.body, '@github-actions crossbow')
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout Arrow
-        uses: actions/checkout@v2
-        with:
-          repository: apache/arrow
-      - name: Set up Python
-        uses: actions/setup-python@v2
-        with:
-          python-version: 3.8
-      - name: Install Archery and Crossbow dependencies
-        run: pip install -e dev/archery[bot]
-      - name: Handle Github comment event
-        env:
-          ARROW_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          CROSSBOW_GITHUB_TOKEN: ${{ secrets.CROSSBOW_GITHUB_TOKEN }}
-        run: |
-          archery trigger-bot \
-            --event-name ${{ github.event_name }} \
-            --event-payload ${{ github.event_path }}
-
-  rebase:
-    name: "Rebase"
-    if: startsWith(github.event.comment.body, '@github-actions rebase')
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v2
-      - uses: r-lib/actions/pr-fetch@master
-        with:
-          repo-token: ${{ secrets.GITHUB_TOKEN }}
-      - name: Rebase on ${{ github.repository }} master
-        run: |
-          set -ex
-          git config user.name "$(git log -1 --pretty=format:%an)"
-          git config user.email "$(git log -1 --pretty=format:%ae)"
-          git remote add upstream https://github.com/${{ github.repository }}
-          git fetch --unshallow upstream master
-          git rebase upstream/master
-      - uses: r-lib/actions/pr-push@master
-        with:
-          repo-token: ${{ secrets.GITHUB_TOKEN }}
-          args: "--force"
diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml
new file mode 100644
index 000000000000..e688428e187c
--- /dev/null
+++ b/.github/workflows/coverage.yml
@@ -0,0 +1,63 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+name: coverage
+
+# Trigger only on pushes to master, not pull requests
+on:
+  push:
+    branches:
+      - master
+
+jobs:
+
+  coverage:
+    name: Coverage
+    runs-on: ubuntu-latest
+    # Note runs outside of a container
+    # otherwise we get this error:
+    # Failed to run tests: ASLR disable failed: EPERM: Operation not permitted
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          submodules: true
+      - name: Setup Rust toolchain
+        run: |
+          rustup toolchain install stable
+          rustup default stable
+      - name: Install protobuf compiler in /protoc
+        run: |
+          sudo mkdir /protoc
+          sudo chmod a+rwx /protoc
+          cd /protoc
+          curl -LO https://github.com/protocolbuffers/protobuf/releases/download/v21.4/protoc-21.4-linux-x86_64.zip
+          unzip protoc-21.4-linux-x86_64.zip
+      - name: Cache Cargo
+        uses: actions/cache@v3
+        with:
+          path: /home/runner/.cargo
+          key: cargo-coverage-cache3-
+      - name: Run coverage
+        run: |
+          export PATH=$PATH:/protoc/bin
+          rustup toolchain install stable
+          rustup default stable
+          cargo install --version 0.18.2 cargo-tarpaulin
+          cargo tarpaulin --all --out Xml
+      - name: Report coverage
+        continue-on-error: true
+        run: bash <(curl -s https://codecov.io/bash)
diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml
index 21263a9211e1..57dc19482761 100644
--- a/.github/workflows/dev.yml
+++ b/.github/workflows/dev.yml
@@ -15,7 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
-name: Dev
+name: dev
 
 # trigger for all PRs and changes to master
 on:
@@ -34,24 +34,24 @@
     name: Release Audit Tool (RAT)
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - name: Setup Python
-        uses: actions/setup-python@v1
+        uses: actions/setup-python@v4
         with:
           python-version: 3.8
       - name: Audit licenses
         run: ./dev/release/run-rat.sh .
 
   prettier:
-    name: Use prettier to check formatting of markdown documents
+    name: Markdown format
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v2
-      - uses: actions/setup-node@v2
+      - uses: actions/checkout@v3
+      - uses: actions/setup-node@v3
         with:
           node-version: "14"
       - name: Prettier check
         run: |
-          # if you encounter error, try rerun the command below with --write instead of --check
-          # and commit the changes
-          npx prettier@2.3.0 --check {arrow,arrow-flight,dev,integration-testing,parquet}/**/*.md README.md CODE_OF_CONDUCT.md CONTRIBUTING.md
+          # if you encounter error, run the command below and commit the changes
+          npx prettier@2.3.2 --write {arrow,arrow-flight,dev,integration-testing,parquet}/**/*.md README.md CODE_OF_CONDUCT.md CONTRIBUTING.md
+          git diff --exit-code
diff --git a/.github/workflows/dev_pr.yml b/.github/workflows/dev_pr.yml
index 093d376713d8..64f7ecc0039f 100644
--- a/.github/workflows/dev_pr.yml
+++ b/.github/workflows/dev_pr.yml
@@ -15,7 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
-name: Dev PR
+name: dev_pr
 
 # Trigger whenever a PR is changed (title as well as new / changed commits)
 on:
@@ -30,14 +30,14 @@
     name: Process
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
 
       - name: Assign GitHub labels
         if: |
           github.event_name == 'pull_request_target' &&
             (github.event.action == 'opened' ||
              github.event.action == 'synchronize')
-        uses: actions/labeler@2.2.0
+        uses: actions/labeler@v4.0.0
         with:
           repo-token: ${{ secrets.GITHUB_TOKEN }}
           configuration-path: .github/workflows/dev_pr/labeler.yml
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index b3f6d9b61664..5e82d76febe6 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -15,7 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
-name: Docs
+name: docs
 
 # trigger for all PRs and changes to master
 on:
@@ -39,7 +39,7 @@
     env:
       RUSTDOCFLAGS: "-Dwarnings"
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
         with:
           submodules: true
       - name: Install python dev
diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml
index 71ec99d8dce2..81969466ebf4 100644
--- a/.github/workflows/integration.yml
+++ b/.github/workflows/integration.yml
@@ -15,7 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
-name: Integration
+name: integration
 
 # trigger for all PRs that touch certain files and changes to master
 on:
@@ -36,18 +36,18 @@
     runs-on: ubuntu-latest
     steps:
       - name: Checkout Arrow
-        uses: actions/checkout@v2
+        uses: actions/checkout@v3
         with:
           repository: apache/arrow
           submodules: true
           fetch-depth: 0
       - name: Checkout Arrow Rust
-        uses: actions/checkout@v2
+        uses: actions/checkout@v3
         with:
           path: rust
           fetch-depth: 0
       - name: Setup Python
-        uses: actions/setup-python@v3
+        uses: actions/setup-python@v4
        with:
          python-version: 3.8
      - name: Setup Archery
@@ -63,7 +63,7 @@
     matrix:
       rust: [stable]
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
         with:
           submodules: true
       - name: Setup Rust toolchain
@@ -82,7 +82,7 @@
           path: /home/runner/target
           # this key is not equal because maturin uses different compilation flags.
           key: ${{ runner.os }}-${{ matrix.arch }}-target-maturin-cache-${{ matrix.rust }}-
-      - uses: actions/setup-python@v3
+      - uses: actions/setup-python@v4
         with:
           python-version: '3.7'
       - name: Upgrade pip and setuptools
diff --git a/.github/workflows/miri.yaml b/.github/workflows/miri.yaml
index 732f92a1c36a..b4669bbcccc0 100644
--- a/.github/workflows/miri.yaml
+++ b/.github/workflows/miri.yaml
@@ -15,7 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
-name: MIRI
+name: miri
 
 # trigger for all PRs that touch certain files and changes to master
 on:
@@ -32,7 +32,7 @@
     name: MIRI
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
         with:
           submodules: true
       - name: Setup Rust toolchain
diff --git a/.github/workflows/object_store.yml b/.github/workflows/object_store.yml
index bf07a2efaad6..6c81604a96a2 100644
--- a/.github/workflows/object_store.yml
+++ b/.github/workflows/object_store.yml
@@ -36,7 +36,7 @@
     container:
       image: amd64/rust
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - name: Setup Rust toolchain with clippy
         run: |
           rustup toolchain install stable
@@ -44,7 +44,7 @@
           rustup component add clippy
       - name: Run clippy
         run: |
-          cargo clippy -p object_store --all-features
+          cargo clippy -p object_store --all-features -- -D warnings
 
   # test the crate
   linux-test:
@@ -85,7 +85,7 @@
       OBJECT_STORE_BUCKET: test-bucket
 
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
 
       - name: Configure Fake GCS Server (GCP emulation)
         run: |
diff --git a/.github/workflows/parquet.yml b/.github/workflows/parquet.yml
index d8e09f04ba83..e3f66751044f 100644
--- a/.github/workflows/parquet.yml
+++ b/.github/workflows/parquet.yml
@@ -43,7 +43,7 @@ jobs:
       # "1" means line tables only, which is useful for panic tracebacks.
       RUSTFLAGS: "-C debuginfo=1"
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
         with:
           submodules: true
       - name: Setup Rust toolchain
@@ -69,7 +69,7 @@
       # "1" means line tables only, which is useful for panic tracebacks.
       RUSTFLAGS: "-C debuginfo=1"
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
         with:
           submodules: true
       - name: Setup Rust toolchain
@@ -118,7 +118,7 @@
     container:
       image: amd64/rust
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - name: Setup Rust toolchain
         uses: ./.github/actions/setup-builder
         with:
@@ -128,8 +128,4 @@
         rustup component add clippy
       - name: Run clippy
         run: |
-          # Only run clippy for the library at this time,
-          # as there are clippy errors for other targets
-          cargo clippy -p parquet --all-features --lib -- -D warnings
-          # https://github.com/apache/arrow-rs/issues/1254
-          #cargo clippy -p parquet --all-targets --all-features -- -D warnings
+          cargo clippy -p parquet --all-targets --all-features -- -D warnings
diff --git a/.github/workflows/parquet_derive.yml b/.github/workflows/parquet_derive.yml
index f7176498c55d..bd70fc30d1c5 100644
--- a/.github/workflows/parquet_derive.yml
+++ b/.github/workflows/parquet_derive.yml
@@ -44,7 +44,7 @@ jobs:
       # "1" means line tables only, which is useful for panic tracebacks.
       RUSTFLAGS: "-C debuginfo=1"
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
         with:
           submodules: true
       - name: Setup Rust toolchain
@@ -61,7 +61,7 @@
     container:
       image: amd64/rust
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
      - name: Setup Rust toolchain
        uses: ./.github/actions/setup-builder
        with:
diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index 8464a22b6b94..c04d5643b49a 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -15,8 +15,8 @@
 # specific language governing permissions and limitations
 # under the License.
 
-# tests for workspace wide
-name: Rust
+# workspace wide tests
+name: rust
 
 # trigger for all PRs and changes to master
 on:
@@ -28,22 +28,21 @@ on:
 
 jobs:
   # Check workspace wide compile and test with default features for
-  # mac and windows
-  windows-and-macos:
-    name: Test on ${{ matrix.os }} Rust ${{ matrix.rust }}
-    runs-on: ${{ matrix.os }}
-    strategy:
-      matrix:
-        os: [ windows-latest, macos-latest ]
-        rust: [ stable ]
+  # mac
+  macos:
+    name: Test on Mac
+    runs-on: macos-latest
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
         with:
           submodules: true
+      - name: Install protoc with brew
+        run: |
+          brew install protobuf
       - name: Setup Rust toolchain
         run: |
-          rustup toolchain install ${{ matrix.rust }} --no-self-update
-          rustup default ${{ matrix.rust }}
+          rustup toolchain install stable --no-self-update
+          rustup default stable
       - name: Run tests
         shell: bash
         run: |
@@ -52,6 +51,38 @@
           cargo test
 
+
+  # Check workspace wide compile and test with default features for
+  # windows
+  windows:
+    name: Test on Windows
+    runs-on: windows-latest
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          submodules: true
+      - name: Install protobuf compiler in /d/protoc
+        shell: bash
+        run: |
+          mkdir /d/protoc
+          cd /d/protoc
+          curl -LO https://github.com/protocolbuffers/protobuf/releases/download/v21.4/protoc-21.4-win64.zip
+          unzip protoc-21.4-win64.zip
+          export PATH=$PATH:/d/protoc/bin
+          protoc --version
+
+      - name: Setup Rust toolchain
+        run: |
+          rustup toolchain install stable --no-self-update
+          rustup default stable
+      - name: Run tests
+        shell: bash
+        run: |
+          # do not produce debug symbols to keep memory usage down
+          export RUSTFLAGS="-C debuginfo=0"
+          export PATH=$PATH:/d/protoc/bin
+          cargo test
+
 
   # Run cargo fmt for all crates
   lint:
     name: Lint (cargo fmt)
@@ -59,7 +90,7 @@
     container:
       image: amd64/rust
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - name: Setup toolchain
         run: |
           rustup toolchain install stable
@@ -67,33 +98,3 @@
           rustup component add rustfmt
       - name: Run
         run: cargo fmt --all -- --check
-
-  coverage:
-    name: Coverage
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        arch: [ amd64 ]
-        rust: [ stable ]
-    steps:
-      - uses: actions/checkout@v2
-        with:
-          submodules: true
-      - name: Setup Rust toolchain
-        run: |
-          rustup toolchain install ${{ matrix.rust }}
-          rustup default ${{ matrix.rust }}
-      - name: Cache Cargo
-        uses: actions/cache@v3
-        with:
-          path: /home/runner/.cargo
-          key: cargo-coverage-cache3-
-      - name: Run coverage
-        run: |
-          rustup toolchain install stable
-          rustup default stable
-          cargo install --version 0.18.2 cargo-tarpaulin
-          cargo tarpaulin --all --out Xml
-      - name: Report coverage
-        continue-on-error: true
-        run: bash <(curl -s https://codecov.io/bash)
diff --git a/.github_changelog_generator b/.github_changelog_generator
index cc23a6332d60..9a9a84344866 100644
--- a/.github_changelog_generator
+++ b/.github_changelog_generator
@@ -24,5 +24,5 @@ add-sections={"documentation":{"prefix":"**Documentation updates:**","labels":["
 #pull-requests=false
 # so that the component is shown associated with the issue
 issue-line-labels=arrow,parquet,arrow-flight
-exclude-labels=development-process,invalid
+exclude-labels=development-process,invalid,object-store
 breaking_labels=api-change
diff --git a/CHANGELOG-old.md b/CHANGELOG-old.md
index c0049af39b93..25be8961d2d8 100644
--- a/CHANGELOG-old.md
+++ b/CHANGELOG-old.md
@@ -20,6 +20,96 @@
 # Historical Changelog
 
+## [19.0.0](https://github.com/apache/arrow-rs/tree/19.0.0) (2022-07-22)
+
+[Full Changelog](https://github.com/apache/arrow-rs/compare/18.0.0...19.0.0)
+
+**Breaking changes:**
+
+- Rename `DecimalArray``/DecimalBuilder` to `Decimal128Array`/`Decimal128Builder` [\#2101](https://github.com/apache/arrow-rs/issues/2101) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Change builder `append` methods to be infallible where possible [\#2103](https://github.com/apache/arrow-rs/pull/2103) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann))
+- Return reference from `UnionArray::child` \(\#2035\) [\#2099](https://github.com/apache/arrow-rs/pull/2099) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold))
+- Remove `preserve_order` feature from `serde_json` dependency \(\#2095\) [\#2098](https://github.com/apache/arrow-rs/pull/2098) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold))
+- Rename `weekday` and `weekday0` kernels to to `num_days_from_monday` and `num_days_since_sunday` [\#2066](https://github.com/apache/arrow-rs/pull/2066) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb))
+- Remove `null_count` from `write_batch_with_statistics` [\#2047](https://github.com/apache/arrow-rs/pull/2047) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold))
+
+**Implemented enhancements:**
+
+- Use `total_cmp` from std [\#2130](https://github.com/apache/arrow-rs/issues/2130) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Permit parallel fetching of column chunks in `ParquetRecordBatchStream` [\#2110](https://github.com/apache/arrow-rs/issues/2110) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)]
+- The `GenericBinaryBuilder` should use buffer builders directly. [\#2104](https://github.com/apache/arrow-rs/issues/2104) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Pass `generate_decimal256_case` arrow integration test [\#2093](https://github.com/apache/arrow-rs/issues/2093) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Rename `weekday` and `weekday0` kernels to to `num_days_from_monday` and `days_since_sunday` [\#2065](https://github.com/apache/arrow-rs/issues/2065) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Improve performance of `filter_dict` [\#2062](https://github.com/apache/arrow-rs/issues/2062) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Improve performance of `set_bits` [\#2060](https://github.com/apache/arrow-rs/issues/2060) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Lazily materialize the null buffer builder of `BooleanBuilder` [\#2058](https://github.com/apache/arrow-rs/issues/2058) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- `BooleanArray::from_iter` should omit validity buffer if all values are valid [\#2055](https://github.com/apache/arrow-rs/issues/2055) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- FFI\_ArrowSchema should set `DICTIONARY_ORDERED` flag if a field's dictionary is ordered [\#2049](https://github.com/apache/arrow-rs/issues/2049) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Support `peek_next_page()` and `skip_next_page` in `SerializedPageReader` [\#2043](https://github.com/apache/arrow-rs/issues/2043) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)]
+- Support FFI / C Data Interface for `MapType` [\#2037](https://github.com/apache/arrow-rs/issues/2037) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- The `DecimalArrayBuilder` should use `FixedSizedBinaryBuilder` [\#2026](https://github.com/apache/arrow-rs/issues/2026) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Enable `serialized_reader` read specific Page by passing row ranges. [\#1976](https://github.com/apache/arrow-rs/issues/1976) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)]
+
+**Fixed bugs:**
+
+- `type_id` and `value_offset` are incorrect for sliced `UnionArray` [\#2086](https://github.com/apache/arrow-rs/issues/2086) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Boolean `take` kernel does not handle null indices correctly [\#2057](https://github.com/apache/arrow-rs/issues/2057) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Don't double-count nulls in `write_batch_with_statistics` [\#2046](https://github.com/apache/arrow-rs/issues/2046) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)]
+- Parquet Writer Ignores Statistics specification in `WriterProperties` [\#2014](https://github.com/apache/arrow-rs/issues/2014) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)]
+
+**Documentation updates:**
+
+- Improve docstrings + examples for `as_primitive_array` cast functions [\#2114](https://github.com/apache/arrow-rs/pull/2114) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb))
+
+**Closed issues:**
+
+- Why does `serde_json` specify the `preserve_order` feature in `arrow` package [\#2095](https://github.com/apache/arrow-rs/issues/2095) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Support `skip_values` in DictionaryDecoder [\#2079](https://github.com/apache/arrow-rs/issues/2079) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)]
+- Support skip\_values in ColumnValueDecoderImpl [\#2078](https://github.com/apache/arrow-rs/issues/2078) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)]
+- Support `skip_values` in `ByteArrayColumnValueDecoder` [\#2072](https://github.com/apache/arrow-rs/issues/2072) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)]
+- Several `Builder::append` methods returning results even though they are infallible [\#2071](https://github.com/apache/arrow-rs/issues/2071)
+- Improve formatting of logical plans containing subqueries [\#2059](https://github.com/apache/arrow-rs/issues/2059)
+- Return reference from `UnionArray::child` [\#2035](https://github.com/apache/arrow-rs/issues/2035)
+- support write page index [\#1777](https://github.com/apache/arrow-rs/issues/1777) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)]
+
+**Merged pull requests:**
+
+- Use `total_cmp` from std [\#2131](https://github.com/apache/arrow-rs/pull/2131) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan))
+- fix clippy [\#2124](https://github.com/apache/arrow-rs/pull/2124) ([alamb](https://github.com/alamb))
+- Fix logical merge conflict: `match` arms have incompatible types [\#2121](https://github.com/apache/arrow-rs/pull/2121) ([alamb](https://github.com/alamb))
+- Update `GenericBinaryBuilder` to use buffer builders directly. [\#2117](https://github.com/apache/arrow-rs/pull/2117) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670))
+- Simplify null mask preservation in parquet reader [\#2116](https://github.com/apache/arrow-rs/pull/2116) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold))
+- Add get\_byte\_ranges method to AsyncFileReader trait [\#2115](https://github.com/apache/arrow-rs/pull/2115) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([thinkharderdev](https://github.com/thinkharderdev))
+- add test for skip\_values in DictionaryDecoder and fix it [\#2105](https://github.com/apache/arrow-rs/pull/2105) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Ted-Jiang](https://github.com/Ted-Jiang))
+- Define Decimal128Builder and Decimal128Array [\#2102](https://github.com/apache/arrow-rs/pull/2102) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya))
+- Support skip\_values in DictionaryDecoder [\#2100](https://github.com/apache/arrow-rs/pull/2100) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([thinkharderdev](https://github.com/thinkharderdev))
+- Pass generate\_decimal256\_case integration test, add `DataType::Decimal256` [\#2094](https://github.com/apache/arrow-rs/pull/2094) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya))
+- `DecimalBuilder` should use `FixedSizeBinaryBuilder` [\#2092](https://github.com/apache/arrow-rs/pull/2092) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670))
+- Array writer indirection [\#2091](https://github.com/apache/arrow-rs/pull/2091) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold))
+- Remove doc hidden from GenericColumnReader [\#2090](https://github.com/apache/arrow-rs/pull/2090) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold))
+- Support skip\_values in ColumnValueDecoderImpl [\#2089](https://github.com/apache/arrow-rs/pull/2089) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([thinkharderdev](https://github.com/thinkharderdev))
+- type\_id and value\_offset are incorrect for sliced UnionArray [\#2087](https://github.com/apache/arrow-rs/pull/2087) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya))
+- Add IPC truncation test case for StructArray [\#2083](https://github.com/apache/arrow-rs/pull/2083) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya))
+- Improve performance of set\_bits by using copy\_from\_slice instead of setting individual bytes [\#2077](https://github.com/apache/arrow-rs/pull/2077) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann))
+- Support skip\_values in ByteArrayColumnValueDecoder [\#2076](https://github.com/apache/arrow-rs/pull/2076) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Ted-Jiang](https://github.com/Ted-Jiang))
+- Lazily materialize the null buffer builder of boolean builder [\#2073](https://github.com/apache/arrow-rs/pull/2073) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670))
+- Fix windows CI \(\#2069\) [\#2070](https://github.com/apache/arrow-rs/pull/2070) ([tustvold](https://github.com/tustvold))
+- Test utf8\_validation checks char boundaries [\#2068](https://github.com/apache/arrow-rs/pull/2068) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold))
+- feat\(compute\): Support doy \(day of year\) for temporal [\#2067](https://github.com/apache/arrow-rs/pull/2067) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ovr](https://github.com/ovr))
+- Support nullable indices in boolean take kernel and some optimizations [\#2064](https://github.com/apache/arrow-rs/pull/2064) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann))
+- Improve performance of filter\_dict [\#2063](https://github.com/apache/arrow-rs/pull/2063) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya))
+- Ignore null buffer when creating ArrayData if null count is zero [\#2056](https://github.com/apache/arrow-rs/pull/2056) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann))
+- feat\(compute\): Support week0 \(PostgreSQL behaviour\) for temporal [\#2052](https://github.com/apache/arrow-rs/pull/2052) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ovr](https://github.com/ovr))
+- Set DICTIONARY\_ORDERED flag for FFI\_ArrowSchema [\#2050](https://github.com/apache/arrow-rs/pull/2050) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya))
+- Generify parquet write path \(\#1764\) [\#2045](https://github.com/apache/arrow-rs/pull/2045) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold))
+- Support peek\_next\_page\(\) and skip\_next\_page in serialized\_reader. [\#2044](https://github.com/apache/arrow-rs/pull/2044) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Ted-Jiang](https://github.com/Ted-Jiang))
+- Support MapType in FFI [\#2042](https://github.com/apache/arrow-rs/pull/2042) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya))
+- Add support of converting `FixedSizeBinaryArray` to `DecimalArray` [\#2041](https://github.com/apache/arrow-rs/pull/2041) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670))
+- Truncate IPC record batch [\#2040](https://github.com/apache/arrow-rs/pull/2040) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya))
+- Refine the List builder [\#2034](https://github.com/apache/arrow-rs/pull/2034) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670))
+- Add more tests of RecordReader Batch Size Edge Cases \(\#2025\) [\#2032](https://github.com/apache/arrow-rs/pull/2032) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold))
+- Add support for adding intervals to dates [\#2031](https://github.com/apache/arrow-rs/pull/2031) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([avantgardnerio](https://github.com/avantgardnerio))
+
 ## [18.0.0](https://github.com/apache/arrow-rs/tree/18.0.0) (2022-07-08)
 
 [Full Changelog](https://github.com/apache/arrow-rs/compare/17.0.0...18.0.0)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index a9ca0d911016..87f67015f22e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -19,95 +19,155 @@
 
 # Changelog
 
-## [19.0.0](https://github.com/apache/arrow-rs/tree/19.0.0) (2022-07-22)
+## [20.0.0](https://github.com/apache/arrow-rs/tree/20.0.0) (2022-08-05)
 
-[Full Changelog](https://github.com/apache/arrow-rs/compare/18.0.0...19.0.0)
+[Full Changelog](https://github.com/apache/arrow-rs/compare/19.0.0...20.0.0)
 
 **Breaking changes:**
 
-- Rename `DecimalArray``/DecimalBuilder` to `Decimal128Array`/`Decimal128Builder` [\#2101](https://github.com/apache/arrow-rs/issues/2101) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
-- Change builder `append` methods to be infallible where possible [\#2103](https://github.com/apache/arrow-rs/pull/2103) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann))
-- Return reference from `UnionArray::child` \(\#2035\) [\#2099](https://github.com/apache/arrow-rs/pull/2099) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold))
-- Remove `preserve_order` feature from `serde_json` dependency \(\#2095\) [\#2098](https://github.com/apache/arrow-rs/pull/2098) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold))
-- Rename `weekday` and `weekday0` kernels to to `num_days_from_monday` and `num_days_since_sunday` [\#2066](https://github.com/apache/arrow-rs/pull/2066) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb))
-- Remove `null_count` from `write_batch_with_statistics` [\#2047](https://github.com/apache/arrow-rs/pull/2047) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold))
+- Add more const evaluation for `GenericBinaryArray` and `GenericListArray`: add `PREFIX` and data type constructor [\#2327](https://github.com/apache/arrow-rs/pull/2327) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670))
+- Make FFI support optional, change APIs to be `safe` \(\#2302\) [\#2303](https://github.com/apache/arrow-rs/pull/2303) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold))
+- Remove `test_utils` from default features \(\#2298\) [\#2299](https://github.com/apache/arrow-rs/pull/2299) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold))
+- Rename `DataType::Decimal` to `DataType::Decimal128` [\#2229](https://github.com/apache/arrow-rs/pull/2229) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya))
+- Add `Decimal128Iter` and `Decimal256Iter` and do maximum precision/scale check [\#2140](https://github.com/apache/arrow-rs/pull/2140) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya))
 
 **Implemented enhancements:**
 
-- Use `total_cmp` from std [\#2130](https://github.com/apache/arrow-rs/issues/2130) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
-- Permit parallel fetching of column chunks in `ParquetRecordBatchStream` [\#2110](https://github.com/apache/arrow-rs/issues/2110) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)]
-- The `GenericBinaryBuilder` should use buffer builders directly. [\#2104](https://github.com/apache/arrow-rs/issues/2104) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
-- Pass `generate_decimal256_case` arrow integration test [\#2093](https://github.com/apache/arrow-rs/issues/2093) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
-- Rename `weekday` and `weekday0` kernels to to `num_days_from_monday` and `days_since_sunday` [\#2065](https://github.com/apache/arrow-rs/issues/2065) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
-- Improve performance of `filter_dict` [\#2062](https://github.com/apache/arrow-rs/issues/2062) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
-- Improve performance of `set_bits` [\#2060](https://github.com/apache/arrow-rs/issues/2060) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
-- Lazily materialize the null buffer builder of `BooleanBuilder` [\#2058](https://github.com/apache/arrow-rs/issues/2058) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
-- `BooleanArray::from_iter` should omit validity buffer if all values are valid [\#2055](https://github.com/apache/arrow-rs/issues/2055) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
-- FFI\_ArrowSchema should set `DICTIONARY_ORDERED` flag if a field's dictionary is ordered [\#2049](https://github.com/apache/arrow-rs/issues/2049) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
-- Support `peek_next_page()` and `skip_next_page` in `SerializedPageReader` [\#2043](https://github.com/apache/arrow-rs/issues/2043) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)]
-- Support FFI / C Data Interface for `MapType` [\#2037](https://github.com/apache/arrow-rs/issues/2037) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
-- The `DecimalArrayBuilder` should use `FixedSizedBinaryBuilder` [\#2026](https://github.com/apache/arrow-rs/issues/2026) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
-- Enable `serialized_reader` read specific Page by passing row ranges. [\#1976](https://github.com/apache/arrow-rs/issues/1976) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)]
+- Add the constant data type constructors for `ListArray` [\#2311](https://github.com/apache/arrow-rs/issues/2311) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Update `FlightSqlService` trait to pass session info along [\#2308](https://github.com/apache/arrow-rs/issues/2308) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)]
+- Optimize `take_bits` for non-null indices [\#2306](https://github.com/apache/arrow-rs/issues/2306) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Make FFI support optional via Feature Flag `ffi` [\#2302](https://github.com/apache/arrow-rs/issues/2302) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Mark `ffi::ArrowArray::try_new` is safe [\#2301](https://github.com/apache/arrow-rs/issues/2301) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Remove test\_utils from default arrow-rs features [\#2298](https://github.com/apache/arrow-rs/issues/2298) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Remove `JsonEqual` trait [\#2296](https://github.com/apache/arrow-rs/issues/2296) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Move `with_precision_and_scale` to `Decimal` array traits [\#2291](https://github.com/apache/arrow-rs/issues/2291) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Improve readability and maybe performance of string --\> numeric/time/date/timetamp cast kernels [\#2285](https://github.com/apache/arrow-rs/issues/2285) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Add vectorized unpacking for 8, 16, and 64 bit integers [\#2276](https://github.com/apache/arrow-rs/issues/2276) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)]
+- Use initial capacity for interner hashmap [\#2273](https://github.com/apache/arrow-rs/issues/2273) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Impl FromIterator for Decimal256Array [\#2248](https://github.com/apache/arrow-rs/issues/2248) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Separate `ArrayReader::next_batch`with `ArrayReader::read_records` and `ArrayReader::consume_batch` [\#2236](https://github.com/apache/arrow-rs/issues/2236) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)]
+- Rename `DataType::Decimal` to `DataType::Decimal128` [\#2228](https://github.com/apache/arrow-rs/issues/2228) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Automatically Grow Parquet BitWriter Buffer [\#2226](https://github.com/apache/arrow-rs/issues/2226) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)]
+- Add `append_option` support to `Decimal128Builder` and `Decimal256Builder` [\#2224](https://github.com/apache/arrow-rs/issues/2224) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Split the `FixedSizeBinaryArray` and `FixedSizeListArray` from `array_binary.rs` and `array_list.rs` [\#2217](https://github.com/apache/arrow-rs/issues/2217) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Don't `Box` Values in `PrimitiveDictionaryBuilder` [\#2215](https://github.com/apache/arrow-rs/issues/2215) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Use BitChunks in equal\_bits [\#2186](https://github.com/apache/arrow-rs/issues/2186) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Implement `Hash` for `Schema` [\#2182](https://github.com/apache/arrow-rs/issues/2182) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- read decimal data type from parquet file with binary physical type [\#2159](https://github.com/apache/arrow-rs/issues/2159) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)]
+- The `GenericStringBuilder` should use `GenericBinaryBuilder` [\#2156](https://github.com/apache/arrow-rs/issues/2156) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Update Rust version to 1.62 [\#2143](https://github.com/apache/arrow-rs/issues/2143) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)]
+- Check precision and scale against maximum value when constructing `Decimal128` and `Decimal256` [\#2139](https://github.com/apache/arrow-rs/issues/2139) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Use `ArrayAccessor` in `Decimal128Iter` and `Decimal256Iter` [\#2138](https://github.com/apache/arrow-rs/issues/2138) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Use `ArrayAccessor` and `FromIterator` in Cast Kernels [\#2137](https://github.com/apache/arrow-rs/issues/2137) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Add `TypedDictionaryArray` for more ergonomic interaction with `DictionaryArray` [\#2136](https://github.com/apache/arrow-rs/issues/2136) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Use `ArrayAccessor` in Comparison Kernels [\#2135](https://github.com/apache/arrow-rs/issues/2135) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Support `peek_next_page()` and s`kip_next_page` in `InMemoryColumnChunkReader` [\#2129](https://github.com/apache/arrow-rs/issues/2129) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)]
+- Lazily materialize the null buffer builder for all array builders. [\#2125](https://github.com/apache/arrow-rs/issues/2125) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Do value validation for `Decimal256` [\#2112](https://github.com/apache/arrow-rs/issues/2112) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Support `skip_def_levels` for `ColumnLevelDecoder` [\#2107](https://github.com/apache/arrow-rs/issues/2107) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)]
+- Add integration test for scan rows with selection [\#2106](https://github.com/apache/arrow-rs/issues/2106) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)]
+- Support for casting from Utf8/String to `Time32` / `Time64` [\#2053](https://github.com/apache/arrow-rs/issues/2053) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Update prost and tonic related crates [\#2268](https://github.com/apache/arrow-rs/pull/2268) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([carols10cents](https://github.com/carols10cents))
 
 **Fixed bugs:**
 
-- `type_id` and `value_offset` are incorrect for sliced `UnionArray` [\#2086](https://github.com/apache/arrow-rs/issues/2086) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
-- Boolean `take` kernel does not handle null indices correctly [\#2057](https://github.com/apache/arrow-rs/issues/2057) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
-- Don't double-count nulls in `write_batch_with_statistics` [\#2046](https://github.com/apache/arrow-rs/issues/2046) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)]
-- Parquet Writer Ignores Statistics specification in `WriterProperties` [\#2014](https://github.com/apache/arrow-rs/issues/2014) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)]
+- temporal conversion functions cannot work on negative input properly [\#2325](https://github.com/apache/arrow-rs/issues/2325) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- IPC writer should truncate string array with all empty string [\#2312](https://github.com/apache/arrow-rs/issues/2312) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Error order for comparing `Decimal128` or `Decimal256` [\#2256](https://github.com/apache/arrow-rs/issues/2256) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Fix maximum and minimum for decimal values for precision greater than 38 [\#2246](https://github.com/apache/arrow-rs/issues/2246) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- `IntervalMonthDayNanoType::make_value()` does not match C implementation [\#2234](https://github.com/apache/arrow-rs/issues/2234) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- `FlightSqlService` trait does not allow `impl`s to do handshake [\#2210](https://github.com/apache/arrow-rs/issues/2210) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)]
+- `EnabledStatistics::None` not working [\#2185](https://github.com/apache/arrow-rs/issues/2185) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)]
+- Boolean ArrayData Equality Incorrect Slice Handling [\#2184](https://github.com/apache/arrow-rs/issues/2184) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
+- Publicly export MapFieldNames [\#2118](https://github.com/apache/arrow-rs/issues/2118) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
 
 **Documentation updates:**
 
-- Improve docstrings + examples for `as_primitive_array` cast functions [\#2114](https://github.com/apache/arrow-rs/pull/2114) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb))
+- Update instructions on How to join the slack \#arrow-rust channel -- or maybe try to switch to discord?? [\#2192](https://github.com/apache/arrow-rs/issues/2192)
+- \[Minor\] Improve arrow and parquet READMEs, document parquet feature flags [\#2324](https://github.com/apache/arrow-rs/pull/2324) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb))
+
+**Performance improvements:**
+
+- Improve speed of writing string dictionaries to parquet by skipping a copy\(\#1764\) [\#2322](https://github.com/apache/arrow-rs/pull/2322) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold))
 
 **Closed issues:**
 
-- Why does `serde_json` specify the `preserve_order` feature in `arrow` package [\#2095](https://github.com/apache/arrow-rs/issues/2095) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
-- Support `skip_values` in DictionaryDecoder [\#2079](https://github.com/apache/arrow-rs/issues/2079) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)]
-- Support skip\_values in ColumnValueDecoderImpl [\#2078](https://github.com/apache/arrow-rs/issues/2078) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)]
-- Support `skip_values` in `ByteArrayColumnValueDecoder` [\#2072](https://github.com/apache/arrow-rs/issues/2072) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)]
-- Several `Builder::append` methods returning results even though they are infallible [\#2071](https://github.com/apache/arrow-rs/issues/2071)
-- Improve formatting of logical plans containing subqueries [\#2059](https://github.com/apache/arrow-rs/issues/2059)
-- Return reference from `UnionArray::child` [\#2035](https://github.com/apache/arrow-rs/issues/2035)
-- support write page index [\#1777](https://github.com/apache/arrow-rs/issues/1777) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)]
+- Fix wrong logic in calculate\_row\_count when skipping values [\#2328](https://github.com/apache/arrow-rs/issues/2328) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)]
+- Support filter for parquet data type [\#2126](https://github.com/apache/arrow-rs/issues/2126) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)]
+- Make skip value in ByteArrayDecoderDictionary avoid decoding [\#2088](https://github.com/apache/arrow-rs/issues/2088) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)]
 
 **Merged pull requests:**
 
-- Use `total_cmp` from std [\#2131](https://github.com/apache/arrow-rs/pull/2131) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan))
-- fix clippy [\#2124](https://github.com/apache/arrow-rs/pull/2124) ([alamb](https://github.com/alamb))
-- Fix logical merge conflict: `match` arms have incompatible types [\#2121](https://github.com/apache/arrow-rs/pull/2121) ([alamb](https://github.com/alamb))
-- Update `GenericBinaryBuilder` to use buffer builders directly. [\#2117](https://github.com/apache/arrow-rs/pull/2117) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670))
-- Simplify null mask preservation in parquet reader [\#2116](https://github.com/apache/arrow-rs/pull/2116) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold))
-- Add get\_byte\_ranges method to AsyncFileReader trait [\#2115](https://github.com/apache/arrow-rs/pull/2115) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([thinkharderdev](https://github.com/thinkharderdev))
-- add test for skip\_values in DictionaryDecoder and fix it [\#2105](https://github.com/apache/arrow-rs/pull/2105) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Ted-Jiang](https://github.com/Ted-Jiang))
-- Define Decimal128Builder and Decimal128Array [\#2102](https://github.com/apache/arrow-rs/pull/2102) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya))
-- Support skip\_values in DictionaryDecoder [\#2100](https://github.com/apache/arrow-rs/pull/2100) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([thinkharderdev](https://github.com/thinkharderdev))
-- Pass generate\_decimal256\_case integration test, add `DataType::Decimal256` [\#2094](https://github.com/apache/arrow-rs/pull/2094) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya))
-- `DecimalBuilder` should use `FixedSizeBinaryBuilder` [\#2092](https://github.com/apache/arrow-rs/pull/2092) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670))
-- Array writer indirection [\#2091](https://github.com/apache/arrow-rs/pull/2091) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold))
-- Remove doc hidden from GenericColumnReader [\#2090](https://github.com/apache/arrow-rs/pull/2090) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold))
-- Support skip\_values in ColumnValueDecoderImpl [\#2089](https://github.com/apache/arrow-rs/pull/2089) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([thinkharderdev](https://github.com/thinkharderdev))
-- type\_id and value\_offset are incorrect for sliced UnionArray [\#2087](https://github.com/apache/arrow-rs/pull/2087) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya))
-- Add IPC truncation test case for StructArray [\#2083](https://github.com/apache/arrow-rs/pull/2083) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya))
-- Improve performance of set\_bits by using copy\_from\_slice instead of setting individual bytes [\#2077](https://github.com/apache/arrow-rs/pull/2077) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann))
-- Support skip\_values in ByteArrayColumnValueDecoder [\#2076](https://github.com/apache/arrow-rs/pull/2076) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Ted-Jiang](https://github.com/Ted-Jiang))
-- Lazily materialize the null buffer builder of boolean builder [\#2073](https://github.com/apache/arrow-rs/pull/2073) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670))
-- Fix windows CI \(\#2069\) [\#2070](https://github.com/apache/arrow-rs/pull/2070) ([tustvold](https://github.com/tustvold))
-- Test utf8\_validation checks char boundaries [\#2068](https://github.com/apache/arrow-rs/pull/2068) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold))
-- feat\(compute\): Support doy \(day of year\) for temporal [\#2067](https://github.com/apache/arrow-rs/pull/2067) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ovr](https://github.com/ovr))
-- Support nullable indices in boolean take kernel and some optimizations [\#2064](https://github.com/apache/arrow-rs/pull/2064) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann))
-- Improve performance of filter\_dict [\#2063](https://github.com/apache/arrow-rs/pull/2063) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya))
-- Ignore null buffer when creating ArrayData if null count is zero [\#2056](https://github.com/apache/arrow-rs/pull/2056) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([jhorstmann](https://github.com/jhorstmann))
-- feat\(compute\): Support week0 \(PostgreSQL behaviour\) for temporal [\#2052](https://github.com/apache/arrow-rs/pull/2052) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([ovr](https://github.com/ovr))
-- Set DICTIONARY\_ORDERED flag for FFI\_ArrowSchema [\#2050](https://github.com/apache/arrow-rs/pull/2050) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya))
-- Generify parquet write path \(\#1764\) [\#2045](https://github.com/apache/arrow-rs/pull/2045) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold))
-- Support peek\_next\_page\(\) and skip\_next\_page in serialized\_reader. [\#2044](https://github.com/apache/arrow-rs/pull/2044) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Ted-Jiang](https://github.com/Ted-Jiang))
-- Support MapType in FFI [\#2042](https://github.com/apache/arrow-rs/pull/2042) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya))
-- Add support of converting `FixedSizeBinaryArray` to `DecimalArray` [\#2041](https://github.com/apache/arrow-rs/pull/2041) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670))
-- Truncate IPC record batch [\#2040](https://github.com/apache/arrow-rs/pull/2040) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya))
-- Refine the List builder [\#2034](https://github.com/apache/arrow-rs/pull/2034) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670))
-- Add more tests of RecordReader Batch Size Edge Cases \(\#2025\) [\#2032](https://github.com/apache/arrow-rs/pull/2032) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold))
-- Add support for adding intervals to dates [\#2031](https://github.com/apache/arrow-rs/pull/2031) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([avantgardnerio](https://github.com/avantgardnerio))
+- fix: Fix skip error in calculate\_row\_count. [\#2329](https://github.com/apache/arrow-rs/pull/2329) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Ted-Jiang](https://github.com/Ted-Jiang))
+- temporal conversion functions should work on negative input properly [\#2326](https://github.com/apache/arrow-rs/pull/2326) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya))
+- Increase DeltaBitPackEncoder miniblock size to 64 for 64-bit integers \(\#2282\) [\#2319](https://github.com/apache/arrow-rs/pull/2319) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold))
+- Remove JsonEqual [\#2317](https://github.com/apache/arrow-rs/pull/2317) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya))
+- fix: IPC writer should truncate string array with all empty string [\#2314](https://github.com/apache/arrow-rs/pull/2314) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([JasonLi-cn](https://github.com/JasonLi-cn))
+- Pass pull `Request` to `FlightSqlService` `impl`s [\#2309](https://github.com/apache/arrow-rs/pull/2309) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([avantgardnerio](https://github.com/avantgardnerio))
+- Speedup take\_boolean / take\_bits for non-null indices \(~4 - 5x speedup\) [\#2307](https://github.com/apache/arrow-rs/pull/2307) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([Dandandan](https://github.com/Dandandan))
+- Add typed dictionary \(\#2136\) [\#2297](https://github.com/apache/arrow-rs/pull/2297)
[[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold))
+- \[Minor\] Improve types shown in cast error messages [\#2295](https://github.com/apache/arrow-rs/pull/2295) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb))
+- Move `with_precision_and_scale` to `BasicDecimalArray` trait [\#2292](https://github.com/apache/arrow-rs/pull/2292) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya))
+- Replace the `fn get_data_type` by `const DATA_TYPE` in BinaryArray and StringArray [\#2289](https://github.com/apache/arrow-rs/pull/2289) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670))
+- Clean up string casts and improve performance [\#2284](https://github.com/apache/arrow-rs/pull/2284) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb))
+- \[Minor\] Add tests for temporal cast error paths [\#2283](https://github.com/apache/arrow-rs/pull/2283) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb))
+- Add unpack8, unpack16, unpack64 \(\#2276\) ~10-50% faster [\#2278](https://github.com/apache/arrow-rs/pull/2278) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold))
+- Fix bugs in the `from_list` function. [\#2277](https://github.com/apache/arrow-rs/pull/2277) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670))
+- fix: use signed comparator to compare decimal128 and decimal256 [\#2275](https://github.com/apache/arrow-rs/pull/2275) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([liukun4515](https://github.com/liukun4515))
+- Use initial capacity for interner hashmap [\#2272](https://github.com/apache/arrow-rs/pull/2272) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Dandandan](https://github.com/Dandandan))
+- Remove fallibility from parquet RleEncoder \(\#2226\) [\#2259](https://github.com/apache/arrow-rs/pull/2259) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold))
+- Fix escaped like wildcards in `like_utf8` / `nlike_utf8` kernels [\#2258](https://github.com/apache/arrow-rs/pull/2258) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([daniel-martinez-maqueda-sap](https://github.com/daniel-martinez-maqueda-sap))
+- Add tests for reading nested decimal arrays from parquet [\#2254](https://github.com/apache/arrow-rs/pull/2254) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold))
+- feat: Implement string cast operations for Time32 and Time64 [\#2251](https://github.com/apache/arrow-rs/pull/2251) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([stuartcarnie](https://github.com/stuartcarnie))
+- move `FixedSizeList` to `array_fixed_size_list.rs` [\#2250](https://github.com/apache/arrow-rs/pull/2250) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)]
([HaoYang670](https://github.com/HaoYang670)) +- Impl FromIterator for Decimal256Array [\#2247](https://github.com/apache/arrow-rs/pull/2247) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Fix max and min value for decimal precision greater than 38 [\#2245](https://github.com/apache/arrow-rs/pull/2245) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Make `Schema::fields` and `Schema::metadata` `pub` \(public\) [\#2239](https://github.com/apache/arrow-rs/pull/2239) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- \[Minor\] Improve Schema metadata mismatch error [\#2238](https://github.com/apache/arrow-rs/pull/2238) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([alamb](https://github.com/alamb)) +- Separate ArrayReader::next\_batch with read\_records and consume\_batch [\#2237](https://github.com/apache/arrow-rs/pull/2237) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Ted-Jiang](https://github.com/Ted-Jiang)) +- Update `IntervalMonthDayNanoType::make_value()` to conform to specifications [\#2235](https://github.com/apache/arrow-rs/pull/2235) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([avantgardnerio](https://github.com/avantgardnerio)) +- Disable value validation for Decimal256 case [\#2232](https://github.com/apache/arrow-rs/pull/2232) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Automatically grow parquet BitWriter \(\#2226\) \(~10% faster\) [\#2231](https://github.com/apache/arrow-rs/pull/2231) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Only trigger `arrow` CI on changes to arrow [\#2227](https://github.com/apache/arrow-rs/pull/2227) ([alamb](https://github.com/alamb)) +- Add append\_option support to decimal builders [\#2225](https://github.com/apache/arrow-rs/pull/2225) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([bphillips-exos](https://github.com/bphillips-exos)) +- Optimized writing of byte array to parquet \(\#1764\) \(2x faster\) [\#2221](https://github.com/apache/arrow-rs/pull/2221) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Increase test coverage of ArrowWriter [\#2220](https://github.com/apache/arrow-rs/pull/2220) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Update instructions on how to join the Slack channel [\#2219](https://github.com/apache/arrow-rs/pull/2219) ([HaoYang670](https://github.com/HaoYang670)) +- Move `FixedSizeBinaryArray` to `array_fixed_size_binary.rs` [\#2218](https://github.com/apache/arrow-rs/pull/2218) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Avoid boxing in PrimitiveDictionaryBuilder [\#2216](https://github.com/apache/arrow-rs/pull/2216) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- remove redundant CI benchmark check, cleanups 
[\#2212](https://github.com/apache/arrow-rs/pull/2212) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([alamb](https://github.com/alamb)) +- Update `FlightSqlService` trait to proxy handshake [\#2211](https://github.com/apache/arrow-rs/pull/2211) [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([avantgardnerio](https://github.com/avantgardnerio)) +- parquet: export json api with `serde_json` feature name [\#2209](https://github.com/apache/arrow-rs/pull/2209) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([flisky](https://github.com/flisky)) +- Cleanup record skipping logic and tests \(\#2158\) [\#2199](https://github.com/apache/arrow-rs/pull/2199) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Use BitChunks in equal\_bits [\#2194](https://github.com/apache/arrow-rs/pull/2194) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Fix disabling parquet statistics \(\#2185\) [\#2191](https://github.com/apache/arrow-rs/pull/2191) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Change CI names to match crate names [\#2189](https://github.com/apache/arrow-rs/pull/2189) ([alamb](https://github.com/alamb)) +- Fix offset handling in boolean\_equal \(\#2184\) [\#2187](https://github.com/apache/arrow-rs/pull/2187) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Implement `Hash` for `Schema` [\#2183](https://github.com/apache/arrow-rs/pull/2183) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([crepererum](https://github.com/crepererum)) +- Let the `StringBuilder` use `BinaryBuilder` [\#2181](https://github.com/apache/arrow-rs/pull/2181) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Use ArrayAccessor and FromIterator in Cast Kernels [\#2169](https://github.com/apache/arrow-rs/pull/2169) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Split most arrow specific CI checks into their own workflows \(reduce common CI time to 21 minutes\) [\#2168](https://github.com/apache/arrow-rs/pull/2168) ([alamb](https://github.com/alamb)) +- Remove another attempt to cache target directory in action.yaml [\#2167](https://github.com/apache/arrow-rs/pull/2167) ([alamb](https://github.com/alamb)) +- Run actions on push to master, pull requests [\#2166](https://github.com/apache/arrow-rs/pull/2166) ([alamb](https://github.com/alamb)) +- Break parquet\_derive and arrow\_flight tests into their own workflows [\#2165](https://github.com/apache/arrow-rs/pull/2165) ([alamb](https://github.com/alamb)) +- \[minor\] use type aliases refine code. 
[\#2161](https://github.com/apache/arrow-rs/pull/2161) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Ted-Jiang](https://github.com/Ted-Jiang)) +- parquet reader: Support reading decimals from parquet `BYTE_ARRAY` type [\#2160](https://github.com/apache/arrow-rs/pull/2160) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([liukun4515](https://github.com/liukun4515)) +- Add integration test for scan rows with selection [\#2158](https://github.com/apache/arrow-rs/pull/2158) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Ted-Jiang](https://github.com/Ted-Jiang)) +- Use ArrayAccessor in Comparison Kernels [\#2157](https://github.com/apache/arrow-rs/pull/2157) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Implement `peek\_next\_page` and `skip\_next\_page` for `InMemoryColumnCh… [\#2155](https://github.com/apache/arrow-rs/pull/2155) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([thinkharderdev](https://github.com/thinkharderdev)) +- Avoid decoding unneeded values in ByteArrayDecoderDictionary [\#2154](https://github.com/apache/arrow-rs/pull/2154) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([thinkharderdev](https://github.com/thinkharderdev)) +- Only run integration tests when `arrow` changes [\#2152](https://github.com/apache/arrow-rs/pull/2152) ([alamb](https://github.com/alamb)) +- Break out docs CI job to its own github action [\#2151](https://github.com/apache/arrow-rs/pull/2151) ([alamb](https://github.com/alamb)) +- Do not pretend to cache rust build artifacts, speed up CI by ~20% [\#2150](https://github.com/apache/arrow-rs/pull/2150) ([alamb](https://github.com/alamb)) +- Update rust version to 1.62 [\#2144](https://github.com/apache/arrow-rs/pull/2144) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] [[arrow-flight](https://github.com/apache/arrow-rs/labels/arrow-flight)] ([Ted-Jiang](https://github.com/Ted-Jiang)) +- Make MapFieldNames public \(\#2118\) [\#2134](https://github.com/apache/arrow-rs/pull/2134) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Add ArrayAccessor trait, remove duplication in array iterators \(\#1948\) [\#2133](https://github.com/apache/arrow-rs/pull/2133) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tustvold](https://github.com/tustvold)) +- Lazily materialize the null buffer builder for all array builders. 
[\#2127](https://github.com/apache/arrow-rs/pull/2127) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([HaoYang670](https://github.com/HaoYang670)) +- Faster parquet DictEncoder \(~20%\) [\#2123](https://github.com/apache/arrow-rs/pull/2123) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([tustvold](https://github.com/tustvold)) +- Add validation for Decimal256 [\#2113](https://github.com/apache/arrow-rs/pull/2113) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([viirya](https://github.com/viirya)) +- Support skip\_def\_levels for ColumnLevelDecoder [\#2111](https://github.com/apache/arrow-rs/pull/2111) [[parquet](https://github.com/apache/arrow-rs/labels/parquet)] ([Ted-Jiang](https://github.com/Ted-Jiang)) +- Donate `object_store` code from object\_store\_rs to arrow-rs [\#2081](https://github.com/apache/arrow-rs/pull/2081) ([alamb](https://github.com/alamb)) +- Improve `validate_utf8` performance [\#2048](https://github.com/apache/arrow-rs/pull/2048) [[arrow](https://github.com/apache/arrow-rs/labels/arrow)] ([tfeda](https://github.com/tfeda)) diff --git a/README.md b/README.md index 08385fb6c15d..987826b32216 100644 --- a/README.md +++ b/README.md @@ -51,7 +51,11 @@ You can find more details about each crate in their respective READMEs. The `dev@arrow.apache.org` mailing list serves as the core communication channel for the Arrow community. Instructions for signing up and links to the archives can be found at the [Arrow Community](https://arrow.apache.org/community/) page. All major announcements and communications happen there. The Rust Arrow community also uses the official [ASF Slack](https://s.apache.org/slack-invite) for informal discussions and coordination. This is -a great place to meet other contributors and get guidance on where to contribute. Join us in the `#arrow-rust` channel. +a great place to meet other contributors and get guidance on where to contribute. Join us in the `#arrow-rust` channel and feel free to ask for an invite via: + +1. the `dev@arrow.apache.org` mailing list +2. the [GitHub Discussions][discussions] +3. the [Discord channel](https://discord.gg/YAb2TdazKQ) Unlike other parts of the Arrow ecosystem, the Rust implementation uses [GitHub issues][issues] as the system of record for new features and bug fixes and this plays a critical role in the release process. @@ -68,3 +72,4 @@ There is more information in the [contributing] guide. 
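Several of the merged PRs above introduce new public APIs. For example, the typed dictionary support ([\#2297](https://github.com/apache/arrow-rs/pull/2297)) together with the `ArrayAccessor` trait ([\#2133](https://github.com/apache/arrow-rs/pull/2133)) lets dictionary values be read without manually chasing keys into the child array. A minimal sketch, assuming the `downcast_dict` / `TypedDictionaryArray` names those PRs added (not taken verbatim from the changelog):

```rust
use arrow::array::{ArrayAccessor, DictionaryArray, StringArray};
use arrow::datatypes::Int32Type;

fn main() {
    // Dictionary-encoded strings: each row stores a key into a values array.
    let dict: DictionaryArray<Int32Type> = vec!["a", "b", "a"].into_iter().collect();

    // downcast_dict wraps the array in a TypedDictionaryArray, which
    // implements ArrayAccessor and resolves keys to values on access.
    let typed = dict.downcast_dict::<StringArray>().unwrap();
    assert_eq!(typed.value(0), "a");
    assert_eq!(typed.value(1), "b");
}
```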
[datafusion-readme]: https://github.com/apache/arrow-datafusion/blob/master/README.md [ballista-readme]: https://github.com/apache/arrow-ballista/blob/master/README.md [issues]: https://github.com/apache/arrow-rs/issues +[discussions]: https://github.com/apache/arrow-rs/discussions diff --git a/arrow-flight/Cargo.toml b/arrow-flight/Cargo.toml index 1400d913e797..92c6aac3d082 100644 --- a/arrow-flight/Cargo.toml +++ b/arrow-flight/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-flight" description = "Apache Arrow Flight" -version = "19.0.0" +version = "20.0.0" edition = "2021" rust-version = "1.62" authors = ["Apache Arrow "] @@ -27,13 +27,13 @@ repository = "https://github.com/apache/arrow-rs" license = "Apache-2.0" [dependencies] -arrow = { path = "../arrow", version = "19.0.0", default-features = false, features = ["ipc"] } +arrow = { path = "../arrow", version = "20.0.0", default-features = false, features = ["ipc"] } base64 = { version = "0.13", default-features = false } -tonic = { version = "0.7", default-features = false, features = ["transport", "codegen", "prost"] } +tonic = { version = "0.8", default-features = false, features = ["transport", "codegen", "prost"] } bytes = { version = "1", default-features = false } -prost = { version = "0.10", default-features = false } -prost-types = { version = "0.10.0", default-features = false, optional = true } -prost-derive = { version = "0.10", default-features = false } +prost = { version = "0.11", default-features = false } +prost-types = { version = "0.11.0", default-features = false, optional = true } +prost-derive = { version = "0.11", default-features = false } tokio = { version = "1.0", default-features = false, features = ["macros", "rt", "rt-multi-thread"] } futures = { version = "0.3", default-features = false, features = ["alloc"]} @@ -44,7 +44,7 @@ flight-sql-experimental = ["prost-types"] [dev-dependencies] [build-dependencies] -tonic-build = { version = "0.7", default-features = false, features = ["transport", "prost"] } +tonic-build = { version = "0.8", default-features = false, features = ["transport", "prost"] } # Pin specific version of the tonic-build dependencies to avoid auto-generated # (and checked in) arrow.flight.protocol.rs from changing proc-macro2 = { version = ">1.0.30", default-features = false } diff --git a/arrow-flight/README.md b/arrow-flight/README.md index cbe10d9bec74..db9b75377d29 100644 --- a/arrow-flight/README.md +++ b/arrow-flight/README.md @@ -27,7 +27,7 @@ Add this to your Cargo.toml: ```toml [dependencies] -arrow-flight = "19.0.0" +arrow-flight = "20.0.0" ``` Apache Arrow Flight is a gRPC based protocol for exchanging Arrow data between processes. See the blog post [Introducing Apache Arrow Flight: A Framework for Fast Data Transport](https://arrow.apache.org/blog/2019/10/13/introducing-arrow-flight/) for more information. diff --git a/arrow-flight/examples/flight_sql_server.rs b/arrow-flight/examples/flight_sql_server.rs index 7e2a759c5590..aa0d407113d7 100644 --- a/arrow-flight/examples/flight_sql_server.rs +++ b/arrow-flight/examples/flight_sql_server.rs @@ -16,7 +16,7 @@ // under the License. 
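The manifest changes above move `arrow-flight` to tonic 0.8 and prost 0.11 for the 20.0.0 release. Because tonic and prost types appear in `arrow-flight`'s public API, downstream crates generally have to upgrade the two in lockstep; a plausible downstream `Cargo.toml` fragment (version pins illustrative):

```toml
[dependencies]
arrow-flight = "20.0.0"
# Keep tonic/prost aligned with the versions arrow-flight 20.0.0 was
# generated against; mixing prost 0.10 and 0.11 types will not compile.
tonic = "0.8"
prost = "0.11"
```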
 use arrow_flight::sql::{ActionCreatePreparedStatementResult, SqlInfo};
-use arrow_flight::{FlightData, HandshakeRequest, HandshakeResponse};
+use arrow_flight::{Action, FlightData, HandshakeRequest, HandshakeResponse, Ticket};
 use futures::Stream;
 use std::pin::Pin;
 use tonic::transport::Server;
@@ -93,179 +93,253 @@ impl FlightSqlService for FlightSqlServiceImpl {
     async fn get_flight_info_statement(
         &self,
         _query: CommandStatementQuery,
-        _request: FlightDescriptor,
+        _request: Request<FlightDescriptor>,
     ) -> Result<Response<FlightInfo>, Status> {
-        Err(Status::unimplemented("Not yet implemented"))
+        Err(Status::unimplemented(
+            "get_flight_info_statement not implemented",
+        ))
     }
+
     async fn get_flight_info_prepared_statement(
         &self,
         _query: CommandPreparedStatementQuery,
-        _request: FlightDescriptor,
+        _request: Request<FlightDescriptor>,
     ) -> Result<Response<FlightInfo>, Status> {
-        Err(Status::unimplemented("Not yet implemented"))
+        Err(Status::unimplemented(
+            "get_flight_info_prepared_statement not implemented",
+        ))
     }
+
     async fn get_flight_info_catalogs(
         &self,
         _query: CommandGetCatalogs,
-        _request: FlightDescriptor,
+        _request: Request<FlightDescriptor>,
     ) -> Result<Response<FlightInfo>, Status> {
-        Err(Status::unimplemented("Not yet implemented"))
+        Err(Status::unimplemented(
+            "get_flight_info_catalogs not implemented",
+        ))
     }
+
     async fn get_flight_info_schemas(
         &self,
         _query: CommandGetDbSchemas,
-        _request: FlightDescriptor,
+        _request: Request<FlightDescriptor>,
     ) -> Result<Response<FlightInfo>, Status> {
-        Err(Status::unimplemented("Not yet implemented"))
+        Err(Status::unimplemented(
+            "get_flight_info_schemas not implemented",
+        ))
     }
+
     async fn get_flight_info_tables(
         &self,
         _query: CommandGetTables,
-        _request: FlightDescriptor,
+        _request: Request<FlightDescriptor>,
     ) -> Result<Response<FlightInfo>, Status> {
-        Err(Status::unimplemented("Not yet implemented"))
+        Err(Status::unimplemented(
+            "get_flight_info_tables not implemented",
+        ))
     }
+
     async fn get_flight_info_table_types(
         &self,
         _query: CommandGetTableTypes,
-        _request: FlightDescriptor,
+        _request: Request<FlightDescriptor>,
     ) -> Result<Response<FlightInfo>, Status> {
-        Err(Status::unimplemented("Not yet implemented"))
+        Err(Status::unimplemented(
+            "get_flight_info_table_types not implemented",
+        ))
     }
+
     async fn get_flight_info_sql_info(
         &self,
         _query: CommandGetSqlInfo,
-        _request: FlightDescriptor,
+        _request: Request<FlightDescriptor>,
     ) -> Result<Response<FlightInfo>, Status> {
-        Err(Status::unimplemented("Not yet implemented"))
+        Err(Status::unimplemented(
+            "get_flight_info_sql_info not implemented",
+        ))
     }
+
     async fn get_flight_info_primary_keys(
         &self,
         _query: CommandGetPrimaryKeys,
-        _request: FlightDescriptor,
+        _request: Request<FlightDescriptor>,
     ) -> Result<Response<FlightInfo>, Status> {
-        Err(Status::unimplemented("Not yet implemented"))
+        Err(Status::unimplemented(
+            "get_flight_info_primary_keys not implemented",
+        ))
     }
+
     async fn get_flight_info_exported_keys(
         &self,
         _query: CommandGetExportedKeys,
-        _request: FlightDescriptor,
+        _request: Request<FlightDescriptor>,
     ) -> Result<Response<FlightInfo>, Status> {
-        Err(Status::unimplemented("Not yet implemented"))
+        Err(Status::unimplemented(
+            "get_flight_info_exported_keys not implemented",
+        ))
     }
+
     async fn get_flight_info_imported_keys(
         &self,
         _query: CommandGetImportedKeys,
-        _request: FlightDescriptor,
+        _request: Request<FlightDescriptor>,
     ) -> Result<Response<FlightInfo>, Status> {
-        Err(Status::unimplemented("Not yet implemented"))
+        Err(Status::unimplemented(
+            "get_flight_info_imported_keys not implemented",
+        ))
     }
+
     async fn get_flight_info_cross_reference(
         &self,
         _query: CommandGetCrossReference,
-        _request: FlightDescriptor,
+        _request: Request<FlightDescriptor>,
     ) -> Result<Response<FlightInfo>, Status> {
-        Err(Status::unimplemented("Not yet implemented"))
+        Err(Status::unimplemented(
+            "get_flight_info_cross_reference not implemented",
+        ))
     }
+    // do_get
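// Context for the hunks above: every get_flight_info_* handler now receives
// the full tonic Request<FlightDescriptor> instead of the bare descriptor,
// so implementations can inspect gRPC metadata. A hypothetical handler body,
// illustrative only and not part of the upstream example:
//
//     async fn get_flight_info_statement(
//         &self,
//         query: CommandStatementQuery,
//         request: Request<FlightDescriptor>,
//     ) -> Result<Response<FlightInfo>, Status> {
//         // e.g. read an auth token forwarded by the client
//         let _token = request.metadata().get("authorization");
//         Err(Status::unimplemented(format!(
//             "query not supported: {}",
//             query.query
//         )))
//     }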
     async fn do_get_statement(
         &self,
         _ticket: TicketStatementQuery,
+        _request: Request<Ticket>,
     ) -> Result<Response<<Self as FlightService>::DoGetStream>, Status> {
-        Err(Status::unimplemented("Not yet implemented"))
+        Err(Status::unimplemented("do_get_statement not implemented"))
     }
     async fn do_get_prepared_statement(
         &self,
         _query: CommandPreparedStatementQuery,
+        _request: Request<Ticket>,
     ) -> Result<Response<<Self as FlightService>::DoGetStream>, Status> {
-        Err(Status::unimplemented("Not yet implemented"))
+        Err(Status::unimplemented(
+            "do_get_prepared_statement not implemented",
+        ))
     }
+
     async fn do_get_catalogs(
         &self,
         _query: CommandGetCatalogs,
+        _request: Request<Ticket>,
     ) -> Result<Response<<Self as FlightService>::DoGetStream>, Status> {
-        Err(Status::unimplemented("Not yet implemented"))
+        Err(Status::unimplemented("do_get_catalogs not implemented"))
     }
+
     async fn do_get_schemas(
         &self,
         _query: CommandGetDbSchemas,
+        _request: Request<Ticket>,
     ) -> Result<Response<<Self as FlightService>::DoGetStream>, Status> {
-        Err(Status::unimplemented("Not yet implemented"))
+        Err(Status::unimplemented("do_get_schemas not implemented"))
     }
+
     async fn do_get_tables(
         &self,
         _query: CommandGetTables,
+        _request: Request<Ticket>,
     ) -> Result<Response<<Self as FlightService>::DoGetStream>, Status> {
-        Err(Status::unimplemented("Not yet implemented"))
+        Err(Status::unimplemented("do_get_tables not implemented"))
     }
+
     async fn do_get_table_types(
         &self,
         _query: CommandGetTableTypes,
+        _request: Request<Ticket>,
     ) -> Result<Response<<Self as FlightService>::DoGetStream>, Status> {
-        Err(Status::unimplemented("Not yet implemented"))
+        Err(Status::unimplemented("do_get_table_types not implemented"))
     }
+
     async fn do_get_sql_info(
         &self,
         _query: CommandGetSqlInfo,
+        _request: Request<Ticket>,
     ) -> Result<Response<<Self as FlightService>::DoGetStream>, Status> {
-        Err(Status::unimplemented("Not yet implemented"))
+        Err(Status::unimplemented("do_get_sql_info not implemented"))
     }
+
     async fn do_get_primary_keys(
         &self,
         _query: CommandGetPrimaryKeys,
+        _request: Request<Ticket>,
     ) -> Result<Response<<Self as FlightService>::DoGetStream>, Status> {
-        Err(Status::unimplemented("Not yet implemented"))
+        Err(Status::unimplemented("do_get_primary_keys not implemented"))
     }
+
     async fn do_get_exported_keys(
         &self,
         _query: CommandGetExportedKeys,
+        _request: Request<Ticket>,
     ) -> Result<Response<<Self as FlightService>::DoGetStream>, Status> {
-        Err(Status::unimplemented("Not yet implemented"))
+        Err(Status::unimplemented(
+            "do_get_exported_keys not implemented",
+        ))
     }
+
     async fn do_get_imported_keys(
         &self,
         _query: CommandGetImportedKeys,
+        _request: Request<Ticket>,
     ) -> Result<Response<<Self as FlightService>::DoGetStream>, Status> {
-        Err(Status::unimplemented("Not yet implemented"))
+        Err(Status::unimplemented(
+            "do_get_imported_keys not implemented",
+        ))
     }
+
     async fn do_get_cross_reference(
         &self,
         _query: CommandGetCrossReference,
+        _request: Request<Ticket>,
     ) -> Result<Response<<Self as FlightService>::DoGetStream>, Status> {
-        Err(Status::unimplemented("Not yet implemented"))
+        Err(Status::unimplemented(
+            "do_get_cross_reference not implemented",
+        ))
     }
+    // do_put
     async fn do_put_statement_update(
         &self,
         _ticket: CommandStatementUpdate,
+        _request: Request<Streaming<FlightData>>,
     ) -> Result<i64, Status> {
-        Err(Status::unimplemented("Not yet implemented"))
+        Err(Status::unimplemented(
+            "do_put_statement_update not implemented",
+        ))
     }
+
     async fn do_put_prepared_statement_query(
         &self,
         _query: CommandPreparedStatementQuery,
-        _request: Streaming<FlightData>,
+        _request: Request<Streaming<FlightData>>,
     ) -> Result<Response<<Self as FlightService>::DoPutStream>, Status> {
-        Err(Status::unimplemented("Not yet implemented"))
+        Err(Status::unimplemented(
+            "do_put_prepared_statement_query not implemented",
+        ))
     }
+
     async fn do_put_prepared_statement_update(
         &self,
         _query: CommandPreparedStatementUpdate,
-        _request: Streaming<FlightData>,
+        _request: Request<Streaming<FlightData>>,
     ) -> Result<i64, Status> {
-        Err(Status::unimplemented("Not yet implemented"))
+        Err(Status::unimplemented(
+            "do_put_prepared_statement_update not implemented",
+        ))
     }
+    // do_action
     async fn do_action_create_prepared_statement(
         &self,
         _query: ActionCreatePreparedStatementRequest,
+        _request: Request<Action>,
     ) -> Result<ActionCreatePreparedStatementResult, Status> {
         Err(Status::unimplemented("Not yet implemented"))
     }
     async fn do_action_close_prepared_statement(
         &self,
         _query: ActionClosePreparedStatementRequest,
+        _request: Request<Action>,
     ) {
         unimplemented!("Not yet implemented")
     }
diff --git a/arrow-flight/src/arrow.flight.protocol.rs b/arrow-flight/src/arrow.flight.protocol.rs
index c76469b39ce7..2b085d6d1f6b 100644
--- a/arrow-flight/src/arrow.flight.protocol.rs
+++ b/arrow-flight/src/arrow.flight.protocol.rs
@@ -1,31 +1,31 @@
 // This file was automatically generated through the build.rs script, and should not be edited.
 ///
-/// The request that a client provides to a server on handshake.
+/// The request that a client provides to a server on handshake.
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct HandshakeRequest {
     ///
-    /// A defined protocol version
+    /// A defined protocol version
     #[prost(uint64, tag="1")]
     pub protocol_version: u64,
     ///
-    /// Arbitrary auth/handshake info.
+    /// Arbitrary auth/handshake info.
     #[prost(bytes="vec", tag="2")]
     pub payload: ::prost::alloc::vec::Vec<u8>,
 }
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct HandshakeResponse {
     ///
-    /// A defined protocol version
+    /// A defined protocol version
     #[prost(uint64, tag="1")]
     pub protocol_version: u64,
     ///
-    /// Arbitrary auth/handshake info.
+    /// Arbitrary auth/handshake info.
     #[prost(bytes="vec", tag="2")]
     pub payload: ::prost::alloc::vec::Vec<u8>,
 }
 ///
-/// A message for doing simple auth.
+/// A message for doing simple auth.
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct BasicAuth {
     #[prost(string, tag="2")]
@@ -37,8 +37,8 @@ pub struct BasicAuth {
 pub struct Empty {
 }
 ///
-/// Describes an available action, including both the name used for execution
-/// along with a short description of the purpose of the action.
+/// Describes an available action, including both the name used for execution
+/// along with a short description of the purpose of the action.
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct ActionType {
     #[prost(string, tag="1")]
@@ -47,15 +47,15 @@ pub struct ActionType {
     pub description: ::prost::alloc::string::String,
 }
 ///
-/// A service specific expression that can be used to return a limited set
-/// of available Arrow Flight streams.
+/// A service specific expression that can be used to return a limited set
+/// of available Arrow Flight streams.
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct Criteria {
     #[prost(bytes="vec", tag="1")]
     pub expression: ::prost::alloc::vec::Vec<u8>,
 }
 ///
-/// An opaque action specific for the service.
+/// An opaque action specific for the service.
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct Action {
     #[prost(string, tag="1")]
@@ -64,138 +64,151 @@ pub struct Action {
     pub body: ::prost::alloc::vec::Vec<u8>,
 }
 ///
-/// An opaque result returned after executing an action.
+/// An opaque result returned after executing an action.
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct Result {
     #[prost(bytes="vec", tag="1")]
     pub body: ::prost::alloc::vec::Vec<u8>,
 }
 ///
-/// Wrap the result of a getSchema call
+/// Wrap the result of a getSchema call
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct SchemaResult {
-    /// schema of the dataset as described in Schema.fbs::Schema.
+    /// schema of the dataset as described in Schema.fbs::Schema.
#[prost(bytes="vec", tag="1")] pub schema: ::prost::alloc::vec::Vec, } /// -/// The name or tag for a Flight. May be used as a way to retrieve or generate -/// a flight or be used to expose a set of previously defined flights. +/// The name or tag for a Flight. May be used as a way to retrieve or generate +/// a flight or be used to expose a set of previously defined flights. #[derive(Clone, PartialEq, ::prost::Message)] pub struct FlightDescriptor { #[prost(enumeration="flight_descriptor::DescriptorType", tag="1")] pub r#type: i32, /// - /// Opaque value used to express a command. Should only be defined when - /// type = CMD. + /// Opaque value used to express a command. Should only be defined when + /// type = CMD. #[prost(bytes="vec", tag="2")] pub cmd: ::prost::alloc::vec::Vec, /// - /// List of strings identifying a particular dataset. Should only be defined - /// when type = PATH. + /// List of strings identifying a particular dataset. Should only be defined + /// when type = PATH. #[prost(string, repeated, tag="3")] pub path: ::prost::alloc::vec::Vec<::prost::alloc::string::String>, } /// Nested message and enum types in `FlightDescriptor`. pub mod flight_descriptor { /// - /// Describes what type of descriptor is defined. + /// Describes what type of descriptor is defined. #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] pub enum DescriptorType { - /// Protobuf pattern, not used. + /// Protobuf pattern, not used. Unknown = 0, /// - /// A named path that identifies a dataset. A path is composed of a string - /// or list of strings describing a particular dataset. This is conceptually - /// similar to a path inside a filesystem. + /// A named path that identifies a dataset. A path is composed of a string + /// or list of strings describing a particular dataset. This is conceptually + /// similar to a path inside a filesystem. Path = 1, /// - /// An opaque command to generate a dataset. + /// An opaque command to generate a dataset. Cmd = 2, } + impl DescriptorType { + /// String value of the enum field names used in the ProtoBuf definition. + /// + /// The values are not transformed in any way and thus are considered stable + /// (if the ProtoBuf definition does not change) and safe for programmatic use. + pub fn as_str_name(&self) -> &'static str { + match self { + DescriptorType::Unknown => "UNKNOWN", + DescriptorType::Path => "PATH", + DescriptorType::Cmd => "CMD", + } + } + } } /// -/// The access coordinates for retrieval of a dataset. With a FlightInfo, a -/// consumer is able to determine how to retrieve a dataset. +/// The access coordinates for retrieval of a dataset. With a FlightInfo, a +/// consumer is able to determine how to retrieve a dataset. #[derive(Clone, PartialEq, ::prost::Message)] pub struct FlightInfo { - /// schema of the dataset as described in Schema.fbs::Schema. + /// schema of the dataset as described in Schema.fbs::Schema. #[prost(bytes="vec", tag="1")] pub schema: ::prost::alloc::vec::Vec, /// - /// The descriptor associated with this info. + /// The descriptor associated with this info. #[prost(message, optional, tag="2")] pub flight_descriptor: ::core::option::Option, /// - /// A list of endpoints associated with the flight. To consume the whole - /// flight, all endpoints must be consumed. + /// A list of endpoints associated with the flight. To consume the whole + /// flight, all endpoints must be consumed. 
#[prost(message, repeated, tag="3")] pub endpoint: ::prost::alloc::vec::Vec, - /// Set these to -1 if unknown. + /// Set these to -1 if unknown. #[prost(int64, tag="4")] pub total_records: i64, #[prost(int64, tag="5")] pub total_bytes: i64, } /// -/// A particular stream or split associated with a flight. +/// A particular stream or split associated with a flight. #[derive(Clone, PartialEq, ::prost::Message)] pub struct FlightEndpoint { /// - /// Token used to retrieve this stream. + /// Token used to retrieve this stream. #[prost(message, optional, tag="1")] pub ticket: ::core::option::Option, /// - /// A list of URIs where this ticket can be redeemed. If the list is - /// empty, the expectation is that the ticket can only be redeemed on the - /// current service where the ticket was generated. + /// A list of URIs where this ticket can be redeemed. If the list is + /// empty, the expectation is that the ticket can only be redeemed on the + /// current service where the ticket was generated. #[prost(message, repeated, tag="2")] pub location: ::prost::alloc::vec::Vec, } /// -/// A location where a Flight service will accept retrieval of a particular -/// stream given a ticket. +/// A location where a Flight service will accept retrieval of a particular +/// stream given a ticket. #[derive(Clone, PartialEq, ::prost::Message)] pub struct Location { #[prost(string, tag="1")] pub uri: ::prost::alloc::string::String, } /// -/// An opaque identifier that the service can use to retrieve a particular -/// portion of a stream. +/// An opaque identifier that the service can use to retrieve a particular +/// portion of a stream. #[derive(Clone, PartialEq, ::prost::Message)] pub struct Ticket { #[prost(bytes="vec", tag="1")] pub ticket: ::prost::alloc::vec::Vec, } /// -/// A batch of Arrow data as part of a stream of batches. +/// A batch of Arrow data as part of a stream of batches. #[derive(Clone, PartialEq, ::prost::Message)] pub struct FlightData { /// - /// The descriptor of the data. This is only relevant when a client is - /// starting a new DoPut stream. + /// The descriptor of the data. This is only relevant when a client is + /// starting a new DoPut stream. #[prost(message, optional, tag="1")] pub flight_descriptor: ::core::option::Option, /// - /// Header for message data as described in Message.fbs::Message. + /// Header for message data as described in Message.fbs::Message. #[prost(bytes="vec", tag="2")] pub data_header: ::prost::alloc::vec::Vec, /// - /// Application-defined metadata. + /// Application-defined metadata. #[prost(bytes="vec", tag="3")] pub app_metadata: ::prost::alloc::vec::Vec, /// - /// The actual batch of Arrow data. Preferably handled with minimal-copies - /// coming last in the definition to help with sidecar patterns (it is - /// expected that some implementations will fetch this field off the wire - /// with specialized code to avoid extra memory copies). + /// The actual batch of Arrow data. Preferably handled with minimal-copies + /// coming last in the definition to help with sidecar patterns (it is + /// expected that some implementations will fetch this field off the wire + /// with specialized code to avoid extra memory copies). #[prost(bytes="vec", tag="1000")] pub data_body: ::prost::alloc::vec::Vec, } -///* -/// The response message associated with the submission of a DoPut. +/// * +/// The response message associated with the submission of a DoPut. 
#[derive(Clone, PartialEq, ::prost::Message)] pub struct PutResult { #[prost(bytes="vec", tag="1")] @@ -205,6 +218,7 @@ pub struct PutResult { pub mod flight_service_client { #![allow(unused_variables, dead_code, missing_docs, clippy::let_unit_value)] use tonic::codegen::*; + use tonic::codegen::http::Uri; /// /// A flight service is an endpoint for retrieving or storing Arrow data. A /// flight service can expose one or more predefined endpoints that can be @@ -236,6 +250,10 @@ pub mod flight_service_client { let inner = tonic::client::Grpc::new(inner); Self { inner } } + pub fn with_origin(inner: T, origin: Uri) -> Self { + let inner = tonic::client::Grpc::with_origin(inner, origin); + Self { inner } + } pub fn with_interceptor( inner: T, interceptor: F, @@ -255,19 +273,19 @@ pub mod flight_service_client { { FlightServiceClient::new(InterceptedService::new(inner, interceptor)) } - /// Compress requests with `gzip`. + /// Compress requests with the given encoding. /// /// This requires the server to support it otherwise it might respond with an /// error. #[must_use] - pub fn send_gzip(mut self) -> Self { - self.inner = self.inner.send_gzip(); + pub fn send_compressed(mut self, encoding: CompressionEncoding) -> Self { + self.inner = self.inner.send_compressed(encoding); self } - /// Enable decompressing responses with `gzip`. + /// Enable decompressing responses. #[must_use] - pub fn accept_gzip(mut self) -> Self { - self.inner = self.inner.accept_gzip(); + pub fn accept_compressed(mut self, encoding: CompressionEncoding) -> Self { + self.inner = self.inner.accept_compressed(encoding); self } /// @@ -672,8 +690,8 @@ pub mod flight_service_server { #[derive(Debug)] pub struct FlightServiceServer { inner: _Inner, - accept_compression_encodings: (), - send_compression_encodings: (), + accept_compression_encodings: EnabledCompressionEncodings, + send_compression_encodings: EnabledCompressionEncodings, } struct _Inner(Arc); impl FlightServiceServer { @@ -697,6 +715,18 @@ pub mod flight_service_server { { InterceptedService::new(Self::new(inner), interceptor) } + /// Enable decompressing requests with the given encoding. + #[must_use] + pub fn accept_compressed(mut self, encoding: CompressionEncoding) -> Self { + self.accept_compression_encodings.enable(encoding); + self + } + /// Compress responses with the given encoding, if the client supports it. 
+ #[must_use] + pub fn send_compressed(mut self, encoding: CompressionEncoding) -> Self { + self.send_compression_encodings.enable(encoding); + self + } } impl tonic::codegen::Service> for FlightServiceServer where @@ -1108,7 +1138,7 @@ pub mod flight_service_server { write!(f, "{:?}", self.0) } } - impl tonic::transport::NamedService for FlightServiceServer { + impl tonic::server::NamedService for FlightServiceServer { const NAME: &'static str = "arrow.flight.protocol.FlightService"; } } diff --git a/arrow-flight/src/lib.rs b/arrow-flight/src/lib.rs index 5cfbd3f60657..3f4f09855353 100644 --- a/arrow-flight/src/lib.rs +++ b/arrow-flight/src/lib.rs @@ -28,6 +28,7 @@ use std::{ ops::Deref, }; +#[allow(clippy::derive_partial_eq_without_eq)] mod gen { include!("arrow.flight.protocol.rs"); } diff --git a/arrow-flight/src/sql/arrow.flight.protocol.sql.rs b/arrow-flight/src/sql/arrow.flight.protocol.sql.rs index ea378a0a2577..77221dd1a489 100644 --- a/arrow-flight/src/sql/arrow.flight.protocol.sql.rs +++ b/arrow-flight/src/sql/arrow.flight.protocol.sql.rs @@ -1,1008 +1,1099 @@ // This file was automatically generated through the build.rs script, and should not be edited. /// -/// Represents a metadata request. Used in the command member of FlightDescriptor -/// for the following RPC calls: -/// - GetSchema: return the Arrow schema of the query. -/// - GetFlightInfo: execute the metadata request. +/// Represents a metadata request. Used in the command member of FlightDescriptor +/// for the following RPC calls: +/// - GetSchema: return the Arrow schema of the query. +/// - GetFlightInfo: execute the metadata request. /// -/// The returned Arrow schema will be: -/// < -/// info_name: uint32 not null, -/// value: dense_union< -/// string_value: utf8, -/// bool_value: bool, -/// bigint_value: int64, -/// int32_bitmask: int32, -/// string_list: list -/// int32_to_int32_list_map: map> -/// > -/// where there is one row per requested piece of metadata information. +/// The returned Arrow schema will be: +/// < +/// info_name: uint32 not null, +/// value: dense_union< +/// string_value: utf8, +/// bool_value: bool, +/// bigint_value: int64, +/// int32_bitmask: int32, +/// string_list: list +/// int32_to_int32_list_map: map> +/// > +/// where there is one row per requested piece of metadata information. #[derive(Clone, PartialEq, ::prost::Message)] pub struct CommandGetSqlInfo { /// - /// Values are modelled after ODBC's SQLGetInfo() function. This information is intended to provide - /// Flight SQL clients with basic, SQL syntax and SQL functions related information. - /// More information types can be added in future releases. - /// E.g. more SQL syntax support types, scalar functions support, type conversion support etc. + /// Values are modelled after ODBC's SQLGetInfo() function. This information is intended to provide + /// Flight SQL clients with basic, SQL syntax and SQL functions related information. + /// More information types can be added in future releases. + /// E.g. more SQL syntax support types, scalar functions support, type conversion support etc. /// - /// Note that the set of metadata may expand. + /// Note that the set of metadata may expand. /// - /// Initially, Flight SQL will support the following information types: - /// - Server Information - Range [0-500) - /// - Syntax Information - Range [500-1000) - /// Range [0-10,000) is reserved for defaults (see SqlInfo enum for default options). - /// Custom options should start at 10,000. 
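// Note on the client/server changes above: tonic 0.8 replaces the
// gzip-specific send_gzip/accept_gzip toggles with methods that take a
// CompressionEncoding. A sketch of enabling compression on both sides
// (assumes tonic's gzip feature is enabled; `channel` and `service` are
// placeholders):
//
//     use tonic::codec::CompressionEncoding;
//
//     let client = FlightServiceClient::new(channel)
//         .send_compressed(CompressionEncoding::Gzip)
//         .accept_compressed(CompressionEncoding::Gzip);
//
//     let server = FlightServiceServer::new(service)
//         .accept_compressed(CompressionEncoding::Gzip)
//         .send_compressed(CompressionEncoding::Gzip);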
+ /// Initially, Flight SQL will support the following information types: + /// - Server Information - Range [0-500) + /// - Syntax Information - Range [500-1000) + /// Range [0-10,000) is reserved for defaults (see SqlInfo enum for default options). + /// Custom options should start at 10,000. /// - /// If omitted, then all metadata will be retrieved. - /// Flight SQL Servers may choose to include additional metadata above and beyond the specified set, however they must - /// at least return the specified set. IDs ranging from 0 to 10,000 (exclusive) are reserved for future use. - /// If additional metadata is included, the metadata IDs should start from 10,000. + /// If omitted, then all metadata will be retrieved. + /// Flight SQL Servers may choose to include additional metadata above and beyond the specified set, however they must + /// at least return the specified set. IDs ranging from 0 to 10,000 (exclusive) are reserved for future use. + /// If additional metadata is included, the metadata IDs should start from 10,000. #[prost(uint32, repeated, tag="1")] pub info: ::prost::alloc::vec::Vec, } /// -/// Represents a request to retrieve the list of catalogs on a Flight SQL enabled backend. -/// The definition of a catalog depends on vendor/implementation. It is usually the database itself -/// Used in the command member of FlightDescriptor for the following RPC calls: -/// - GetSchema: return the Arrow schema of the query. -/// - GetFlightInfo: execute the catalog metadata request. +/// Represents a request to retrieve the list of catalogs on a Flight SQL enabled backend. +/// The definition of a catalog depends on vendor/implementation. It is usually the database itself +/// Used in the command member of FlightDescriptor for the following RPC calls: +/// - GetSchema: return the Arrow schema of the query. +/// - GetFlightInfo: execute the catalog metadata request. /// -/// The returned Arrow schema will be: -/// < -/// catalog_name: utf8 not null -/// > -/// The returned data should be ordered by catalog_name. +/// The returned Arrow schema will be: +/// < +/// catalog_name: utf8 not null +/// > +/// The returned data should be ordered by catalog_name. #[derive(Clone, PartialEq, ::prost::Message)] pub struct CommandGetCatalogs { } /// -/// Represents a request to retrieve the list of database schemas on a Flight SQL enabled backend. -/// The definition of a database schema depends on vendor/implementation. It is usually a collection of tables. -/// Used in the command member of FlightDescriptor for the following RPC calls: -/// - GetSchema: return the Arrow schema of the query. -/// - GetFlightInfo: execute the catalog metadata request. +/// Represents a request to retrieve the list of database schemas on a Flight SQL enabled backend. +/// The definition of a database schema depends on vendor/implementation. It is usually a collection of tables. +/// Used in the command member of FlightDescriptor for the following RPC calls: +/// - GetSchema: return the Arrow schema of the query. +/// - GetFlightInfo: execute the catalog metadata request. /// -/// The returned Arrow schema will be: -/// < -/// catalog_name: utf8, -/// db_schema_name: utf8 not null -/// > -/// The returned data should be ordered by catalog_name, then db_schema_name. +/// The returned Arrow schema will be: +/// < +/// catalog_name: utf8, +/// db_schema_name: utf8 not null +/// > +/// The returned data should be ordered by catalog_name, then db_schema_name. 
#[derive(Clone, PartialEq, ::prost::Message)] pub struct CommandGetDbSchemas { /// - /// Specifies the Catalog to search for the tables. - /// An empty string retrieves those without a catalog. - /// If omitted the catalog name should not be used to narrow the search. + /// Specifies the Catalog to search for the tables. + /// An empty string retrieves those without a catalog. + /// If omitted the catalog name should not be used to narrow the search. #[prost(string, optional, tag="1")] pub catalog: ::core::option::Option<::prost::alloc::string::String>, /// - /// Specifies a filter pattern for schemas to search for. - /// When no db_schema_filter_pattern is provided, the pattern will not be used to narrow the search. - /// In the pattern string, two special characters can be used to denote matching rules: - /// - "%" means to match any substring with 0 or more characters. - /// - "_" means to match any one character. + /// Specifies a filter pattern for schemas to search for. + /// When no db_schema_filter_pattern is provided, the pattern will not be used to narrow the search. + /// In the pattern string, two special characters can be used to denote matching rules: + /// - "%" means to match any substring with 0 or more characters. + /// - "_" means to match any one character. #[prost(string, optional, tag="2")] pub db_schema_filter_pattern: ::core::option::Option<::prost::alloc::string::String>, } /// -/// Represents a request to retrieve the list of tables, and optionally their schemas, on a Flight SQL enabled backend. -/// Used in the command member of FlightDescriptor for the following RPC calls: -/// - GetSchema: return the Arrow schema of the query. -/// - GetFlightInfo: execute the catalog metadata request. +/// Represents a request to retrieve the list of tables, and optionally their schemas, on a Flight SQL enabled backend. +/// Used in the command member of FlightDescriptor for the following RPC calls: +/// - GetSchema: return the Arrow schema of the query. +/// - GetFlightInfo: execute the catalog metadata request. /// -/// The returned Arrow schema will be: -/// < -/// catalog_name: utf8, -/// db_schema_name: utf8, -/// table_name: utf8 not null, -/// table_type: utf8 not null, -/// \[optional\] table_schema: bytes not null (schema of the table as described in Schema.fbs::Schema, -/// it is serialized as an IPC message.) -/// > -/// The returned data should be ordered by catalog_name, db_schema_name, table_name, then table_type, followed by table_schema if requested. +/// The returned Arrow schema will be: +/// < +/// catalog_name: utf8, +/// db_schema_name: utf8, +/// table_name: utf8 not null, +/// table_type: utf8 not null, +/// \[optional\] table_schema: bytes not null (schema of the table as described in Schema.fbs::Schema, +/// it is serialized as an IPC message.) +/// > +/// The returned data should be ordered by catalog_name, db_schema_name, table_name, then table_type, followed by table_schema if requested. #[derive(Clone, PartialEq, ::prost::Message)] pub struct CommandGetTables { /// - /// Specifies the Catalog to search for the tables. - /// An empty string retrieves those without a catalog. - /// If omitted the catalog name should not be used to narrow the search. + /// Specifies the Catalog to search for the tables. + /// An empty string retrieves those without a catalog. + /// If omitted the catalog name should not be used to narrow the search. 
#[prost(string, optional, tag="1")] pub catalog: ::core::option::Option<::prost::alloc::string::String>, /// - /// Specifies a filter pattern for schemas to search for. - /// When no db_schema_filter_pattern is provided, all schemas matching other filters are searched. - /// In the pattern string, two special characters can be used to denote matching rules: - /// - "%" means to match any substring with 0 or more characters. - /// - "_" means to match any one character. + /// Specifies a filter pattern for schemas to search for. + /// When no db_schema_filter_pattern is provided, all schemas matching other filters are searched. + /// In the pattern string, two special characters can be used to denote matching rules: + /// - "%" means to match any substring with 0 or more characters. + /// - "_" means to match any one character. #[prost(string, optional, tag="2")] pub db_schema_filter_pattern: ::core::option::Option<::prost::alloc::string::String>, /// - /// Specifies a filter pattern for tables to search for. - /// When no table_name_filter_pattern is provided, all tables matching other filters are searched. - /// In the pattern string, two special characters can be used to denote matching rules: - /// - "%" means to match any substring with 0 or more characters. - /// - "_" means to match any one character. + /// Specifies a filter pattern for tables to search for. + /// When no table_name_filter_pattern is provided, all tables matching other filters are searched. + /// In the pattern string, two special characters can be used to denote matching rules: + /// - "%" means to match any substring with 0 or more characters. + /// - "_" means to match any one character. #[prost(string, optional, tag="3")] pub table_name_filter_pattern: ::core::option::Option<::prost::alloc::string::String>, /// - /// Specifies a filter of table types which must match. - /// The table types depend on vendor/implementation. It is usually used to separate tables from views or system tables. - /// TABLE, VIEW, and SYSTEM TABLE are commonly supported. + /// Specifies a filter of table types which must match. + /// The table types depend on vendor/implementation. It is usually used to separate tables from views or system tables. + /// TABLE, VIEW, and SYSTEM TABLE are commonly supported. #[prost(string, repeated, tag="4")] pub table_types: ::prost::alloc::vec::Vec<::prost::alloc::string::String>, - /// Specifies if the Arrow schema should be returned for found tables. + /// Specifies if the Arrow schema should be returned for found tables. #[prost(bool, tag="5")] pub include_schema: bool, } /// -/// Represents a request to retrieve the list of table types on a Flight SQL enabled backend. -/// The table types depend on vendor/implementation. It is usually used to separate tables from views or system tables. -/// TABLE, VIEW, and SYSTEM TABLE are commonly supported. -/// Used in the command member of FlightDescriptor for the following RPC calls: -/// - GetSchema: return the Arrow schema of the query. -/// - GetFlightInfo: execute the catalog metadata request. +/// Represents a request to retrieve the list of table types on a Flight SQL enabled backend. +/// The table types depend on vendor/implementation. It is usually used to separate tables from views or system tables. +/// TABLE, VIEW, and SYSTEM TABLE are commonly supported. +/// Used in the command member of FlightDescriptor for the following RPC calls: +/// - GetSchema: return the Arrow schema of the query. 
+/// - GetFlightInfo: execute the catalog metadata request. /// -/// The returned Arrow schema will be: -/// < -/// table_type: utf8 not null -/// > -/// The returned data should be ordered by table_type. +/// The returned Arrow schema will be: +/// < +/// table_type: utf8 not null +/// > +/// The returned data should be ordered by table_type. #[derive(Clone, PartialEq, ::prost::Message)] pub struct CommandGetTableTypes { } /// -/// Represents a request to retrieve the primary keys of a table on a Flight SQL enabled backend. -/// Used in the command member of FlightDescriptor for the following RPC calls: -/// - GetSchema: return the Arrow schema of the query. -/// - GetFlightInfo: execute the catalog metadata request. +/// Represents a request to retrieve the primary keys of a table on a Flight SQL enabled backend. +/// Used in the command member of FlightDescriptor for the following RPC calls: +/// - GetSchema: return the Arrow schema of the query. +/// - GetFlightInfo: execute the catalog metadata request. /// -/// The returned Arrow schema will be: -/// < -/// catalog_name: utf8, -/// db_schema_name: utf8, -/// table_name: utf8 not null, -/// column_name: utf8 not null, -/// key_name: utf8, -/// key_sequence: int not null -/// > -/// The returned data should be ordered by catalog_name, db_schema_name, table_name, key_name, then key_sequence. +/// The returned Arrow schema will be: +/// < +/// catalog_name: utf8, +/// db_schema_name: utf8, +/// table_name: utf8 not null, +/// column_name: utf8 not null, +/// key_name: utf8, +/// key_sequence: int not null +/// > +/// The returned data should be ordered by catalog_name, db_schema_name, table_name, key_name, then key_sequence. #[derive(Clone, PartialEq, ::prost::Message)] pub struct CommandGetPrimaryKeys { /// - /// Specifies the catalog to search for the table. - /// An empty string retrieves those without a catalog. - /// If omitted the catalog name should not be used to narrow the search. + /// Specifies the catalog to search for the table. + /// An empty string retrieves those without a catalog. + /// If omitted the catalog name should not be used to narrow the search. #[prost(string, optional, tag="1")] pub catalog: ::core::option::Option<::prost::alloc::string::String>, /// - /// Specifies the schema to search for the table. - /// An empty string retrieves those without a schema. - /// If omitted the schema name should not be used to narrow the search. + /// Specifies the schema to search for the table. + /// An empty string retrieves those without a schema. + /// If omitted the schema name should not be used to narrow the search. #[prost(string, optional, tag="2")] pub db_schema: ::core::option::Option<::prost::alloc::string::String>, - /// Specifies the table to get the primary keys for. + /// Specifies the table to get the primary keys for. #[prost(string, tag="3")] pub table: ::prost::alloc::string::String, } /// -/// Represents a request to retrieve a description of the foreign key columns that reference the given table's -/// primary key columns (the foreign keys exported by a table) of a table on a Flight SQL enabled backend. -/// Used in the command member of FlightDescriptor for the following RPC calls: -/// - GetSchema: return the Arrow schema of the query. -/// - GetFlightInfo: execute the catalog metadata request. 
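These command messages travel as opaque bytes inside a FlightDescriptor. A sketch of the prost side of that, assuming prost::Message is in scope (real Flight SQL clients additionally wrap the command in a google.protobuf.Any envelope, which is omitted here):

    use prost::Message;

    fn primary_keys_cmd_bytes() -> Vec<u8> {
        let cmd = CommandGetPrimaryKeys {
            catalog: None,
            db_schema: Some("public".to_string()), // hypothetical schema
            table: "orders".to_string(),           // hypothetical table
        };
        // encode_to_vec() is provided for prost-generated messages.
        cmd.encode_to_vec()
    }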
+/// Represents a request to retrieve a description of the foreign key columns that reference the given table's +/// primary key columns (the foreign keys exported by a table) of a table on a Flight SQL enabled backend. +/// Used in the command member of FlightDescriptor for the following RPC calls: +/// - GetSchema: return the Arrow schema of the query. +/// - GetFlightInfo: execute the catalog metadata request. /// -/// The returned Arrow schema will be: -/// < -/// pk_catalog_name: utf8, -/// pk_db_schema_name: utf8, -/// pk_table_name: utf8 not null, -/// pk_column_name: utf8 not null, -/// fk_catalog_name: utf8, -/// fk_db_schema_name: utf8, -/// fk_table_name: utf8 not null, -/// fk_column_name: utf8 not null, -/// key_sequence: int not null, -/// fk_key_name: utf8, -/// pk_key_name: utf8, -/// update_rule: uint1 not null, -/// delete_rule: uint1 not null -/// > -/// The returned data should be ordered by fk_catalog_name, fk_db_schema_name, fk_table_name, fk_key_name, then key_sequence. -/// update_rule and delete_rule returns a byte that is equivalent to actions declared on UpdateDeleteRules enum. +/// The returned Arrow schema will be: +/// < +/// pk_catalog_name: utf8, +/// pk_db_schema_name: utf8, +/// pk_table_name: utf8 not null, +/// pk_column_name: utf8 not null, +/// fk_catalog_name: utf8, +/// fk_db_schema_name: utf8, +/// fk_table_name: utf8 not null, +/// fk_column_name: utf8 not null, +/// key_sequence: int not null, +/// fk_key_name: utf8, +/// pk_key_name: utf8, +/// update_rule: uint1 not null, +/// delete_rule: uint1 not null +/// > +/// The returned data should be ordered by fk_catalog_name, fk_db_schema_name, fk_table_name, fk_key_name, then key_sequence. +/// update_rule and delete_rule return a byte that is equivalent to actions declared on UpdateDeleteRules enum. #[derive(Clone, PartialEq, ::prost::Message)] pub struct CommandGetExportedKeys { /// - /// Specifies the catalog to search for the foreign key table. - /// An empty string retrieves those without a catalog. - /// If omitted the catalog name should not be used to narrow the search. + /// Specifies the catalog to search for the foreign key table. + /// An empty string retrieves those without a catalog. + /// If omitted the catalog name should not be used to narrow the search. #[prost(string, optional, tag="1")] pub catalog: ::core::option::Option<::prost::alloc::string::String>, /// - /// Specifies the schema to search for the foreign key table. - /// An empty string retrieves those without a schema. - /// If omitted the schema name should not be used to narrow the search. + /// Specifies the schema to search for the foreign key table. + /// An empty string retrieves those without a schema. + /// If omitted the schema name should not be used to narrow the search. #[prost(string, optional, tag="2")] pub db_schema: ::core::option::Option<::prost::alloc::string::String>, - /// Specifies the foreign key table to get the foreign keys for. + /// Specifies the foreign key table to get the foreign keys for. #[prost(string, tag="3")] pub table: ::prost::alloc::string::String, } /// -/// Represents a request to retrieve the foreign keys of a table on a Flight SQL enabled backend. -/// Used in the command member of FlightDescriptor for the following RPC calls: -/// - GetSchema: return the Arrow schema of the query. -/// - GetFlightInfo: execute the catalog metadata request. +/// Represents a request to retrieve the foreign keys of a table on a Flight SQL enabled backend.
+/// Used in the command member of FlightDescriptor for the following RPC calls: +/// - GetSchema: return the Arrow schema of the query. +/// - GetFlightInfo: execute the catalog metadata request. /// -/// The returned Arrow schema will be: -/// < -/// pk_catalog_name: utf8, -/// pk_db_schema_name: utf8, -/// pk_table_name: utf8 not null, -/// pk_column_name: utf8 not null, -/// fk_catalog_name: utf8, -/// fk_db_schema_name: utf8, -/// fk_table_name: utf8 not null, -/// fk_column_name: utf8 not null, -/// key_sequence: int not null, -/// fk_key_name: utf8, -/// pk_key_name: utf8, -/// update_rule: uint1 not null, -/// delete_rule: uint1 not null -/// > -/// The returned data should be ordered by pk_catalog_name, pk_db_schema_name, pk_table_name, pk_key_name, then key_sequence. -/// update_rule and delete_rule returns a byte that is equivalent to actions: -/// - 0 = CASCADE -/// - 1 = RESTRICT -/// - 2 = SET NULL -/// - 3 = NO ACTION -/// - 4 = SET DEFAULT +/// The returned Arrow schema will be: +/// < +/// pk_catalog_name: utf8, +/// pk_db_schema_name: utf8, +/// pk_table_name: utf8 not null, +/// pk_column_name: utf8 not null, +/// fk_catalog_name: utf8, +/// fk_db_schema_name: utf8, +/// fk_table_name: utf8 not null, +/// fk_column_name: utf8 not null, +/// key_sequence: int not null, +/// fk_key_name: utf8, +/// pk_key_name: utf8, +/// update_rule: uint1 not null, +/// delete_rule: uint1 not null +/// > +/// The returned data should be ordered by pk_catalog_name, pk_db_schema_name, pk_table_name, pk_key_name, then key_sequence. +/// update_rule and delete_rule return a byte that is equivalent to actions: +/// - 0 = CASCADE +/// - 1 = RESTRICT +/// - 2 = SET NULL +/// - 3 = NO ACTION +/// - 4 = SET DEFAULT #[derive(Clone, PartialEq, ::prost::Message)] pub struct CommandGetImportedKeys { /// - /// Specifies the catalog to search for the primary key table. - /// An empty string retrieves those without a catalog. - /// If omitted the catalog name should not be used to narrow the search. + /// Specifies the catalog to search for the primary key table. + /// An empty string retrieves those without a catalog. + /// If omitted the catalog name should not be used to narrow the search. #[prost(string, optional, tag="1")] pub catalog: ::core::option::Option<::prost::alloc::string::String>, /// - /// Specifies the schema to search for the primary key table. - /// An empty string retrieves those without a schema. - /// If omitted the schema name should not be used to narrow the search. + /// Specifies the schema to search for the primary key table. + /// An empty string retrieves those without a schema. + /// If omitted the schema name should not be used to narrow the search. #[prost(string, optional, tag="2")] pub db_schema: ::core::option::Option<::prost::alloc::string::String>, - /// Specifies the primary key table to get the foreign keys for. + /// Specifies the primary key table to get the foreign keys for. #[prost(string, tag="3")] pub table: ::prost::alloc::string::String, } /// -/// Represents a request to retrieve a description of the foreign key columns in the given foreign key table that -/// reference the primary key or the columns representing a unique constraint of the parent table (could be the same -/// or a different table) on a Flight SQL enabled backend. -/// Used in the command member of FlightDescriptor for the following RPC calls: -/// - GetSchema: return the Arrow schema of the query. -/// - GetFlightInfo: execute the catalog metadata request.
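A sketch of mapping the update_rule/delete_rule byte back to the actions enumerated above (the helper name is illustrative, not generated code):

    fn rule_name(rule: u8) -> Option<&'static str> {
        match rule {
            0 => Some("CASCADE"),
            1 => Some("RESTRICT"),
            2 => Some("SET NULL"),
            3 => Some("NO ACTION"),
            4 => Some("SET DEFAULT"),
            _ => None, // outside the documented range
        }
    }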
+/// Represents a request to retrieve a description of the foreign key columns in the given foreign key table that +/// reference the primary key or the columns representing a unique constraint of the parent table (could be the same +/// or a different table) on a Flight SQL enabled backend. +/// Used in the command member of FlightDescriptor for the following RPC calls: +/// - GetSchema: return the Arrow schema of the query. +/// - GetFlightInfo: execute the catalog metadata request. /// -/// The returned Arrow schema will be: -/// < -/// pk_catalog_name: utf8, -/// pk_db_schema_name: utf8, -/// pk_table_name: utf8 not null, -/// pk_column_name: utf8 not null, -/// fk_catalog_name: utf8, -/// fk_db_schema_name: utf8, -/// fk_table_name: utf8 not null, -/// fk_column_name: utf8 not null, -/// key_sequence: int not null, -/// fk_key_name: utf8, -/// pk_key_name: utf8, -/// update_rule: uint1 not null, -/// delete_rule: uint1 not null -/// > -/// The returned data should be ordered by pk_catalog_name, pk_db_schema_name, pk_table_name, pk_key_name, then key_sequence. -/// update_rule and delete_rule returns a byte that is equivalent to actions: -/// - 0 = CASCADE -/// - 1 = RESTRICT -/// - 2 = SET NULL -/// - 3 = NO ACTION -/// - 4 = SET DEFAULT +/// The returned Arrow schema will be: +/// < +/// pk_catalog_name: utf8, +/// pk_db_schema_name: utf8, +/// pk_table_name: utf8 not null, +/// pk_column_name: utf8 not null, +/// fk_catalog_name: utf8, +/// fk_db_schema_name: utf8, +/// fk_table_name: utf8 not null, +/// fk_column_name: utf8 not null, +/// key_sequence: int not null, +/// fk_key_name: utf8, +/// pk_key_name: utf8, +/// update_rule: uint1 not null, +/// delete_rule: uint1 not null +/// > +/// The returned data should be ordered by pk_catalog_name, pk_db_schema_name, pk_table_name, pk_key_name, then key_sequence. +/// update_rule and delete_rule return a byte that is equivalent to actions: +/// - 0 = CASCADE +/// - 1 = RESTRICT +/// - 2 = SET NULL +/// - 3 = NO ACTION +/// - 4 = SET DEFAULT #[derive(Clone, PartialEq, ::prost::Message)] pub struct CommandGetCrossReference { - ///* - /// The catalog name where the parent table is. - /// An empty string retrieves those without a catalog. - /// If omitted the catalog name should not be used to narrow the search. + /// * + /// The catalog name where the parent table is. + /// An empty string retrieves those without a catalog. + /// If omitted the catalog name should not be used to narrow the search. #[prost(string, optional, tag="1")] pub pk_catalog: ::core::option::Option<::prost::alloc::string::String>, - ///* - /// The Schema name where the parent table is. - /// An empty string retrieves those without a schema. - /// If omitted the schema name should not be used to narrow the search. + /// * + /// The Schema name where the parent table is. + /// An empty string retrieves those without a schema. + /// If omitted the schema name should not be used to narrow the search. #[prost(string, optional, tag="2")] pub pk_db_schema: ::core::option::Option<::prost::alloc::string::String>, - ///* - /// The parent table name. It cannot be null. + /// * + /// The parent table name. It cannot be null. #[prost(string, tag="3")] pub pk_table: ::prost::alloc::string::String, - ///* - /// The catalog name where the foreign table is. - /// An empty string retrieves those without a catalog. - /// If omitted the catalog name should not be used to narrow the search. + /// * + /// The catalog name where the foreign table is.
+ /// An empty string retrieves those without a catalog. + /// If omitted the catalog name should not be used to narrow the search. #[prost(string, optional, tag="4")] pub fk_catalog: ::core::option::Option<::prost::alloc::string::String>, - ///* - /// The schema name where the foreign table is. - /// An empty string retrieves those without a schema. - /// If omitted the schema name should not be used to narrow the search. + /// * + /// The schema name where the foreign table is. + /// An empty string retrieves those without a schema. + /// If omitted the schema name should not be used to narrow the search. #[prost(string, optional, tag="5")] pub fk_db_schema: ::core::option::Option<::prost::alloc::string::String>, - ///* - /// The foreign table name. It cannot be null. + /// * + /// The foreign table name. It cannot be null. #[prost(string, tag="6")] pub fk_table: ::prost::alloc::string::String, } -// SQL Execution Action Messages +// SQL Execution Action Messages /// -/// Request message for the "CreatePreparedStatement" action on a Flight SQL enabled backend. +/// Request message for the "CreatePreparedStatement" action on a Flight SQL enabled backend. #[derive(Clone, PartialEq, ::prost::Message)] pub struct ActionCreatePreparedStatementRequest { - /// The valid SQL string to create a prepared statement for. + /// The valid SQL string to create a prepared statement for. #[prost(string, tag="1")] pub query: ::prost::alloc::string::String, } /// -/// Wrap the result of a "GetPreparedStatement" action. +/// Wrap the result of a "GetPreparedStatement" action. /// -/// The resultant PreparedStatement can be closed either: -/// - Manually, through the "ClosePreparedStatement" action; -/// - Automatically, by a server timeout. +/// The resultant PreparedStatement can be closed either: +/// - Manually, through the "ClosePreparedStatement" action; +/// - Automatically, by a server timeout. #[derive(Clone, PartialEq, ::prost::Message)] pub struct ActionCreatePreparedStatementResult { - /// Opaque handle for the prepared statement on the server. + /// Opaque handle for the prepared statement on the server. #[prost(bytes="vec", tag="1")] pub prepared_statement_handle: ::prost::alloc::vec::Vec<u8>, - /// If a result set generating query was provided, dataset_schema contains the - /// schema of the dataset as described in Schema.fbs::Schema, it is serialized as an IPC message. + /// If a result set generating query was provided, dataset_schema contains the + /// schema of the dataset as described in Schema.fbs::Schema, it is serialized as an IPC message. #[prost(bytes="vec", tag="2")] pub dataset_schema: ::prost::alloc::vec::Vec<u8>, - /// If the query provided contained parameters, parameter_schema contains the - /// schema of the expected parameters as described in Schema.fbs::Schema, it is serialized as an IPC message. + /// If the query provided contained parameters, parameter_schema contains the + /// schema of the expected parameters as described in Schema.fbs::Schema, it is serialized as an IPC message. #[prost(bytes="vec", tag="3")] pub parameter_schema: ::prost::alloc::vec::Vec<u8>, } /// -/// Request message for the "ClosePreparedStatement" action on a Flight SQL enabled backend. -/// Closes server resources associated with the prepared statement handle. +/// Request message for the "ClosePreparedStatement" action on a Flight SQL enabled backend. +/// Closes server resources associated with the prepared statement handle.
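A sketch of how a client might inspect an ActionCreatePreparedStatementResult once it has been decoded from the action's result bytes (IPC parsing of the embedded schemas is elided; treating an empty dataset_schema as "no result set" is an assumption here, not a guarantee of the message):

    fn describe(result: &ActionCreatePreparedStatementResult) {
        // The handle is opaque: hold it and pass it back verbatim.
        println!("handle: {} bytes", result.prepared_statement_handle.len());
        if result.dataset_schema.is_empty() {
            println!("no result-set schema was returned");
        }
        if !result.parameter_schema.is_empty() {
            println!("the statement expects bound parameters");
        }
    }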
#[derive(Clone, PartialEq, ::prost::Message)] pub struct ActionClosePreparedStatementRequest { - /// Opaque handle for the prepared statement on the server. + /// Opaque handle for the prepared statement on the server. #[prost(bytes="vec", tag="1")] pub prepared_statement_handle: ::prost::alloc::vec::Vec<u8>, } -// SQL Execution Messages. +// SQL Execution Messages. /// -/// Represents a SQL query. Used in the command member of FlightDescriptor -/// for the following RPC calls: -/// - GetSchema: return the Arrow schema of the query. -/// - GetFlightInfo: execute the query. +/// Represents a SQL query. Used in the command member of FlightDescriptor +/// for the following RPC calls: +/// - GetSchema: return the Arrow schema of the query. +/// - GetFlightInfo: execute the query. #[derive(Clone, PartialEq, ::prost::Message)] pub struct CommandStatementQuery { - /// The SQL syntax. + /// The SQL syntax. #[prost(string, tag="1")] pub query: ::prost::alloc::string::String, } -///* -/// Represents a ticket resulting from GetFlightInfo with a CommandStatementQuery. -/// This should be used only once and treated as an opaque value, that is, clients should not attempt to parse this. +/// * +/// Represents a ticket resulting from GetFlightInfo with a CommandStatementQuery. +/// This should be used only once and treated as an opaque value, that is, clients should not attempt to parse this. #[derive(Clone, PartialEq, ::prost::Message)] pub struct TicketStatementQuery { - /// Unique identifier for the instance of the statement to execute. + /// Unique identifier for the instance of the statement to execute. #[prost(bytes="vec", tag="1")] pub statement_handle: ::prost::alloc::vec::Vec<u8>, } /// -/// Represents an instance of executing a prepared statement. Used in the command member of FlightDescriptor for -/// the following RPC calls: -/// - DoPut: bind parameter values. All of the bound parameter sets will be executed as a single atomic execution. -/// - GetFlightInfo: execute the prepared statement instance. +/// Represents an instance of executing a prepared statement. Used in the command member of FlightDescriptor for +/// the following RPC calls: +/// - DoPut: bind parameter values. All of the bound parameter sets will be executed as a single atomic execution. +/// - GetFlightInfo: execute the prepared statement instance. #[derive(Clone, PartialEq, ::prost::Message)] pub struct CommandPreparedStatementQuery { - /// Opaque handle for the prepared statement on the server. + /// Opaque handle for the prepared statement on the server. #[prost(bytes="vec", tag="1")] pub prepared_statement_handle: ::prost::alloc::vec::Vec<u8>, } /// -/// Represents a SQL update query. Used in the command member of FlightDescriptor -/// for the the RPC call DoPut to cause the server to execute the included SQL update. +/// Represents a SQL update query. Used in the command member of FlightDescriptor +/// for the RPC call DoPut to cause the server to execute the included SQL update. #[derive(Clone, PartialEq, ::prost::Message)] pub struct CommandStatementUpdate { - /// The SQL syntax. + /// The SQL syntax. #[prost(string, tag="1")] pub query: ::prost::alloc::string::String, } /// -/// Represents a SQL update query. Used in the command member of FlightDescriptor -/// for the the RPC call DoPut to cause the server to execute the included -/// prepared statement handle as an update. +/// Represents a SQL update query.
Used in the command member of FlightDescriptor +/// for the RPC call DoPut to cause the server to execute the included +/// prepared statement handle as an update. #[derive(Clone, PartialEq, ::prost::Message)] pub struct CommandPreparedStatementUpdate { - /// Opaque handle for the prepared statement on the server. + /// Opaque handle for the prepared statement on the server. #[prost(bytes="vec", tag="1")] pub prepared_statement_handle: ::prost::alloc::vec::Vec<u8>, } /// -/// Returned from the RPC call DoPut when a CommandStatementUpdate -/// CommandPreparedStatementUpdate was in the request, containing -/// results from the update. +/// Returned from the RPC call DoPut when a CommandStatementUpdate or +/// CommandPreparedStatementUpdate was in the request, containing +/// results from the update. #[derive(Clone, PartialEq, ::prost::Message)] pub struct DoPutUpdateResult { - /// The number of records updated. A return value of -1 represents - /// an unknown updated record count. + /// The number of records updated. A return value of -1 represents + /// an unknown updated record count. #[prost(int64, tag="1")] pub record_count: i64, } -/// Options for CommandGetSqlInfo. +/// Options for CommandGetSqlInfo. #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] pub enum SqlInfo { - // Server Information [0-500): Provides basic information about the Flight SQL Server. + // Server Information [0-500): Provides basic information about the Flight SQL Server. - /// Retrieves a UTF-8 string with the name of the Flight SQL Server. + /// Retrieves a UTF-8 string with the name of the Flight SQL Server. FlightSqlServerName = 0, - /// Retrieves a UTF-8 string with the native version of the Flight SQL Server. + /// Retrieves a UTF-8 string with the native version of the Flight SQL Server. FlightSqlServerVersion = 1, - /// Retrieves a UTF-8 string with the Arrow format version of the Flight SQL Server. + /// Retrieves a UTF-8 string with the Arrow format version of the Flight SQL Server. FlightSqlServerArrowVersion = 2, - /// - /// Retrieves a boolean value indicating whether the Flight SQL Server is read only. + /// + /// Retrieves a boolean value indicating whether the Flight SQL Server is read only. /// - /// Returns: - /// - false: if read-write - /// - true: if read only + /// Returns: + /// - false: if read-write + /// - true: if read only FlightSqlServerReadOnly = 3, - // SQL Syntax Information [500-1000): provides information about SQL syntax supported by the Flight SQL Server. + // SQL Syntax Information [500-1000): provides information about SQL syntax supported by the Flight SQL Server. /// - /// Retrieves a boolean value indicating whether the Flight SQL Server supports CREATE and DROP of catalogs. + /// Retrieves a boolean value indicating whether the Flight SQL Server supports CREATE and DROP of catalogs. /// - /// Returns: - /// - false: if it doesn't support CREATE and DROP of catalogs. - /// - true: if it supports CREATE and DROP of catalogs. + /// Returns: + /// - false: if it doesn't support CREATE and DROP of catalogs. + /// - true: if it supports CREATE and DROP of catalogs. SqlDdlCatalog = 500, /// - /// Retrieves a boolean value indicating whether the Flight SQL Server supports CREATE and DROP of schemas. + /// Retrieves a boolean value indicating whether the Flight SQL Server supports CREATE and DROP of schemas. /// - /// Returns: - /// - false: if it doesn't support CREATE and DROP of schemas.
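Per the doc comment on DoPutUpdateResult, -1 signals an unknown update count; a small sketch of surfacing that distinction:

    fn updated_rows(result: &DoPutUpdateResult) -> Option<i64> {
        // -1 is documented as "an unknown updated record count".
        if result.record_count == -1 { None } else { Some(result.record_count) }
    }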
- /// - true: if it supports CREATE and DROP of schemas. + /// Returns: + /// - false: if it doesn't support CREATE and DROP of schemas. + /// - true: if it supports CREATE and DROP of schemas. SqlDdlSchema = 501, /// - /// Indicates whether the Flight SQL Server supports CREATE and DROP of tables. + /// Indicates whether the Flight SQL Server supports CREATE and DROP of tables. /// - /// Returns: - /// - false: if it doesn't support CREATE and DROP of tables. - /// - true: if it supports CREATE and DROP of tables. + /// Returns: + /// - false: if it doesn't support CREATE and DROP of tables. + /// - true: if it supports CREATE and DROP of tables. SqlDdlTable = 502, /// - /// Retrieves a uint32 value representing the enu uint32 ordinal for the case sensitivity of catalog, table, schema and table names. + /// Retrieves a uint32 value representing the enum ordinal for the case sensitivity of catalog, table, schema and table names. /// - /// The possible values are listed in `arrow.flight.protocol.sql.SqlSupportedCaseSensitivity`. + /// The possible values are listed in `arrow.flight.protocol.sql.SqlSupportedCaseSensitivity`. SqlIdentifierCase = 503, - /// Retrieves a UTF-8 string with the supported character(s) used to surround a delimited identifier. + /// Retrieves a UTF-8 string with the supported character(s) used to surround a delimited identifier. SqlIdentifierQuoteChar = 504, /// - /// Retrieves a uint32 value representing the enu uint32 ordinal for the case sensitivity of quoted identifiers. + /// Retrieves a uint32 value representing the enum ordinal for the case sensitivity of quoted identifiers. /// - /// The possible values are listed in `arrow.flight.protocol.sql.SqlSupportedCaseSensitivity`. + /// The possible values are listed in `arrow.flight.protocol.sql.SqlSupportedCaseSensitivity`. SqlQuotedIdentifierCase = 505, /// - /// Retrieves a boolean value indicating whether all tables are selectable. + /// Retrieves a boolean value indicating whether all tables are selectable. /// - /// Returns: - /// - false: if not all tables are selectable or if none are; - /// - true: if all tables are selectable. + /// Returns: + /// - false: if not all tables are selectable or if none are; + /// - true: if all tables are selectable. SqlAllTablesAreSelectable = 506, /// - /// Retrieves the null ordering. + /// Retrieves the null ordering. /// - /// Returns a uint32 ordinal for the null ordering being used, as described in - /// `arrow.flight.protocol.sql.SqlNullOrdering`. + /// Returns a uint32 ordinal for the null ordering being used, as described in + /// `arrow.flight.protocol.sql.SqlNullOrdering`. SqlNullOrdering = 507, - /// Retrieves a UTF-8 string list with values of the supported keywords. + /// Retrieves a UTF-8 string list with values of the supported keywords. SqlKeywords = 508, - /// Retrieves a UTF-8 string list with values of the supported numeric functions. + /// Retrieves a UTF-8 string list with values of the supported numeric functions. SqlNumericFunctions = 509, - /// Retrieves a UTF-8 string list with values of the supported string functions. + /// Retrieves a UTF-8 string list with values of the supported string functions. SqlStringFunctions = 510, - /// Retrieves a UTF-8 string list with values of the supported system functions. + /// Retrieves a UTF-8 string list with values of the supported system functions. SqlSystemFunctions = 511, - /// Retrieves a UTF-8 string list with values of the supported datetime functions.
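Because SqlInfo derives ::prost::Enumeration, a raw info number can be mapped back to a variant; a sketch (from_i32 is the accessor prost generated at this vintage; newer prost releases steer toward TryFrom instead):

    fn info_name(raw: i32) -> String {
        match SqlInfo::from_i32(raw) {
            Some(info) => format!("{:?}", info), // e.g. 503 => SqlIdentifierCase
            None => format!("unknown SqlInfo value {}", raw),
        }
    }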
+ /// Retrieves a UTF-8 string list with values of the supported datetime functions. SqlDatetimeFunctions = 512, /// - /// Retrieves the UTF-8 string that can be used to escape wildcard characters. - /// This is the string that can be used to escape '_' or '%' in the catalog search parameters that are a pattern - /// (and therefore use one of the wildcard characters). - /// The '_' character represents any single character; the '%' character represents any sequence of zero or more - /// characters. + /// Retrieves the UTF-8 string that can be used to escape wildcard characters. + /// This is the string that can be used to escape '_' or '%' in the catalog search parameters that are a pattern + /// (and therefore use one of the wildcard characters). + /// The '_' character represents any single character; the '%' character represents any sequence of zero or more + /// characters. SqlSearchStringEscape = 513, /// - /// Retrieves a UTF-8 string with all the "extra" characters that can be used in unquoted identifier names - /// (those beyond a-z, A-Z, 0-9 and _). + /// Retrieves a UTF-8 string with all the "extra" characters that can be used in unquoted identifier names + /// (those beyond a-z, A-Z, 0-9 and _). SqlExtraNameCharacters = 514, /// - /// Retrieves a boolean value indicating whether column aliasing is supported. - /// If so, the SQL AS clause can be used to provide names for computed columns or to provide alias names for columns - /// as required. + /// Retrieves a boolean value indicating whether column aliasing is supported. + /// If so, the SQL AS clause can be used to provide names for computed columns or to provide alias names for columns + /// as required. /// - /// Returns: - /// - false: if column aliasing is unsupported; - /// - true: if column aliasing is supported. + /// Returns: + /// - false: if column aliasing is unsupported; + /// - true: if column aliasing is supported. SqlSupportsColumnAliasing = 515, /// - /// Retrieves a boolean value indicating whether concatenations between null and non-null values being - /// null are supported. + /// Retrieves a boolean value indicating whether concatenations between null and non-null values being + /// null are supported. /// - /// - Returns: - /// - false: if concatenations between null and non-null values being null are unsupported; - /// - true: if concatenations between null and non-null values being null are supported. + /// Returns: + /// - false: if concatenations between null and non-null values being null are unsupported; + /// - true: if concatenations between null and non-null values being null are supported. SqlNullPlusNullIsNull = 516, /// - /// Retrieves a map where the key is the type to convert from and the value is a list with the types to convert to, - /// indicating the supported conversions. Each key and each item on the list value is a value to a predefined type on - /// SqlSupportsConvert enum. - /// The returned map will be: map<int32, list<int32>> + /// Retrieves a map where the key is the type to convert from and the value is a list with the types to convert to, - /// indicating the supported conversions. Each key and each item on the list value is a value to a predefined type on + /// SqlSupportsConvert enum. + /// The returned map will be: map<int32, list<int32>> SqlSupportsConvert = 517, /// - /// Retrieves a boolean value indicating whether, when table correlation names are supported, - /// they are restricted to being different from the names of the tables.
+ /// Retrieves a boolean value indicating whether, when table correlation names are supported, + /// they are restricted to being different from the names of the tables. /// - /// Returns: - /// - false: if table correlation names are unsupported; - /// - true: if table correlation names are supported. + /// Returns: + /// - false: if table correlation names are unsupported; + /// - true: if table correlation names are supported. SqlSupportsTableCorrelationNames = 518, /// - /// Retrieves a boolean value indicating whether, when table correlation names are supported, - /// they are restricted to being different from the names of the tables. + /// Retrieves a boolean value indicating whether, when table correlation names are supported, + /// they are restricted to being different from the names of the tables. /// - /// Returns: - /// - false: if different table correlation names are unsupported; - /// - true: if different table correlation names are supported + /// Returns: + /// - false: if different table correlation names are unsupported; + /// - true: if different table correlation names are supported SqlSupportsDifferentTableCorrelationNames = 519, /// - /// Retrieves a boolean value indicating whether expressions in ORDER BY lists are supported. + /// Retrieves a boolean value indicating whether expressions in ORDER BY lists are supported. /// - /// Returns: - /// - false: if expressions in ORDER BY are unsupported; - /// - true: if expressions in ORDER BY are supported; + /// Returns: + /// - false: if expressions in ORDER BY are unsupported; + /// - true: if expressions in ORDER BY are supported; SqlSupportsExpressionsInOrderBy = 520, /// - /// Retrieves a boolean value indicating whether using a column that is not in the SELECT statement in a GROUP BY - /// clause is supported. + /// Retrieves a boolean value indicating whether using a column that is not in the SELECT statement in a GROUP BY + /// clause is supported. /// - /// Returns: - /// - false: if using a column that is not in the SELECT statement in a GROUP BY clause is unsupported; - /// - true: if using a column that is not in the SELECT statement in a GROUP BY clause is supported. + /// Returns: + /// - false: if using a column that is not in the SELECT statement in a GROUP BY clause is unsupported; + /// - true: if using a column that is not in the SELECT statement in a GROUP BY clause is supported. SqlSupportsOrderByUnrelated = 521, /// - /// Retrieves the supported GROUP BY commands; + /// Retrieves the supported GROUP BY commands; /// - /// Returns an int32 bitmask value representing the supported commands. - /// The returned bitmask should be parsed in order to retrieve the supported commands. + /// Returns an int32 bitmask value representing the supported commands. + /// The returned bitmask should be parsed in order to retrieve the supported commands. /// - /// For instance: - /// - return 0 (\b0) => [] (GROUP BY is unsupported); - /// - return 1 (\b1) => \[SQL_GROUP_BY_UNRELATED\]; - /// - return 2 (\b10) => \[SQL_GROUP_BY_BEYOND_SELECT\]; - /// - return 3 (\b11) => [SQL_GROUP_BY_UNRELATED, SQL_GROUP_BY_BEYOND_SELECT]. - /// Valid GROUP BY types are described under `arrow.flight.protocol.sql.SqlSupportedGroupBy`. + /// For instance: + /// - return 0 (\b0) => [] (GROUP BY is unsupported); + /// - return 1 (\b1) => \[SQL_GROUP_BY_UNRELATED\]; + /// - return 2 (\b10) => \[SQL_GROUP_BY_BEYOND_SELECT\]; + /// - return 3 (\b11) => [SQL_GROUP_BY_UNRELATED, SQL_GROUP_BY_BEYOND_SELECT]. 
+ /// Valid GROUP BY types are described under `arrow.flight.protocol.sql.SqlSupportedGroupBy`. SqlSupportedGroupBy = 522, /// - /// Retrieves a boolean value indicating whether specifying a LIKE escape clause is supported. + /// Retrieves a boolean value indicating whether specifying a LIKE escape clause is supported. /// - /// Returns: - /// - false: if specifying a LIKE escape clause is unsupported; - /// - true: if specifying a LIKE escape clause is supported. + /// Returns: + /// - false: if specifying a LIKE escape clause is unsupported; + /// - true: if specifying a LIKE escape clause is supported. SqlSupportsLikeEscapeClause = 523, /// - /// Retrieves a boolean value indicating whether columns may be defined as non-nullable. + /// Retrieves a boolean value indicating whether columns may be defined as non-nullable. /// - /// Returns: - /// - false: if columns cannot be defined as non-nullable; - /// - true: if columns may be defined as non-nullable. + /// Returns: + /// - false: if columns cannot be defined as non-nullable; + /// - true: if columns may be defined as non-nullable. SqlSupportsNonNullableColumns = 524, /// - /// Retrieves the supported SQL grammar level as per the ODBC specification. - /// - /// Returns an int32 bitmask value representing the supported SQL grammar level. - /// The returned bitmask should be parsed in order to retrieve the supported grammar levels. - /// - /// For instance: - /// - return 0 (\b0) => [] (SQL grammar is unsupported); - /// - return 1 (\b1) => \[SQL_MINIMUM_GRAMMAR\]; - /// - return 2 (\b10) => \[SQL_CORE_GRAMMAR\]; - /// - return 3 (\b11) => [SQL_MINIMUM_GRAMMAR, SQL_CORE_GRAMMAR]; - /// - return 4 (\b100) => \[SQL_EXTENDED_GRAMMAR\]; - /// - return 5 (\b101) => [SQL_MINIMUM_GRAMMAR, SQL_EXTENDED_GRAMMAR]; - /// - return 6 (\b110) => [SQL_CORE_GRAMMAR, SQL_EXTENDED_GRAMMAR]; - /// - return 7 (\b111) => [SQL_MINIMUM_GRAMMAR, SQL_CORE_GRAMMAR, SQL_EXTENDED_GRAMMAR]. - /// Valid SQL grammar levels are described under `arrow.flight.protocol.sql.SupportedSqlGrammar`. + /// Retrieves the supported SQL grammar level as per the ODBC specification. + /// + /// Returns an int32 bitmask value representing the supported SQL grammar level. + /// The returned bitmask should be parsed in order to retrieve the supported grammar levels. + /// + /// For instance: + /// - return 0 (\b0) => [] (SQL grammar is unsupported); + /// - return 1 (\b1) => \[SQL_MINIMUM_GRAMMAR\]; + /// - return 2 (\b10) => \[SQL_CORE_GRAMMAR\]; + /// - return 3 (\b11) => [SQL_MINIMUM_GRAMMAR, SQL_CORE_GRAMMAR]; + /// - return 4 (\b100) => \[SQL_EXTENDED_GRAMMAR\]; + /// - return 5 (\b101) => [SQL_MINIMUM_GRAMMAR, SQL_EXTENDED_GRAMMAR]; + /// - return 6 (\b110) => [SQL_CORE_GRAMMAR, SQL_EXTENDED_GRAMMAR]; + /// - return 7 (\b111) => [SQL_MINIMUM_GRAMMAR, SQL_CORE_GRAMMAR, SQL_EXTENDED_GRAMMAR]. + /// Valid SQL grammar levels are described under `arrow.flight.protocol.sql.SupportedSqlGrammar`. SqlSupportedGrammar = 525, /// - /// Retrieves the supported ANSI92 SQL grammar level. - /// - /// Returns an int32 bitmask value representing the supported ANSI92 SQL grammar level. - /// The returned bitmask should be parsed in order to retrieve the supported commands. 
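The bitmask encodings spelled out above (for example, return 3 = \b11 meaning the two low bits are set) can all be tested the same way; a minimal sketch:

    /// True if the 0-based `flag_bit` is set in an int32 bitmask such as the
    /// SqlSupportedGroupBy or SupportedSqlGrammar values described above.
    fn bit_set(bitmask: i32, flag_bit: u32) -> bool {
        bitmask & (1i32 << flag_bit) != 0
    }

    // bit_set(3, 0) and bit_set(3, 1) are both true, matching
    // "return 3 (\b11) => [SQL_GROUP_BY_UNRELATED, SQL_GROUP_BY_BEYOND_SELECT]".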
- /// - /// For instance: - /// - return 0 (\b0) => [] (ANSI92 SQL grammar is unsupported); - /// - return 1 (\b1) => \[ANSI92_ENTRY_SQL\]; - /// - return 2 (\b10) => \[ANSI92_INTERMEDIATE_SQL\]; - /// - return 3 (\b11) => [ANSI92_ENTRY_SQL, ANSI92_INTERMEDIATE_SQL]; - /// - return 4 (\b100) => \[ANSI92_FULL_SQL\]; - /// - return 5 (\b101) => [ANSI92_ENTRY_SQL, ANSI92_FULL_SQL]; - /// - return 6 (\b110) => [ANSI92_INTERMEDIATE_SQL, ANSI92_FULL_SQL]; - /// - return 7 (\b111) => [ANSI92_ENTRY_SQL, ANSI92_INTERMEDIATE_SQL, ANSI92_FULL_SQL]. - /// Valid ANSI92 SQL grammar levels are described under `arrow.flight.protocol.sql.SupportedAnsi92SqlGrammarLevel`. + /// Retrieves the supported ANSI92 SQL grammar level. + /// + /// Returns an int32 bitmask value representing the supported ANSI92 SQL grammar level. + /// The returned bitmask should be parsed in order to retrieve the supported commands. + /// + /// For instance: + /// - return 0 (\b0) => [] (ANSI92 SQL grammar is unsupported); + /// - return 1 (\b1) => \[ANSI92_ENTRY_SQL\]; + /// - return 2 (\b10) => \[ANSI92_INTERMEDIATE_SQL\]; + /// - return 3 (\b11) => [ANSI92_ENTRY_SQL, ANSI92_INTERMEDIATE_SQL]; + /// - return 4 (\b100) => \[ANSI92_FULL_SQL\]; + /// - return 5 (\b101) => [ANSI92_ENTRY_SQL, ANSI92_FULL_SQL]; + /// - return 6 (\b110) => [ANSI92_INTERMEDIATE_SQL, ANSI92_FULL_SQL]; + /// - return 7 (\b111) => [ANSI92_ENTRY_SQL, ANSI92_INTERMEDIATE_SQL, ANSI92_FULL_SQL]. + /// Valid ANSI92 SQL grammar levels are described under `arrow.flight.protocol.sql.SupportedAnsi92SqlGrammarLevel`. SqlAnsi92SupportedLevel = 526, /// - /// Retrieves a boolean value indicating whether the SQL Integrity Enhancement Facility is supported. + /// Retrieves a boolean value indicating whether the SQL Integrity Enhancement Facility is supported. /// - /// Returns: - /// - false: if the SQL Integrity Enhancement Facility is supported; - /// - true: if the SQL Integrity Enhancement Facility is supported. + /// Returns: + /// - false: if the SQL Integrity Enhancement Facility is unsupported; + /// - true: if the SQL Integrity Enhancement Facility is supported. SqlSupportsIntegrityEnhancementFacility = 527, /// - /// Retrieves the support level for SQL OUTER JOINs. + /// Retrieves the support level for SQL OUTER JOINs. /// - /// Returns a uint3 uint32 ordinal for the SQL ordering being used, as described in - /// `arrow.flight.protocol.sql.SqlOuterJoinsSupportLevel`. + /// Returns a uint32 ordinal for the outer join support level, as described in + /// `arrow.flight.protocol.sql.SqlOuterJoinsSupportLevel`. SqlOuterJoinsSupportLevel = 528, - /// Retrieves a UTF-8 string with the preferred term for "schema". + /// Retrieves a UTF-8 string with the preferred term for "schema". SqlSchemaTerm = 529, - /// Retrieves a UTF-8 string with the preferred term for "procedure". + /// Retrieves a UTF-8 string with the preferred term for "procedure". SqlProcedureTerm = 530, - /// Retrieves a UTF-8 string with the preferred term for "catalog". + /// Retrieves a UTF-8 string with the preferred term for "catalog". SqlCatalogTerm = 531, /// - /// Retrieves a boolean value indicating whether a catalog appears at the start of a fully qualified table name. + /// Retrieves a boolean value indicating whether a catalog appears at the start of a fully qualified table name. /// - /// - false: if a catalog does not appear at the start of a fully qualified table name; - /// - true: if a catalog appears at the start of a fully qualified table name.
+ /// - false: if a catalog does not appear at the start of a fully qualified table name; + /// - true: if a catalog appears at the start of a fully qualified table name. SqlCatalogAtStart = 532, /// - /// Retrieves the supported actions for a SQL schema. - /// - /// Returns an int32 bitmask value representing the supported actions for a SQL schema. - /// The returned bitmask should be parsed in order to retrieve the supported actions for a SQL schema. - /// - /// For instance: - /// - return 0 (\b0) => [] (no supported actions for SQL schema); - /// - return 1 (\b1) => \[SQL_ELEMENT_IN_PROCEDURE_CALLS\]; - /// - return 2 (\b10) => \[SQL_ELEMENT_IN_INDEX_DEFINITIONS\]; - /// - return 3 (\b11) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_INDEX_DEFINITIONS]; - /// - return 4 (\b100) => \[SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS\]; - /// - return 5 (\b101) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]; - /// - return 6 (\b110) => [SQL_ELEMENT_IN_INDEX_DEFINITIONS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]; - /// - return 7 (\b111) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_INDEX_DEFINITIONS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]. - /// Valid actions for a SQL schema described under `arrow.flight.protocol.sql.SqlSupportedElementActions`. + /// Retrieves the supported actions for a SQL schema. + /// + /// Returns an int32 bitmask value representing the supported actions for a SQL schema. + /// The returned bitmask should be parsed in order to retrieve the supported actions for a SQL schema. + /// + /// For instance: + /// - return 0 (\b0) => [] (no supported actions for SQL schema); + /// - return 1 (\b1) => \[SQL_ELEMENT_IN_PROCEDURE_CALLS\]; + /// - return 2 (\b10) => \[SQL_ELEMENT_IN_INDEX_DEFINITIONS\]; + /// - return 3 (\b11) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_INDEX_DEFINITIONS]; + /// - return 4 (\b100) => \[SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS\]; + /// - return 5 (\b101) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]; + /// - return 6 (\b110) => [SQL_ELEMENT_IN_INDEX_DEFINITIONS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]; + /// - return 7 (\b111) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_INDEX_DEFINITIONS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]. + /// Valid actions for a SQL schema are described under `arrow.flight.protocol.sql.SqlSupportedElementActions`. SqlSchemasSupportedActions = 533, /// - /// Retrieves the supported actions for a SQL schema. - /// - /// Returns an int32 bitmask value representing the supported actions for a SQL catalog. - /// The returned bitmask should be parsed in order to retrieve the supported actions for a SQL catalog. - /// - /// For instance: - /// - return 0 (\b0) => [] (no supported actions for SQL catalog); - /// - return 1 (\b1) => \[SQL_ELEMENT_IN_PROCEDURE_CALLS\]; - /// - return 2 (\b10) => \[SQL_ELEMENT_IN_INDEX_DEFINITIONS\]; - /// - return 3 (\b11) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_INDEX_DEFINITIONS]; - /// - return 4 (\b100) => \[SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS\]; - /// - return 5 (\b101) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]; - /// - return 6 (\b110) => [SQL_ELEMENT_IN_INDEX_DEFINITIONS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]; - /// - return 7 (\b111) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_INDEX_DEFINITIONS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]. - /// Valid actions for a SQL catalog are described under `arrow.flight.protocol.sql.SqlSupportedElementActions`.
+ /// Retrieves the supported actions for a SQL catalog. + /// + /// Returns an int32 bitmask value representing the supported actions for a SQL catalog. + /// The returned bitmask should be parsed in order to retrieve the supported actions for a SQL catalog. + /// + /// For instance: + /// - return 0 (\b0) => [] (no supported actions for SQL catalog); + /// - return 1 (\b1) => \[SQL_ELEMENT_IN_PROCEDURE_CALLS\]; + /// - return 2 (\b10) => \[SQL_ELEMENT_IN_INDEX_DEFINITIONS\]; + /// - return 3 (\b11) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_INDEX_DEFINITIONS]; + /// - return 4 (\b100) => \[SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS\]; + /// - return 5 (\b101) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]; + /// - return 6 (\b110) => [SQL_ELEMENT_IN_INDEX_DEFINITIONS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]; + /// - return 7 (\b111) => [SQL_ELEMENT_IN_PROCEDURE_CALLS, SQL_ELEMENT_IN_INDEX_DEFINITIONS, SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS]. + /// Valid actions for a SQL catalog are described under `arrow.flight.protocol.sql.SqlSupportedElementActions`. SqlCatalogsSupportedActions = 534, /// - /// Retrieves the supported SQL positioned commands. + /// Retrieves the supported SQL positioned commands. /// - /// Returns an int32 bitmask value representing the supported SQL positioned commands. - /// The returned bitmask should be parsed in order to retrieve the supported SQL positioned commands. + /// Returns an int32 bitmask value representing the supported SQL positioned commands. + /// The returned bitmask should be parsed in order to retrieve the supported SQL positioned commands. /// - /// For instance: - /// - return 0 (\b0) => [] (no supported SQL positioned commands); - /// - return 1 (\b1) => \[SQL_POSITIONED_DELETE\]; - /// - return 2 (\b10) => \[SQL_POSITIONED_UPDATE\]; - /// - return 3 (\b11) => [SQL_POSITIONED_DELETE, SQL_POSITIONED_UPDATE]. - /// Valid SQL positioned commands are described under `arrow.flight.protocol.sql.SqlSupportedPositionedCommands`. + /// For instance: + /// - return 0 (\b0) => [] (no supported SQL positioned commands); + /// - return 1 (\b1) => \[SQL_POSITIONED_DELETE\]; + /// - return 2 (\b10) => \[SQL_POSITIONED_UPDATE\]; + /// - return 3 (\b11) => [SQL_POSITIONED_DELETE, SQL_POSITIONED_UPDATE]. + /// Valid SQL positioned commands are described under `arrow.flight.protocol.sql.SqlSupportedPositionedCommands`. SqlSupportedPositionedCommands = 535, /// - /// Retrieves a boolean value indicating whether SELECT FOR UPDATE statements are supported. + /// Retrieves a boolean value indicating whether SELECT FOR UPDATE statements are supported. /// - /// Returns: - /// - false: if SELECT FOR UPDATE statements are unsupported; - /// - true: if SELECT FOR UPDATE statements are supported. + /// Returns: + /// - false: if SELECT FOR UPDATE statements are unsupported; + /// - true: if SELECT FOR UPDATE statements are supported. SqlSelectForUpdateSupported = 536, /// - /// Retrieves a boolean value indicating whether stored procedure calls that use the stored procedure escape syntax - /// are supported. + /// Retrieves a boolean value indicating whether stored procedure calls that use the stored procedure escape syntax + /// are supported. /// - /// Returns: - /// - false: if stored procedure calls that use the stored procedure escape syntax are unsupported; - /// - true: if stored procedure calls that use the stored procedure escape syntax are supported.
+ /// Returns: + /// - false: if stored procedure calls that use the stored procedure escape syntax are unsupported; + /// - true: if stored procedure calls that use the stored procedure escape syntax are supported. SqlStoredProceduresSupported = 537, /// - /// Retrieves the supported SQL subqueries. - /// - /// Returns an int32 bitmask value representing the supported SQL subqueries. - /// The returned bitmask should be parsed in order to retrieve the supported SQL subqueries. - /// - /// For instance: - /// - return 0 (\b0) => [] (no supported SQL subqueries); - /// - return 1 (\b1) => \[SQL_SUBQUERIES_IN_COMPARISONS\]; - /// - return 2 (\b10) => \[SQL_SUBQUERIES_IN_EXISTS\]; - /// - return 3 (\b11) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_EXISTS]; - /// - return 4 (\b100) => \[SQL_SUBQUERIES_IN_INS\]; - /// - return 5 (\b101) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_INS]; - /// - return 6 (\b110) => [SQL_SUBQUERIES_IN_INS, SQL_SUBQUERIES_IN_EXISTS]; - /// - return 7 (\b111) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_EXISTS, SQL_SUBQUERIES_IN_INS]; - /// - return 8 (\b1000) => \[SQL_SUBQUERIES_IN_QUANTIFIEDS\]; - /// - return 9 (\b1001) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; - /// - return 10 (\b1010) => [SQL_SUBQUERIES_IN_EXISTS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; - /// - return 11 (\b1011) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_EXISTS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; - /// - return 12 (\b1100) => [SQL_SUBQUERIES_IN_INS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; - /// - return 13 (\b1101) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_INS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; - /// - return 14 (\b1110) => [SQL_SUBQUERIES_IN_EXISTS, SQL_SUBQUERIES_IN_INS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; - /// - return 15 (\b1111) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_EXISTS, SQL_SUBQUERIES_IN_INS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; - /// - ... - /// Valid SQL subqueries are described under `arrow.flight.protocol.sql.SqlSupportedSubqueries`. + /// Retrieves the supported SQL subqueries. + /// + /// Returns an int32 bitmask value representing the supported SQL subqueries. + /// The returned bitmask should be parsed in order to retrieve the supported SQL subqueries. 
+ /// + /// For instance: + /// - return 0 (\b0) => [] (no supported SQL subqueries); + /// - return 1 (\b1) => \[SQL_SUBQUERIES_IN_COMPARISONS\]; + /// - return 2 (\b10) => \[SQL_SUBQUERIES_IN_EXISTS\]; + /// - return 3 (\b11) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_EXISTS]; + /// - return 4 (\b100) => \[SQL_SUBQUERIES_IN_INS\]; + /// - return 5 (\b101) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_INS]; + /// - return 6 (\b110) => [SQL_SUBQUERIES_IN_INS, SQL_SUBQUERIES_IN_EXISTS]; + /// - return 7 (\b111) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_EXISTS, SQL_SUBQUERIES_IN_INS]; + /// - return 8 (\b1000) => \[SQL_SUBQUERIES_IN_QUANTIFIEDS\]; + /// - return 9 (\b1001) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; + /// - return 10 (\b1010) => [SQL_SUBQUERIES_IN_EXISTS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; + /// - return 11 (\b1011) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_EXISTS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; + /// - return 12 (\b1100) => [SQL_SUBQUERIES_IN_INS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; + /// - return 13 (\b1101) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_INS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; + /// - return 14 (\b1110) => [SQL_SUBQUERIES_IN_EXISTS, SQL_SUBQUERIES_IN_INS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; + /// - return 15 (\b1111) => [SQL_SUBQUERIES_IN_COMPARISONS, SQL_SUBQUERIES_IN_EXISTS, SQL_SUBQUERIES_IN_INS, SQL_SUBQUERIES_IN_QUANTIFIEDS]; + /// - ... + /// Valid SQL subqueries are described under `arrow.flight.protocol.sql.SqlSupportedSubqueries`. SqlSupportedSubqueries = 538, /// - /// Retrieves a boolean value indicating whether correlated subqueries are supported. + /// Retrieves a boolean value indicating whether correlated subqueries are supported. /// - /// Returns: - /// - false: if correlated subqueries are unsupported; - /// - true: if correlated subqueries are supported. + /// Returns: + /// - false: if correlated subqueries are unsupported; + /// - true: if correlated subqueries are supported. SqlCorrelatedSubqueriesSupported = 539, /// - /// Retrieves the supported SQL UNIONs. + /// Retrieves the supported SQL UNIONs. /// - /// Returns an int32 bitmask value representing the supported SQL UNIONs. - /// The returned bitmask should be parsed in order to retrieve the supported SQL UNIONs. + /// Returns an int32 bitmask value representing the supported SQL UNIONs. + /// The returned bitmask should be parsed in order to retrieve the supported SQL UNIONs. /// - /// For instance: - /// - return 0 (\b0) => [] (no supported SQL positioned commands); - /// - return 1 (\b1) => \[SQL_UNION\]; - /// - return 2 (\b10) => \[SQL_UNION_ALL\]; - /// - return 3 (\b11) => [SQL_UNION, SQL_UNION_ALL]. - /// Valid SQL positioned commands are described under `arrow.flight.protocol.sql.SqlSupportedUnions`. + /// For instance: + /// - return 0 (\b0) => [] (no supported SQL UNIONs); + /// - return 1 (\b1) => \[SQL_UNION\]; + /// - return 2 (\b10) => \[SQL_UNION_ALL\]; + /// - return 3 (\b11) => [SQL_UNION, SQL_UNION_ALL]. + /// Valid SQL UNIONs are described under `arrow.flight.protocol.sql.SqlSupportedUnions`. SqlSupportedUnions = 540, - /// Retrieves a uint32 value representing the maximum number of hex characters allowed in an inline binary literal. + /// Retrieves a uint32 value representing the maximum number of hex characters allowed in an inline binary literal.
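Applied to SqlSupportedUnions, the same bit test recovers the documented combinations; an illustrative sketch:

    fn supported_unions(bitmask: i32) -> Vec<&'static str> {
        let mut out = Vec::new();
        if bitmask & 0b01 != 0 { out.push("SQL_UNION"); }
        if bitmask & 0b10 != 0 { out.push("SQL_UNION_ALL"); }
        out // e.g. 3 => ["SQL_UNION", "SQL_UNION_ALL"], as listed above
    }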
SqlMaxBinaryLiteralLength = 541, - /// Retrieves a uint32 value representing the maximum number of characters allowed for a character literal. + /// Retrieves a uint32 value representing the maximum number of characters allowed for a character literal. SqlMaxCharLiteralLength = 542, - /// Retrieves a uint32 value representing the maximum number of characters allowed for a column name. + /// Retrieves a uint32 value representing the maximum number of characters allowed for a column name. SqlMaxColumnNameLength = 543, - /// Retrieves a uint32 value representing the the maximum number of columns allowed in a GROUP BY clause. + /// Retrieves a uint32 value representing the maximum number of columns allowed in a GROUP BY clause. SqlMaxColumnsInGroupBy = 544, - /// Retrieves a uint32 value representing the maximum number of columns allowed in an index. + /// Retrieves a uint32 value representing the maximum number of columns allowed in an index. SqlMaxColumnsInIndex = 545, - /// Retrieves a uint32 value representing the maximum number of columns allowed in an ORDER BY clause. + /// Retrieves a uint32 value representing the maximum number of columns allowed in an ORDER BY clause. SqlMaxColumnsInOrderBy = 546, - /// Retrieves a uint32 value representing the maximum number of columns allowed in a SELECT list. + /// Retrieves a uint32 value representing the maximum number of columns allowed in a SELECT list. SqlMaxColumnsInSelect = 547, - /// Retrieves a uint32 value representing the maximum number of columns allowed in a table. + /// Retrieves a uint32 value representing the maximum number of columns allowed in a table. SqlMaxColumnsInTable = 548, - /// Retrieves a uint32 value representing the maximum number of concurrent connections possible. + /// Retrieves a uint32 value representing the maximum number of concurrent connections possible. SqlMaxConnections = 549, - /// Retrieves a uint32 value the maximum number of characters allowed in a cursor name. + /// Retrieves a uint32 value representing the maximum number of characters allowed in a cursor name. SqlMaxCursorNameLength = 550, /// - /// Retrieves a uint32 value representing the maximum number of bytes allowed for an index, - /// including all of the parts of the index. + /// Retrieves a uint32 value representing the maximum number of bytes allowed for an index, + /// including all of the parts of the index. SqlMaxIndexLength = 551, - /// Retrieves a uint32 value representing the maximum number of characters allowed in a schema name. + /// Retrieves a uint32 value representing the maximum number of characters allowed in a schema name. SqlDbSchemaNameLength = 552, - /// Retrieves a uint32 value representing the maximum number of characters allowed in a procedure name. + /// Retrieves a uint32 value representing the maximum number of characters allowed in a procedure name. SqlMaxProcedureNameLength = 553, - /// Retrieves a uint32 value representing the maximum number of characters allowed in a catalog name. + /// Retrieves a uint32 value representing the maximum number of characters allowed in a catalog name. SqlMaxCatalogNameLength = 554, - /// Retrieves a uint32 value representing the maximum number of bytes allowed in a single row. + /// Retrieves a uint32 value representing the maximum number of bytes allowed in a single row. SqlMaxRowSize = 555, /// - /// Retrieves a boolean indicating whether the return value for the JDBC method getMaxRowSize includes the SQL - /// data types LONGVARCHAR and LONGVARBINARY.
+ /// Retrieves a boolean indicating whether the return value for the JDBC method getMaxRowSize includes the SQL + /// data types LONGVARCHAR and LONGVARBINARY. /// - /// Returns: - /// - false: if return value for the JDBC method getMaxRowSize does - /// not include the SQL data types LONGVARCHAR and LONGVARBINARY; - /// - true: if return value for the JDBC method getMaxRowSize includes - /// the SQL data types LONGVARCHAR and LONGVARBINARY. + /// Returns: + /// - false: if return value for the JDBC method getMaxRowSize does + /// not include the SQL data types LONGVARCHAR and LONGVARBINARY; + /// - true: if return value for the JDBC method getMaxRowSize includes + /// the SQL data types LONGVARCHAR and LONGVARBINARY. SqlMaxRowSizeIncludesBlobs = 556, /// - /// Retrieves a uint32 value representing the maximum number of characters allowed for an SQL statement; - /// a result of 0 (zero) means that there is no limit or the limit is not known. + /// Retrieves a uint32 value representing the maximum number of characters allowed for an SQL statement; + /// a result of 0 (zero) means that there is no limit or the limit is not known. SqlMaxStatementLength = 557, - /// Retrieves a uint32 value representing the maximum number of active statements that can be open at the same time. + /// Retrieves a uint32 value representing the maximum number of active statements that can be open at the same time. SqlMaxStatements = 558, - /// Retrieves a uint32 value representing the maximum number of characters allowed in a table name. + /// Retrieves a uint32 value representing the maximum number of characters allowed in a table name. SqlMaxTableNameLength = 559, - /// Retrieves a uint32 value representing the maximum number of tables allowed in a SELECT statement. + /// Retrieves a uint32 value representing the maximum number of tables allowed in a SELECT statement. SqlMaxTablesInSelect = 560, - /// Retrieves a uint32 value representing the maximum number of characters allowed in a user name. + /// Retrieves a uint32 value representing the maximum number of characters allowed in a user name. SqlMaxUsernameLength = 561, /// - /// Retrieves this database's default transaction isolation level as described in - /// `arrow.flight.protocol.sql.SqlTransactionIsolationLevel`. + /// Retrieves this database's default transaction isolation level as described in + /// `arrow.flight.protocol.sql.SqlTransactionIsolationLevel`. /// - /// Returns a uint32 ordinal for the SQL transaction isolation level. + /// Returns a uint32 ordinal for the SQL transaction isolation level. SqlDefaultTransactionIsolation = 562, /// - /// Retrieves a boolean value indicating whether transactions are supported. If not, invoking the method commit is a - /// noop, and the isolation level is `arrow.flight.protocol.sql.SqlTransactionIsolationLevel.TRANSACTION_NONE`. + /// Retrieves a boolean value indicating whether transactions are supported. If not, invoking the method commit is a + /// noop, and the isolation level is `arrow.flight.protocol.sql.SqlTransactionIsolationLevel.TRANSACTION_NONE`. /// - /// Returns: - /// - false: if transactions are unsupported; - /// - true: if transactions are supported. + /// Returns: + /// - false: if transactions are unsupported; + /// - true: if transactions are supported. SqlTransactionsSupported = 563, /// - /// Retrieves the supported transactions isolation levels. - /// - /// Returns an int32 bitmask value representing the supported transactions isolation levels. 
- /// The returned bitmask should be parsed in order to retrieve the supported transactions isolation levels. - /// - /// For instance: - /// - return 0 (\b0) => [] (no supported SQL transactions isolation levels); - /// - return 1 (\b1) => \[SQL_TRANSACTION_NONE\]; - /// - return 2 (\b10) => \[SQL_TRANSACTION_READ_UNCOMMITTED\]; - /// - return 3 (\b11) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_READ_UNCOMMITTED]; - /// - return 4 (\b100) => \[SQL_TRANSACTION_REPEATABLE_READ\]; - /// - return 5 (\b101) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_REPEATABLE_READ]; - /// - return 6 (\b110) => [SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ]; - /// - return 7 (\b111) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ]; - /// - return 8 (\b1000) => \[SQL_TRANSACTION_REPEATABLE_READ\]; - /// - return 9 (\b1001) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_REPEATABLE_READ]; - /// - return 10 (\b1010) => [SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ]; - /// - return 11 (\b1011) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ]; - /// - return 12 (\b1100) => [SQL_TRANSACTION_REPEATABLE_READ, SQL_TRANSACTION_REPEATABLE_READ]; - /// - return 13 (\b1101) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_REPEATABLE_READ, SQL_TRANSACTION_REPEATABLE_READ]; - /// - return 14 (\b1110) => [SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ, SQL_TRANSACTION_REPEATABLE_READ]; - /// - return 15 (\b1111) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ, SQL_TRANSACTION_REPEATABLE_READ]; - /// - return 16 (\b10000) => \[SQL_TRANSACTION_SERIALIZABLE\]; - /// - ... - /// Valid SQL positioned commands are described under `arrow.flight.protocol.sql.SqlTransactionIsolationLevel`. + /// Retrieves the supported transaction isolation levels. + /// + /// Returns an int32 bitmask value representing the supported transaction isolation levels. + /// The returned bitmask should be parsed in order to retrieve the supported transaction isolation levels.
+ /// + /// For instance: + /// - return 0 (\b0) => [] (no supported SQL transaction isolation levels); + /// - return 1 (\b1) => \[SQL_TRANSACTION_NONE\]; + /// - return 2 (\b10) => \[SQL_TRANSACTION_READ_UNCOMMITTED\]; + /// - return 3 (\b11) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_READ_UNCOMMITTED]; + /// - return 4 (\b100) => \[SQL_TRANSACTION_READ_COMMITTED\]; + /// - return 5 (\b101) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_READ_COMMITTED]; + /// - return 6 (\b110) => [SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_READ_COMMITTED]; + /// - return 7 (\b111) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_READ_COMMITTED]; + /// - return 8 (\b1000) => \[SQL_TRANSACTION_REPEATABLE_READ\]; + /// - return 9 (\b1001) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_REPEATABLE_READ]; + /// - return 10 (\b1010) => [SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ]; + /// - return 11 (\b1011) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_REPEATABLE_READ]; + /// - return 12 (\b1100) => [SQL_TRANSACTION_READ_COMMITTED, SQL_TRANSACTION_REPEATABLE_READ]; + /// - return 13 (\b1101) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_READ_COMMITTED, SQL_TRANSACTION_REPEATABLE_READ]; + /// - return 14 (\b1110) => [SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_READ_COMMITTED, SQL_TRANSACTION_REPEATABLE_READ]; + /// - return 15 (\b1111) => [SQL_TRANSACTION_NONE, SQL_TRANSACTION_READ_UNCOMMITTED, SQL_TRANSACTION_READ_COMMITTED, SQL_TRANSACTION_REPEATABLE_READ]; + /// - return 16 (\b10000) => \[SQL_TRANSACTION_SERIALIZABLE\]; + /// - ... + /// Valid SQL transaction isolation levels are described under `arrow.flight.protocol.sql.SqlTransactionIsolationLevel`. SqlSupportedTransactionsIsolationLevels = 564, /// - /// Retrieves a boolean value indicating whether a data definition statement within a transaction forces - /// the transaction to commit. + /// Retrieves a boolean value indicating whether a data definition statement within a transaction forces + /// the transaction to commit. /// - /// Returns: - /// - false: if a data definition statement within a transaction does not force the transaction to commit; - /// - true: if a data definition statement within a transaction forces the transaction to commit. + /// Returns: + /// - false: if a data definition statement within a transaction does not force the transaction to commit; + /// - true: if a data definition statement within a transaction forces the transaction to commit. SqlDataDefinitionCausesTransactionCommit = 565, /// - /// Retrieves a boolean value indicating whether a data definition statement within a transaction is ignored. + /// Retrieves a boolean value indicating whether a data definition statement within a transaction is ignored. /// - /// Returns: - /// - false: if a data definition statement within a transaction is taken into account; - /// - true: a data definition statement within a transaction is ignored. + /// Returns: + /// - false: if a data definition statement within a transaction is taken into account; + /// - true: if a data definition statement within a transaction is ignored. SqlDataDefinitionsInTransactionsIgnored = 566,
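Going the other direction, a server advertising its isolation levels builds the mask by OR-ing one bit per supported level. A minimal sketch (illustrative only, assuming the generated enum is in scope via `arrow_flight::sql`):

use arrow_flight::sql::SqlTransactionIsolationLevel;

// Each supported level contributes bit (1 << ordinal): READ_COMMITTED has
// ordinal 2 and therefore sets 0b100, matching the corrected table above.
fn isolation_bitmask(levels: &[SqlTransactionIsolationLevel]) -> i32 {
    levels.iter().fold(0, |mask, level| mask | (1 << (*level as i32)))
}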
/// - /// Retrieves an int32 bitmask value representing the supported result set types. - /// The returned bitmask should be parsed in order to retrieve the supported result set types. - /// - /// For instance: - /// - return 0 (\b0) => [] (no supported result set types); - /// - return 1 (\b1) => \[SQL_RESULT_SET_TYPE_UNSPECIFIED\]; - /// - return 2 (\b10) => \[SQL_RESULT_SET_TYPE_FORWARD_ONLY\]; - /// - return 3 (\b11) => [SQL_RESULT_SET_TYPE_UNSPECIFIED, SQL_RESULT_SET_TYPE_FORWARD_ONLY]; - /// - return 4 (\b100) => \[SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE\]; - /// - return 5 (\b101) => [SQL_RESULT_SET_TYPE_UNSPECIFIED, SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE]; - /// - return 6 (\b110) => [SQL_RESULT_SET_TYPE_FORWARD_ONLY, SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE]; - /// - return 7 (\b111) => [SQL_RESULT_SET_TYPE_UNSPECIFIED, SQL_RESULT_SET_TYPE_FORWARD_ONLY, SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE]; - /// - return 8 (\b1000) => \[SQL_RESULT_SET_TYPE_SCROLL_SENSITIVE\]; - /// - ... - /// Valid result set types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetType`. + /// Retrieves an int32 bitmask value representing the supported result set types. + /// The returned bitmask should be parsed in order to retrieve the supported result set types. + /// + /// For instance: + /// - return 0 (\b0) => [] (no supported result set types); + /// - return 1 (\b1) => \[SQL_RESULT_SET_TYPE_UNSPECIFIED\]; + /// - return 2 (\b10) => \[SQL_RESULT_SET_TYPE_FORWARD_ONLY\]; + /// - return 3 (\b11) => [SQL_RESULT_SET_TYPE_UNSPECIFIED, SQL_RESULT_SET_TYPE_FORWARD_ONLY]; + /// - return 4 (\b100) => \[SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE\]; + /// - return 5 (\b101) => [SQL_RESULT_SET_TYPE_UNSPECIFIED, SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE]; + /// - return 6 (\b110) => [SQL_RESULT_SET_TYPE_FORWARD_ONLY, SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE]; + /// - return 7 (\b111) => [SQL_RESULT_SET_TYPE_UNSPECIFIED, SQL_RESULT_SET_TYPE_FORWARD_ONLY, SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE]; + /// - return 8 (\b1000) => \[SQL_RESULT_SET_TYPE_SCROLL_SENSITIVE\]; + /// - ... + /// Valid result set types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetType`. SqlSupportedResultSetTypes = 567, /// - /// Returns an int32 bitmask value concurrency types supported for - /// `arrow.flight.protocol.sql.SqlSupportedResultSetType.SQL_RESULT_SET_TYPE_UNSPECIFIED`. - /// - /// For instance: - /// - return 0 (\b0) => [] (no supported concurrency types for this result set type) - /// - return 1 (\b1) => \[SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED\] - /// - return 2 (\b10) => \[SQL_RESULT_SET_CONCURRENCY_READ_ONLY\] - /// - return 3 (\b11) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY] - /// - return 4 (\b100) => \[SQL_RESULT_SET_CONCURRENCY_UPDATABLE\] - /// - return 5 (\b101) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - /// - return 6 (\b110) => [SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - /// - return 7 (\b111) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - /// Valid result set types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetConcurrency`. + /// Returns an int32 bitmask value representing the concurrency types supported for + /// `arrow.flight.protocol.sql.SqlSupportedResultSetType.SQL_RESULT_SET_TYPE_UNSPECIFIED`.
+ /// + /// For instance: + /// - return 0 (\b0) => [] (no supported concurrency types for this result set type) + /// - return 1 (\b1) => \[SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED\] + /// - return 2 (\b10) => \[SQL_RESULT_SET_CONCURRENCY_READ_ONLY\] + /// - return 3 (\b11) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY] + /// - return 4 (\b100) => \[SQL_RESULT_SET_CONCURRENCY_UPDATABLE\] + /// - return 5 (\b101) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] + /// - return 6 (\b110) => [SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] + /// - return 7 (\b111) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] + /// Valid result set concurrency types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetConcurrency`. SqlSupportedConcurrenciesForResultSetUnspecified = 568, /// - /// Returns an int32 bitmask value concurrency types supported for - /// `arrow.flight.protocol.sql.SqlSupportedResultSetType.SQL_RESULT_SET_TYPE_FORWARD_ONLY`. - /// - /// For instance: - /// - return 0 (\b0) => [] (no supported concurrency types for this result set type) - /// - return 1 (\b1) => \[SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED\] - /// - return 2 (\b10) => \[SQL_RESULT_SET_CONCURRENCY_READ_ONLY\] - /// - return 3 (\b11) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY] - /// - return 4 (\b100) => \[SQL_RESULT_SET_CONCURRENCY_UPDATABLE\] - /// - return 5 (\b101) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - /// - return 6 (\b110) => [SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - /// - return 7 (\b111) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - /// Valid result set types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetConcurrency`. + /// Returns an int32 bitmask value representing the concurrency types supported for + /// `arrow.flight.protocol.sql.SqlSupportedResultSetType.SQL_RESULT_SET_TYPE_FORWARD_ONLY`. + /// + /// For instance: + /// - return 0 (\b0) => [] (no supported concurrency types for this result set type) + /// - return 1 (\b1) => \[SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED\] + /// - return 2 (\b10) => \[SQL_RESULT_SET_CONCURRENCY_READ_ONLY\] + /// - return 3 (\b11) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY] + /// - return 4 (\b100) => \[SQL_RESULT_SET_CONCURRENCY_UPDATABLE\] + /// - return 5 (\b101) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] + /// - return 6 (\b110) => [SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] + /// - return 7 (\b111) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] + /// Valid result set concurrency types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetConcurrency`. SqlSupportedConcurrenciesForResultSetForwardOnly = 569, /// - /// Returns an int32 bitmask value concurrency types supported for - /// `arrow.flight.protocol.sql.SqlSupportedResultSetType.SQL_RESULT_SET_TYPE_SCROLL_SENSITIVE`.
- /// - /// For instance: - /// - return 0 (\b0) => [] (no supported concurrency types for this result set type) - /// - return 1 (\b1) => \[SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED\] - /// - return 2 (\b10) => \[SQL_RESULT_SET_CONCURRENCY_READ_ONLY\] - /// - return 3 (\b11) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY] - /// - return 4 (\b100) => \[SQL_RESULT_SET_CONCURRENCY_UPDATABLE\] - /// - return 5 (\b101) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - /// - return 6 (\b110) => [SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - /// - return 7 (\b111) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - /// Valid result set types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetConcurrency`. + /// Returns an int32 bitmask value representing the concurrency types supported for + /// `arrow.flight.protocol.sql.SqlSupportedResultSetType.SQL_RESULT_SET_TYPE_SCROLL_SENSITIVE`. + /// + /// For instance: + /// - return 0 (\b0) => [] (no supported concurrency types for this result set type) + /// - return 1 (\b1) => \[SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED\] + /// - return 2 (\b10) => \[SQL_RESULT_SET_CONCURRENCY_READ_ONLY\] + /// - return 3 (\b11) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY] + /// - return 4 (\b100) => \[SQL_RESULT_SET_CONCURRENCY_UPDATABLE\] + /// - return 5 (\b101) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] + /// - return 6 (\b110) => [SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] + /// - return 7 (\b111) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] + /// Valid result set concurrency types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetConcurrency`. SqlSupportedConcurrenciesForResultSetScrollSensitive = 570, /// - /// Returns an int32 bitmask value concurrency types supported for - /// `arrow.flight.protocol.sql.SqlSupportedResultSetType.SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE`. - /// - /// For instance: - /// - return 0 (\b0) => [] (no supported concurrency types for this result set type) - /// - return 1 (\b1) => \[SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED\] - /// - return 2 (\b10) => \[SQL_RESULT_SET_CONCURRENCY_READ_ONLY\] - /// - return 3 (\b11) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY] - /// - return 4 (\b100) => \[SQL_RESULT_SET_CONCURRENCY_UPDATABLE\] - /// - return 5 (\b101) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - /// - return 6 (\b110) => [SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - /// - return 7 (\b111) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] - /// Valid result set types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetConcurrency`. + /// Returns an int32 bitmask value representing the concurrency types supported for + /// `arrow.flight.protocol.sql.SqlSupportedResultSetType.SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE`.
+ /// + /// For instance: + /// - return 0 (\b0) => [] (no supported concurrency types for this result set type) + /// - return 1 (\b1) => \[SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED\] + /// - return 2 (\b10) => \[SQL_RESULT_SET_CONCURRENCY_READ_ONLY\] + /// - return 3 (\b11) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY] + /// - return 4 (\b100) => \[SQL_RESULT_SET_CONCURRENCY_UPDATABLE\] + /// - return 5 (\b101) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] + /// - return 6 (\b110) => [SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] + /// - return 7 (\b111) => [SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED, SQL_RESULT_SET_CONCURRENCY_READ_ONLY, SQL_RESULT_SET_CONCURRENCY_UPDATABLE] + /// Valid result set concurrency types are described under `arrow.flight.protocol.sql.SqlSupportedResultSetConcurrency`. SqlSupportedConcurrenciesForResultSetScrollInsensitive = 571, /// - /// Retrieves a boolean value indicating whether this database supports batch updates. + /// Retrieves a boolean value indicating whether this database supports batch updates. /// - /// - false: if this database does not support batch updates; - /// - true: if this database supports batch updates. + /// Returns: + /// - false: if this database does not support batch updates; + /// - true: if this database supports batch updates. SqlBatchUpdatesSupported = 572, /// - /// Retrieves a boolean value indicating whether this database supports savepoints. + /// Retrieves a boolean value indicating whether this database supports savepoints. /// - /// Returns: - /// - false: if this database does not support savepoints; - /// - true: if this database supports savepoints. + /// Returns: + /// - false: if this database does not support savepoints; + /// - true: if this database supports savepoints. SqlSavepointsSupported = 573, /// - /// Retrieves a boolean value indicating whether named parameters are supported in callable statements. + /// Retrieves a boolean value indicating whether named parameters are supported in callable statements. /// - /// Returns: - /// - false: if named parameters in callable statements are unsupported; - /// - true: if named parameters in callable statements are supported. + /// Returns: + /// - false: if named parameters in callable statements are unsupported; + /// - true: if named parameters in callable statements are supported. SqlNamedParametersSupported = 574, /// - /// Retrieves a boolean value indicating whether updates made to a LOB are made on a copy or directly to the LOB. + /// Retrieves a boolean value indicating whether updates made to a LOB are made on a copy or directly to the LOB. /// - /// Returns: - /// - false: if updates made to a LOB are made directly to the LOB; - /// - true: if updates made to a LOB are made on a copy. + /// Returns: + /// - false: if updates made to a LOB are made directly to the LOB; + /// - true: if updates made to a LOB are made on a copy. SqlLocatorsUpdateCopy = 575, /// - /// Retrieves a boolean value indicating whether invoking user-defined or vendor functions - /// using the stored procedure escape syntax is supported. + /// Retrieves a boolean value indicating whether invoking user-defined or vendor functions + /// using the stored procedure escape syntax is supported.
/// - /// Returns: - /// - false: if invoking user-defined or vendor functions using the stored procedure escape syntax is unsupported; - /// - true: if invoking user-defined or vendor functions using the stored procedure escape syntax is supported. + /// Returns: + /// - false: if invoking user-defined or vendor functions using the stored procedure escape syntax is unsupported; + /// - true: if invoking user-defined or vendor functions using the stored procedure escape syntax is supported. SqlStoredFunctionsUsingCallSyntaxSupported = 576, } +impl SqlInfo { + /// String value of the enum field names used in the ProtoBuf definition. + /// + /// The values are not transformed in any way and thus are considered stable + /// (if the ProtoBuf definition does not change) and safe for programmatic use. + pub fn as_str_name(&self) -> &'static str { + match self { + SqlInfo::FlightSqlServerName => "FLIGHT_SQL_SERVER_NAME", + SqlInfo::FlightSqlServerVersion => "FLIGHT_SQL_SERVER_VERSION", + SqlInfo::FlightSqlServerArrowVersion => "FLIGHT_SQL_SERVER_ARROW_VERSION", + SqlInfo::FlightSqlServerReadOnly => "FLIGHT_SQL_SERVER_READ_ONLY", + SqlInfo::SqlDdlCatalog => "SQL_DDL_CATALOG", + SqlInfo::SqlDdlSchema => "SQL_DDL_SCHEMA", + SqlInfo::SqlDdlTable => "SQL_DDL_TABLE", + SqlInfo::SqlIdentifierCase => "SQL_IDENTIFIER_CASE", + SqlInfo::SqlIdentifierQuoteChar => "SQL_IDENTIFIER_QUOTE_CHAR", + SqlInfo::SqlQuotedIdentifierCase => "SQL_QUOTED_IDENTIFIER_CASE", + SqlInfo::SqlAllTablesAreSelectable => "SQL_ALL_TABLES_ARE_SELECTABLE", + SqlInfo::SqlNullOrdering => "SQL_NULL_ORDERING", + SqlInfo::SqlKeywords => "SQL_KEYWORDS", + SqlInfo::SqlNumericFunctions => "SQL_NUMERIC_FUNCTIONS", + SqlInfo::SqlStringFunctions => "SQL_STRING_FUNCTIONS", + SqlInfo::SqlSystemFunctions => "SQL_SYSTEM_FUNCTIONS", + SqlInfo::SqlDatetimeFunctions => "SQL_DATETIME_FUNCTIONS", + SqlInfo::SqlSearchStringEscape => "SQL_SEARCH_STRING_ESCAPE", + SqlInfo::SqlExtraNameCharacters => "SQL_EXTRA_NAME_CHARACTERS", + SqlInfo::SqlSupportsColumnAliasing => "SQL_SUPPORTS_COLUMN_ALIASING", + SqlInfo::SqlNullPlusNullIsNull => "SQL_NULL_PLUS_NULL_IS_NULL", + SqlInfo::SqlSupportsConvert => "SQL_SUPPORTS_CONVERT", + SqlInfo::SqlSupportsTableCorrelationNames => "SQL_SUPPORTS_TABLE_CORRELATION_NAMES", + SqlInfo::SqlSupportsDifferentTableCorrelationNames => "SQL_SUPPORTS_DIFFERENT_TABLE_CORRELATION_NAMES", + SqlInfo::SqlSupportsExpressionsInOrderBy => "SQL_SUPPORTS_EXPRESSIONS_IN_ORDER_BY", + SqlInfo::SqlSupportsOrderByUnrelated => "SQL_SUPPORTS_ORDER_BY_UNRELATED", + SqlInfo::SqlSupportedGroupBy => "SQL_SUPPORTED_GROUP_BY", + SqlInfo::SqlSupportsLikeEscapeClause => "SQL_SUPPORTS_LIKE_ESCAPE_CLAUSE", + SqlInfo::SqlSupportsNonNullableColumns => "SQL_SUPPORTS_NON_NULLABLE_COLUMNS", + SqlInfo::SqlSupportedGrammar => "SQL_SUPPORTED_GRAMMAR", + SqlInfo::SqlAnsi92SupportedLevel => "SQL_ANSI92_SUPPORTED_LEVEL", + SqlInfo::SqlSupportsIntegrityEnhancementFacility => "SQL_SUPPORTS_INTEGRITY_ENHANCEMENT_FACILITY", + SqlInfo::SqlOuterJoinsSupportLevel => "SQL_OUTER_JOINS_SUPPORT_LEVEL", + SqlInfo::SqlSchemaTerm => "SQL_SCHEMA_TERM", + SqlInfo::SqlProcedureTerm => "SQL_PROCEDURE_TERM", + SqlInfo::SqlCatalogTerm => "SQL_CATALOG_TERM", + SqlInfo::SqlCatalogAtStart => "SQL_CATALOG_AT_START", + SqlInfo::SqlSchemasSupportedActions => "SQL_SCHEMAS_SUPPORTED_ACTIONS", + SqlInfo::SqlCatalogsSupportedActions => "SQL_CATALOGS_SUPPORTED_ACTIONS", + SqlInfo::SqlSupportedPositionedCommands => "SQL_SUPPORTED_POSITIONED_COMMANDS", + SqlInfo::SqlSelectForUpdateSupported => 
"SQL_SELECT_FOR_UPDATE_SUPPORTED", + SqlInfo::SqlStoredProceduresSupported => "SQL_STORED_PROCEDURES_SUPPORTED", + SqlInfo::SqlSupportedSubqueries => "SQL_SUPPORTED_SUBQUERIES", + SqlInfo::SqlCorrelatedSubqueriesSupported => "SQL_CORRELATED_SUBQUERIES_SUPPORTED", + SqlInfo::SqlSupportedUnions => "SQL_SUPPORTED_UNIONS", + SqlInfo::SqlMaxBinaryLiteralLength => "SQL_MAX_BINARY_LITERAL_LENGTH", + SqlInfo::SqlMaxCharLiteralLength => "SQL_MAX_CHAR_LITERAL_LENGTH", + SqlInfo::SqlMaxColumnNameLength => "SQL_MAX_COLUMN_NAME_LENGTH", + SqlInfo::SqlMaxColumnsInGroupBy => "SQL_MAX_COLUMNS_IN_GROUP_BY", + SqlInfo::SqlMaxColumnsInIndex => "SQL_MAX_COLUMNS_IN_INDEX", + SqlInfo::SqlMaxColumnsInOrderBy => "SQL_MAX_COLUMNS_IN_ORDER_BY", + SqlInfo::SqlMaxColumnsInSelect => "SQL_MAX_COLUMNS_IN_SELECT", + SqlInfo::SqlMaxColumnsInTable => "SQL_MAX_COLUMNS_IN_TABLE", + SqlInfo::SqlMaxConnections => "SQL_MAX_CONNECTIONS", + SqlInfo::SqlMaxCursorNameLength => "SQL_MAX_CURSOR_NAME_LENGTH", + SqlInfo::SqlMaxIndexLength => "SQL_MAX_INDEX_LENGTH", + SqlInfo::SqlDbSchemaNameLength => "SQL_DB_SCHEMA_NAME_LENGTH", + SqlInfo::SqlMaxProcedureNameLength => "SQL_MAX_PROCEDURE_NAME_LENGTH", + SqlInfo::SqlMaxCatalogNameLength => "SQL_MAX_CATALOG_NAME_LENGTH", + SqlInfo::SqlMaxRowSize => "SQL_MAX_ROW_SIZE", + SqlInfo::SqlMaxRowSizeIncludesBlobs => "SQL_MAX_ROW_SIZE_INCLUDES_BLOBS", + SqlInfo::SqlMaxStatementLength => "SQL_MAX_STATEMENT_LENGTH", + SqlInfo::SqlMaxStatements => "SQL_MAX_STATEMENTS", + SqlInfo::SqlMaxTableNameLength => "SQL_MAX_TABLE_NAME_LENGTH", + SqlInfo::SqlMaxTablesInSelect => "SQL_MAX_TABLES_IN_SELECT", + SqlInfo::SqlMaxUsernameLength => "SQL_MAX_USERNAME_LENGTH", + SqlInfo::SqlDefaultTransactionIsolation => "SQL_DEFAULT_TRANSACTION_ISOLATION", + SqlInfo::SqlTransactionsSupported => "SQL_TRANSACTIONS_SUPPORTED", + SqlInfo::SqlSupportedTransactionsIsolationLevels => "SQL_SUPPORTED_TRANSACTIONS_ISOLATION_LEVELS", + SqlInfo::SqlDataDefinitionCausesTransactionCommit => "SQL_DATA_DEFINITION_CAUSES_TRANSACTION_COMMIT", + SqlInfo::SqlDataDefinitionsInTransactionsIgnored => "SQL_DATA_DEFINITIONS_IN_TRANSACTIONS_IGNORED", + SqlInfo::SqlSupportedResultSetTypes => "SQL_SUPPORTED_RESULT_SET_TYPES", + SqlInfo::SqlSupportedConcurrenciesForResultSetUnspecified => "SQL_SUPPORTED_CONCURRENCIES_FOR_RESULT_SET_UNSPECIFIED", + SqlInfo::SqlSupportedConcurrenciesForResultSetForwardOnly => "SQL_SUPPORTED_CONCURRENCIES_FOR_RESULT_SET_FORWARD_ONLY", + SqlInfo::SqlSupportedConcurrenciesForResultSetScrollSensitive => "SQL_SUPPORTED_CONCURRENCIES_FOR_RESULT_SET_SCROLL_SENSITIVE", + SqlInfo::SqlSupportedConcurrenciesForResultSetScrollInsensitive => "SQL_SUPPORTED_CONCURRENCIES_FOR_RESULT_SET_SCROLL_INSENSITIVE", + SqlInfo::SqlBatchUpdatesSupported => "SQL_BATCH_UPDATES_SUPPORTED", + SqlInfo::SqlSavepointsSupported => "SQL_SAVEPOINTS_SUPPORTED", + SqlInfo::SqlNamedParametersSupported => "SQL_NAMED_PARAMETERS_SUPPORTED", + SqlInfo::SqlLocatorsUpdateCopy => "SQL_LOCATORS_UPDATE_COPY", + SqlInfo::SqlStoredFunctionsUsingCallSyntaxSupported => "SQL_STORED_FUNCTIONS_USING_CALL_SYNTAX_SUPPORTED", + } + } +} #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] pub enum SqlSupportedCaseSensitivity { @@ -1011,6 +1102,20 @@ pub enum SqlSupportedCaseSensitivity { SqlCaseSensitivityUppercase = 2, SqlCaseSensitivityLowercase = 3, } +impl SqlSupportedCaseSensitivity { + /// String value of the enum field names used in the ProtoBuf definition. 
+ /// + /// The values are not transformed in any way and thus are considered stable + /// (if the ProtoBuf definition does not change) and safe for programmatic use. + pub fn as_str_name(&self) -> &'static str { + match self { + SqlSupportedCaseSensitivity::SqlCaseSensitivityUnknown => "SQL_CASE_SENSITIVITY_UNKNOWN", + SqlSupportedCaseSensitivity::SqlCaseSensitivityCaseInsensitive => "SQL_CASE_SENSITIVITY_CASE_INSENSITIVE", + SqlSupportedCaseSensitivity::SqlCaseSensitivityUppercase => "SQL_CASE_SENSITIVITY_UPPERCASE", + SqlSupportedCaseSensitivity::SqlCaseSensitivityLowercase => "SQL_CASE_SENSITIVITY_LOWERCASE", + } + } +} #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] pub enum SqlNullOrdering { @@ -1019,6 +1124,20 @@ pub enum SqlNullOrdering { SqlNullsSortedAtStart = 2, SqlNullsSortedAtEnd = 3, } +impl SqlNullOrdering { + /// String value of the enum field names used in the ProtoBuf definition. + /// + /// The values are not transformed in any way and thus are considered stable + /// (if the ProtoBuf definition does not change) and safe for programmatic use. + pub fn as_str_name(&self) -> &'static str { + match self { + SqlNullOrdering::SqlNullsSortedHigh => "SQL_NULLS_SORTED_HIGH", + SqlNullOrdering::SqlNullsSortedLow => "SQL_NULLS_SORTED_LOW", + SqlNullOrdering::SqlNullsSortedAtStart => "SQL_NULLS_SORTED_AT_START", + SqlNullOrdering::SqlNullsSortedAtEnd => "SQL_NULLS_SORTED_AT_END", + } + } +} #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] pub enum SupportedSqlGrammar { @@ -1026,6 +1145,19 @@ pub enum SupportedSqlGrammar { SqlCoreGrammar = 1, SqlExtendedGrammar = 2, } +impl SupportedSqlGrammar { + /// String value of the enum field names used in the ProtoBuf definition. + /// + /// The values are not transformed in any way and thus are considered stable + /// (if the ProtoBuf definition does not change) and safe for programmatic use. + pub fn as_str_name(&self) -> &'static str { + match self { + SupportedSqlGrammar::SqlMinimumGrammar => "SQL_MINIMUM_GRAMMAR", + SupportedSqlGrammar::SqlCoreGrammar => "SQL_CORE_GRAMMAR", + SupportedSqlGrammar::SqlExtendedGrammar => "SQL_EXTENDED_GRAMMAR", + } + } +} #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] pub enum SupportedAnsi92SqlGrammarLevel { @@ -1033,6 +1165,19 @@ pub enum SupportedAnsi92SqlGrammarLevel { Ansi92IntermediateSql = 1, Ansi92FullSql = 2, } +impl SupportedAnsi92SqlGrammarLevel { + /// String value of the enum field names used in the ProtoBuf definition. + /// + /// The values are not transformed in any way and thus are considered stable + /// (if the ProtoBuf definition does not change) and safe for programmatic use. + pub fn as_str_name(&self) -> &'static str { + match self { + SupportedAnsi92SqlGrammarLevel::Ansi92EntrySql => "ANSI92_ENTRY_SQL", + SupportedAnsi92SqlGrammarLevel::Ansi92IntermediateSql => "ANSI92_INTERMEDIATE_SQL", + SupportedAnsi92SqlGrammarLevel::Ansi92FullSql => "ANSI92_FULL_SQL", + } + } +} #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] pub enum SqlOuterJoinsSupportLevel { @@ -1040,12 +1185,37 @@ pub enum SqlOuterJoinsSupportLevel { SqlLimitedOuterJoins = 1, SqlFullOuterJoins = 2, } +impl SqlOuterJoinsSupportLevel { + /// String value of the enum field names used in the ProtoBuf definition. 
+ /// + /// The values are not transformed in any way and thus are considered stable + /// (if the ProtoBuf definition does not change) and safe for programmatic use. + pub fn as_str_name(&self) -> &'static str { + match self { + SqlOuterJoinsSupportLevel::SqlJoinsUnsupported => "SQL_JOINS_UNSUPPORTED", + SqlOuterJoinsSupportLevel::SqlLimitedOuterJoins => "SQL_LIMITED_OUTER_JOINS", + SqlOuterJoinsSupportLevel::SqlFullOuterJoins => "SQL_FULL_OUTER_JOINS", + } + } +} #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] pub enum SqlSupportedGroupBy { SqlGroupByUnrelated = 0, SqlGroupByBeyondSelect = 1, } +impl SqlSupportedGroupBy { + /// String value of the enum field names used in the ProtoBuf definition. + /// + /// The values are not transformed in any way and thus are considered stable + /// (if the ProtoBuf definition does not change) and safe for programmatic use. + pub fn as_str_name(&self) -> &'static str { + match self { + SqlSupportedGroupBy::SqlGroupByUnrelated => "SQL_GROUP_BY_UNRELATED", + SqlSupportedGroupBy::SqlGroupByBeyondSelect => "SQL_GROUP_BY_BEYOND_SELECT", + } + } +} #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] pub enum SqlSupportedElementActions { @@ -1053,12 +1223,37 @@ pub enum SqlSupportedElementActions { SqlElementInIndexDefinitions = 1, SqlElementInPrivilegeDefinitions = 2, } +impl SqlSupportedElementActions { + /// String value of the enum field names used in the ProtoBuf definition. + /// + /// The values are not transformed in any way and thus are considered stable + /// (if the ProtoBuf definition does not change) and safe for programmatic use. + pub fn as_str_name(&self) -> &'static str { + match self { + SqlSupportedElementActions::SqlElementInProcedureCalls => "SQL_ELEMENT_IN_PROCEDURE_CALLS", + SqlSupportedElementActions::SqlElementInIndexDefinitions => "SQL_ELEMENT_IN_INDEX_DEFINITIONS", + SqlSupportedElementActions::SqlElementInPrivilegeDefinitions => "SQL_ELEMENT_IN_PRIVILEGE_DEFINITIONS", + } + } +} #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] pub enum SqlSupportedPositionedCommands { SqlPositionedDelete = 0, SqlPositionedUpdate = 1, } +impl SqlSupportedPositionedCommands { + /// String value of the enum field names used in the ProtoBuf definition. + /// + /// The values are not transformed in any way and thus are considered stable + /// (if the ProtoBuf definition does not change) and safe for programmatic use. + pub fn as_str_name(&self) -> &'static str { + match self { + SqlSupportedPositionedCommands::SqlPositionedDelete => "SQL_POSITIONED_DELETE", + SqlSupportedPositionedCommands::SqlPositionedUpdate => "SQL_POSITIONED_UPDATE", + } + } +} #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] pub enum SqlSupportedSubqueries { @@ -1067,12 +1262,38 @@ pub enum SqlSupportedSubqueries { SqlSubqueriesInIns = 2, SqlSubqueriesInQuantifieds = 3, } +impl SqlSupportedSubqueries { + /// String value of the enum field names used in the ProtoBuf definition. + /// + /// The values are not transformed in any way and thus are considered stable + /// (if the ProtoBuf definition does not change) and safe for programmatic use. 
+ pub fn as_str_name(&self) -> &'static str { + match self { + SqlSupportedSubqueries::SqlSubqueriesInComparisons => "SQL_SUBQUERIES_IN_COMPARISONS", + SqlSupportedSubqueries::SqlSubqueriesInExists => "SQL_SUBQUERIES_IN_EXISTS", + SqlSupportedSubqueries::SqlSubqueriesInIns => "SQL_SUBQUERIES_IN_INS", + SqlSupportedSubqueries::SqlSubqueriesInQuantifieds => "SQL_SUBQUERIES_IN_QUANTIFIEDS", + } + } +} #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] pub enum SqlSupportedUnions { SqlUnion = 0, SqlUnionAll = 1, } +impl SqlSupportedUnions { + /// String value of the enum field names used in the ProtoBuf definition. + /// + /// The values are not transformed in any way and thus are considered stable + /// (if the ProtoBuf definition does not change) and safe for programmatic use. + pub fn as_str_name(&self) -> &'static str { + match self { + SqlSupportedUnions::SqlUnion => "SQL_UNION", + SqlSupportedUnions::SqlUnionAll => "SQL_UNION_ALL", + } + } +} #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] pub enum SqlTransactionIsolationLevel { @@ -1082,6 +1303,21 @@ pub enum SqlTransactionIsolationLevel { SqlTransactionRepeatableRead = 3, SqlTransactionSerializable = 4, } +impl SqlTransactionIsolationLevel { + /// String value of the enum field names used in the ProtoBuf definition. + /// + /// The values are not transformed in any way and thus are considered stable + /// (if the ProtoBuf definition does not change) and safe for programmatic use. + pub fn as_str_name(&self) -> &'static str { + match self { + SqlTransactionIsolationLevel::SqlTransactionNone => "SQL_TRANSACTION_NONE", + SqlTransactionIsolationLevel::SqlTransactionReadUncommitted => "SQL_TRANSACTION_READ_UNCOMMITTED", + SqlTransactionIsolationLevel::SqlTransactionReadCommitted => "SQL_TRANSACTION_READ_COMMITTED", + SqlTransactionIsolationLevel::SqlTransactionRepeatableRead => "SQL_TRANSACTION_REPEATABLE_READ", + SqlTransactionIsolationLevel::SqlTransactionSerializable => "SQL_TRANSACTION_SERIALIZABLE", + } + } +} #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] pub enum SqlSupportedTransactions { @@ -1089,6 +1325,19 @@ pub enum SqlSupportedTransactions { SqlDataDefinitionTransactions = 1, SqlDataManipulationTransactions = 2, } +impl SqlSupportedTransactions { + /// String value of the enum field names used in the ProtoBuf definition. + /// + /// The values are not transformed in any way and thus are considered stable + /// (if the ProtoBuf definition does not change) and safe for programmatic use. + pub fn as_str_name(&self) -> &'static str { + match self { + SqlSupportedTransactions::SqlTransactionUnspecified => "SQL_TRANSACTION_UNSPECIFIED", + SqlSupportedTransactions::SqlDataDefinitionTransactions => "SQL_DATA_DEFINITION_TRANSACTIONS", + SqlSupportedTransactions::SqlDataManipulationTransactions => "SQL_DATA_MANIPULATION_TRANSACTIONS", + } + } +} #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] pub enum SqlSupportedResultSetType { @@ -1097,6 +1346,20 @@ pub enum SqlSupportedResultSetType { SqlResultSetTypeScrollInsensitive = 2, SqlResultSetTypeScrollSensitive = 3, } +impl SqlSupportedResultSetType { + /// String value of the enum field names used in the ProtoBuf definition. 
+ /// + /// The values are not transformed in any way and thus are considered stable + /// (if the ProtoBuf definition does not change) and safe for programmatic use. + pub fn as_str_name(&self) -> &'static str { + match self { + SqlSupportedResultSetType::SqlResultSetTypeUnspecified => "SQL_RESULT_SET_TYPE_UNSPECIFIED", + SqlSupportedResultSetType::SqlResultSetTypeForwardOnly => "SQL_RESULT_SET_TYPE_FORWARD_ONLY", + SqlSupportedResultSetType::SqlResultSetTypeScrollInsensitive => "SQL_RESULT_SET_TYPE_SCROLL_INSENSITIVE", + SqlSupportedResultSetType::SqlResultSetTypeScrollSensitive => "SQL_RESULT_SET_TYPE_SCROLL_SENSITIVE", + } + } +} #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] pub enum SqlSupportedResultSetConcurrency { @@ -1104,6 +1367,19 @@ pub enum SqlSupportedResultSetConcurrency { SqlResultSetConcurrencyReadOnly = 1, SqlResultSetConcurrencyUpdatable = 2, } +impl SqlSupportedResultSetConcurrency { + /// String value of the enum field names used in the ProtoBuf definition. + /// + /// The values are not transformed in any way and thus are considered stable + /// (if the ProtoBuf definition does not change) and safe for programmatic use. + pub fn as_str_name(&self) -> &'static str { + match self { + SqlSupportedResultSetConcurrency::SqlResultSetConcurrencyUnspecified => "SQL_RESULT_SET_CONCURRENCY_UNSPECIFIED", + SqlSupportedResultSetConcurrency::SqlResultSetConcurrencyReadOnly => "SQL_RESULT_SET_CONCURRENCY_READ_ONLY", + SqlSupportedResultSetConcurrency::SqlResultSetConcurrencyUpdatable => "SQL_RESULT_SET_CONCURRENCY_UPDATABLE", + } + } +} #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] pub enum SqlSupportsConvert { @@ -1128,6 +1404,36 @@ pub enum SqlSupportsConvert { SqlConvertVarbinary = 18, SqlConvertVarchar = 19, } +impl SqlSupportsConvert { + /// String value of the enum field names used in the ProtoBuf definition. + /// + /// The values are not transformed in any way and thus are considered stable + /// (if the ProtoBuf definition does not change) and safe for programmatic use. 
+ pub fn as_str_name(&self) -> &'static str { + match self { + SqlSupportsConvert::SqlConvertBigint => "SQL_CONVERT_BIGINT", + SqlSupportsConvert::SqlConvertBinary => "SQL_CONVERT_BINARY", + SqlSupportsConvert::SqlConvertBit => "SQL_CONVERT_BIT", + SqlSupportsConvert::SqlConvertChar => "SQL_CONVERT_CHAR", + SqlSupportsConvert::SqlConvertDate => "SQL_CONVERT_DATE", + SqlSupportsConvert::SqlConvertDecimal => "SQL_CONVERT_DECIMAL", + SqlSupportsConvert::SqlConvertFloat => "SQL_CONVERT_FLOAT", + SqlSupportsConvert::SqlConvertInteger => "SQL_CONVERT_INTEGER", + SqlSupportsConvert::SqlConvertIntervalDayTime => "SQL_CONVERT_INTERVAL_DAY_TIME", + SqlSupportsConvert::SqlConvertIntervalYearMonth => "SQL_CONVERT_INTERVAL_YEAR_MONTH", + SqlSupportsConvert::SqlConvertLongvarbinary => "SQL_CONVERT_LONGVARBINARY", + SqlSupportsConvert::SqlConvertLongvarchar => "SQL_CONVERT_LONGVARCHAR", + SqlSupportsConvert::SqlConvertNumeric => "SQL_CONVERT_NUMERIC", + SqlSupportsConvert::SqlConvertReal => "SQL_CONVERT_REAL", + SqlSupportsConvert::SqlConvertSmallint => "SQL_CONVERT_SMALLINT", + SqlSupportsConvert::SqlConvertTime => "SQL_CONVERT_TIME", + SqlSupportsConvert::SqlConvertTimestamp => "SQL_CONVERT_TIMESTAMP", + SqlSupportsConvert::SqlConvertTinyint => "SQL_CONVERT_TINYINT", + SqlSupportsConvert::SqlConvertVarbinary => "SQL_CONVERT_VARBINARY", + SqlSupportsConvert::SqlConvertVarchar => "SQL_CONVERT_VARCHAR", + } + } +} #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] pub enum UpdateDeleteRules { @@ -1137,3 +1443,18 @@ NoAction = 3, SetDefault = 4, } +impl UpdateDeleteRules { + /// String value of the enum field names used in the ProtoBuf definition. + /// + /// The values are not transformed in any way and thus are considered stable + /// (if the ProtoBuf definition does not change) and safe for programmatic use. + pub fn as_str_name(&self) -> &'static str { + match self { + UpdateDeleteRules::Cascade => "CASCADE", + UpdateDeleteRules::Restrict => "RESTRICT", + UpdateDeleteRules::SetNull => "SET_NULL", + UpdateDeleteRules::NoAction => "NO_ACTION", + UpdateDeleteRules::SetDefault => "SET_DEFAULT", + } + } +} diff --git a/arrow-flight/src/sql/server.rs b/arrow-flight/src/sql/server.rs index 2d9d88638588..6e8f104dc5b8 100644 --- a/arrow-flight/src/sql/server.rs +++ b/arrow-flight/src/sql/server.rs @@ -65,77 +65,77 @@ pub trait FlightSqlService: async fn get_flight_info_statement( &self, query: CommandStatementQuery, - request: FlightDescriptor, + request: Request<FlightDescriptor>, ) -> Result<Response<FlightInfo>, Status>; /// Get a FlightInfo for executing an already created prepared statement. async fn get_flight_info_prepared_statement( &self, query: CommandPreparedStatementQuery, - request: FlightDescriptor, + request: Request<FlightDescriptor>, ) -> Result<Response<FlightInfo>, Status>; /// Get a FlightInfo for listing catalogs. async fn get_flight_info_catalogs( &self, query: CommandGetCatalogs, - request: FlightDescriptor, + request: Request<FlightDescriptor>, ) -> Result<Response<FlightInfo>, Status>; /// Get a FlightInfo for listing schemas. async fn get_flight_info_schemas( &self, query: CommandGetDbSchemas, - request: FlightDescriptor, + request: Request<FlightDescriptor>, ) -> Result<Response<FlightInfo>, Status>; /// Get a FlightInfo for listing tables. async fn get_flight_info_tables( &self, query: CommandGetTables, - request: FlightDescriptor, + request: Request<FlightDescriptor>, ) -> Result<Response<FlightInfo>, Status>;
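The point of switching these parameters from a bare FlightDescriptor to tonic's Request<FlightDescriptor> is that handlers can now inspect transport-level details such as per-call gRPC metadata. A sketch of a single method from a hypothetical implementation (illustrative only; the trait's other required methods are omitted and the final error is a placeholder):

// Excerpt from a hypothetical `impl FlightSqlService for MyServer`, using
// arrow_flight::sql::CommandGetTables, arrow_flight::{FlightDescriptor,
// FlightInfo} and tonic::{Request, Response, Status}.
async fn get_flight_info_tables(
    &self,
    query: CommandGetTables,
    request: Request<FlightDescriptor>,
) -> Result<Response<FlightInfo>, Status> {
    // The Request wrapper exposes gRPC metadata, e.g. an auth header.
    if request.metadata().get("authorization").is_none() {
        return Err(Status::unauthenticated("missing authorization header"));
    }
    let _descriptor = request.into_inner(); // the FlightDescriptor itself
    let _catalog = &query.catalog; // optional catalog filter from the command
    Err(Status::unimplemented("get_flight_info_tables"))
}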
/// Get a FlightInfo to extract information about the table types. async fn get_flight_info_table_types( &self, query: CommandGetTableTypes, - request: FlightDescriptor, + request: Request<FlightDescriptor>, ) -> Result<Response<FlightInfo>, Status>; /// Get a FlightInfo for retrieving other information (See SqlInfo). async fn get_flight_info_sql_info( &self, query: CommandGetSqlInfo, - request: FlightDescriptor, + request: Request<FlightDescriptor>, ) -> Result<Response<FlightInfo>, Status>; /// Get a FlightInfo to extract information about primary and foreign keys. async fn get_flight_info_primary_keys( &self, query: CommandGetPrimaryKeys, - request: FlightDescriptor, + request: Request<FlightDescriptor>, ) -> Result<Response<FlightInfo>, Status>; /// Get a FlightInfo to extract information about exported keys. async fn get_flight_info_exported_keys( &self, query: CommandGetExportedKeys, - request: FlightDescriptor, + request: Request<FlightDescriptor>, ) -> Result<Response<FlightInfo>, Status>; /// Get a FlightInfo to extract information about imported keys. async fn get_flight_info_imported_keys( &self, query: CommandGetImportedKeys, - request: FlightDescriptor, + request: Request<FlightDescriptor>, ) -> Result<Response<FlightInfo>, Status>; /// Get a FlightInfo to extract information about cross reference. async fn get_flight_info_cross_reference( &self, query: CommandGetCrossReference, - request: FlightDescriptor, + request: Request<FlightDescriptor>, ) -> Result<Response<FlightInfo>, Status>; // do_get @@ -144,66 +144,77 @@ pub trait FlightSqlService: async fn do_get_statement( &self, ticket: TicketStatementQuery, + request: Request<Ticket>, ) -> Result<Response<<Self as FlightService>::DoGetStream>, Status>; /// Get a FlightDataStream containing the prepared statement query results. async fn do_get_prepared_statement( &self, query: CommandPreparedStatementQuery, + request: Request<Ticket>, ) -> Result<Response<<Self as FlightService>::DoGetStream>, Status>; /// Get a FlightDataStream containing the list of catalogs. async fn do_get_catalogs( &self, query: CommandGetCatalogs, + request: Request<Ticket>, ) -> Result<Response<<Self as FlightService>::DoGetStream>, Status>; /// Get a FlightDataStream containing the list of schemas. async fn do_get_schemas( &self, query: CommandGetDbSchemas, + request: Request<Ticket>, ) -> Result<Response<<Self as FlightService>::DoGetStream>, Status>; /// Get a FlightDataStream containing the list of tables. async fn do_get_tables( &self, query: CommandGetTables, + request: Request<Ticket>, ) -> Result<Response<<Self as FlightService>::DoGetStream>, Status>; /// Get a FlightDataStream containing the data related to the table types. async fn do_get_table_types( &self, query: CommandGetTableTypes, + request: Request<Ticket>, ) -> Result<Response<<Self as FlightService>::DoGetStream>, Status>; /// Get a FlightDataStream containing the list of SqlInfo results. async fn do_get_sql_info( &self, query: CommandGetSqlInfo, + request: Request<Ticket>, ) -> Result<Response<<Self as FlightService>::DoGetStream>, Status>; /// Get a FlightDataStream containing the data related to the primary and foreign keys. async fn do_get_primary_keys( &self, query: CommandGetPrimaryKeys, + request: Request<Ticket>, ) -> Result<Response<<Self as FlightService>::DoGetStream>, Status>; /// Get a FlightDataStream containing the data related to the exported keys. async fn do_get_exported_keys( &self, query: CommandGetExportedKeys, + request: Request<Ticket>, ) -> Result<Response<<Self as FlightService>::DoGetStream>, Status>; /// Get a FlightDataStream containing the data related to the imported keys. async fn do_get_imported_keys( &self, query: CommandGetImportedKeys, + request: Request<Ticket>, ) -> Result<Response<<Self as FlightService>::DoGetStream>, Status>; /// Get a FlightDataStream containing the data related to the cross reference. async fn do_get_cross_reference( &self, query: CommandGetCrossReference, + request: Request<Ticket>, ) -> Result<Response<<Self as FlightService>::DoGetStream>, Status>; // do_put @@ -212,20 +223,21 @@ pub trait FlightSqlService: async fn do_put_statement_update( &self, ticket: CommandStatementUpdate, + request: Request<Streaming<FlightData>>, ) -> Result<i64, Status>;
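The update handlers above return only the affected-row count; the provided do_put implementation (further below in this diff) wraps it in a DoPutUpdateResult and, after this change, writes that message's bytes directly into PutResult.app_metadata rather than wrapping it in an Any first. A client-side decoding sketch (assuming DoPutUpdateResult is re-exported from arrow_flight::sql):

use arrow_flight::sql::DoPutUpdateResult;
use prost::Message;

// Decode the row count a FlightSQL server reports for an update statement.
fn affected_rows(app_metadata: &[u8]) -> Result<i64, prost::DecodeError> {
    Ok(DoPutUpdateResult::decode(app_metadata)?.record_count)
}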
/// Bind parameters to given prepared statement. async fn do_put_prepared_statement_query( &self, query: CommandPreparedStatementQuery, - request: Streaming<FlightData>, + request: Request<Streaming<FlightData>>, ) -> Result<Response<<Self as FlightService>::DoPutStream>, Status>; /// Execute an update SQL prepared statement. async fn do_put_prepared_statement_update( &self, query: CommandPreparedStatementUpdate, - request: Streaming<FlightData>, + request: Request<Streaming<FlightData>>, ) -> Result<i64, Status>; // do_action @@ -234,12 +246,14 @@ pub trait FlightSqlService: async fn do_action_create_prepared_statement( &self, query: ActionCreatePreparedStatementRequest, + request: Request<Action>, ) -> Result<ActionCreatePreparedStatementResult, Status>; /// Close a prepared statement. async fn do_action_close_prepared_statement( &self, query: ActionClosePreparedStatementRequest, + request: Request<Action>, ); /// Register a new SqlInfo result, making it available when calling GetSqlInfo. @@ -287,119 +301,87 @@ where &self, request: Request<FlightDescriptor>, ) -> Result<Response<FlightInfo>, Status> { - let request = request.into_inner(); let any: prost_types::Any = - prost::Message::decode(&*request.cmd).map_err(decode_error_to_status)?; + Message::decode(&*request.get_ref().cmd).map_err(decode_error_to_status)?; if any.is::<CommandStatementQuery>() { - return self - .get_flight_info_statement( - any.unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"), - request, - ) - .await; + let token = any + .unpack() + .map_err(arrow_error_to_status)? + .expect("unreachable"); + return self.get_flight_info_statement(token, request).await; } if any.is::<CommandPreparedStatementQuery>() { + let handle = any + .unpack() + .map_err(arrow_error_to_status)? + .expect("unreachable"); return self - .get_flight_info_prepared_statement( - any.unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"), - request, - ) + .get_flight_info_prepared_statement(handle, request) .await; } if any.is::<CommandGetCatalogs>() { - return self - .get_flight_info_catalogs( - any.unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"), - request, - ) - .await; + let token = any + .unpack() + .map_err(arrow_error_to_status)? + .expect("unreachable"); + return self.get_flight_info_catalogs(token, request).await; } if any.is::<CommandGetDbSchemas>() { - return self - .get_flight_info_schemas( - any.unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"), - request, - ) - .await; + let token = any + .unpack() + .map_err(arrow_error_to_status)? + .expect("unreachable"); + return self.get_flight_info_schemas(token, request).await; } if any.is::<CommandGetTables>() { - return self - .get_flight_info_tables( - any.unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"), - request, - ) - .await; + let token = any + .unpack() + .map_err(arrow_error_to_status)? + .expect("unreachable"); + return self.get_flight_info_tables(token, request).await; } if any.is::<CommandGetTableTypes>() { - return self - .get_flight_info_table_types( - any.unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"), - request, - ) - .await; + let token = any + .unpack() + .map_err(arrow_error_to_status)? + .expect("unreachable"); + return self.get_flight_info_table_types(token, request).await; } if any.is::<CommandGetSqlInfo>() { - return self - .get_flight_info_sql_info( - any.unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"), - request, - ) - .await; + let token = any + .unpack() + .map_err(arrow_error_to_status)? + .expect("unreachable"); + return self.get_flight_info_sql_info(token, request).await; } if any.is::<CommandGetPrimaryKeys>() { - return self - .get_flight_info_primary_keys( - any.unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"), - request, - ) - .await; + let token = any + .unpack() + .map_err(arrow_error_to_status)?
+ .expect("unreachable"); + return self.get_flight_info_primary_keys(token, request).await; } if any.is::<CommandGetExportedKeys>() { - return self - .get_flight_info_exported_keys( - any.unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"), - request, - ) - .await; + let token = any + .unpack() + .map_err(arrow_error_to_status)? + .expect("unreachable"); + return self.get_flight_info_exported_keys(token, request).await; } if any.is::<CommandGetImportedKeys>() { - return self - .get_flight_info_imported_keys( - any.unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"), - request, - ) - .await; + let token = any + .unpack() + .map_err(arrow_error_to_status)? + .expect("unreachable"); + return self.get_flight_info_imported_keys(token, request).await; } if any.is::<CommandGetCrossReference>() { - return self - .get_flight_info_cross_reference( - any.unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"), - request, - ) - .await; + let token = any + .unpack() + .map_err(arrow_error_to_status)? + .expect("unreachable"); + return self.get_flight_info_cross_reference(token, request).await; } Err(Status::unimplemented(format!( @@ -419,161 +401,131 @@ where &self, request: Request<Ticket>, ) -> Result<Response<Self::DoGetStream>, Status> { - let request = request.into_inner(); - let any: prost_types::Any = - prost::Message::decode(&*request.ticket).map_err(decode_error_to_status)?; + let any: prost_types::Any = prost::Message::decode(&*request.get_ref().ticket) + .map_err(decode_error_to_status)?; if any.is::<TicketStatementQuery>() { - return self - .do_get_statement( - any.unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"), - ) - .await; + let token = any + .unpack() + .map_err(arrow_error_to_status)? + .expect("unreachable"); + return self.do_get_statement(token, request).await; } if any.is::<CommandPreparedStatementQuery>() { - return self - .do_get_prepared_statement( - any.unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"), - ) - .await; + let token = any + .unpack() + .map_err(arrow_error_to_status)? + .expect("unreachable"); + return self.do_get_prepared_statement(token, request).await; } if any.is::<CommandGetCatalogs>() { - return self - .do_get_catalogs( - any.unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"), - ) - .await; + let token = any + .unpack() + .map_err(arrow_error_to_status)? + .expect("unreachable"); + return self.do_get_catalogs(token, request).await; } if any.is::<CommandGetDbSchemas>() { - return self - .do_get_schemas( - any.unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"), - ) - .await; + let token = any + .unpack() + .map_err(arrow_error_to_status)? + .expect("unreachable"); + return self.do_get_schemas(token, request).await; } if any.is::<CommandGetTables>() { - return self - .do_get_tables( - any.unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"), - ) - .await; + let token = any + .unpack() + .map_err(arrow_error_to_status)? + .expect("unreachable"); + return self.do_get_tables(token, request).await; } if any.is::<CommandGetTableTypes>() { - return self - .do_get_table_types( - any.unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"), - ) - .await; + let token = any + .unpack() + .map_err(arrow_error_to_status)? + .expect("unreachable"); + return self.do_get_table_types(token, request).await; } if any.is::<CommandGetSqlInfo>() { - return self - .do_get_sql_info( - any.unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"), - ) - .await; + let token = any + .unpack() + .map_err(arrow_error_to_status)?
+ .expect("unreachable"); + return self.do_get_sql_info(token, request).await; } if any.is::<CommandGetPrimaryKeys>() { - return self - .do_get_primary_keys( - any.unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"), - ) - .await; + let token = any + .unpack() + .map_err(arrow_error_to_status)? + .expect("unreachable"); + return self.do_get_primary_keys(token, request).await; } if any.is::<CommandGetExportedKeys>() { - return self - .do_get_exported_keys( - any.unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"), - ) - .await; + let token = any + .unpack() + .map_err(arrow_error_to_status)? + .expect("unreachable"); + return self.do_get_exported_keys(token, request).await; } if any.is::<CommandGetImportedKeys>() { - return self - .do_get_imported_keys( - any.unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"), - ) - .await; + let token = any + .unpack() + .map_err(arrow_error_to_status)? + .expect("unreachable"); + return self.do_get_imported_keys(token, request).await; } if any.is::<CommandGetCrossReference>() { - return self - .do_get_cross_reference( - any.unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"), - ) - .await; + let token = any + .unpack() + .map_err(arrow_error_to_status)? + .expect("unreachable"); + return self.do_get_cross_reference(token, request).await; } Err(Status::unimplemented(format!( "do_get: The defined request is invalid: {:?}", - String::from_utf8(request.ticket).unwrap() + String::from_utf8(request.get_ref().ticket.clone()).unwrap() ))) } async fn do_put( &self, - request: Request<Streaming<FlightData>>, + mut request: Request<Streaming<FlightData>>, ) -> Result<Response<Self::DoPutStream>, Status> { - let mut request = request.into_inner(); - let cmd = request.message().await?.unwrap(); + let cmd = request.get_mut().message().await?.unwrap(); let any: prost_types::Any = prost::Message::decode(&*cmd.flight_descriptor.unwrap().cmd) .map_err(decode_error_to_status)?; if any.is::<CommandStatementUpdate>() { - let record_count = self - .do_put_statement_update( - any.unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"), - ) - .await?; + let token = any + .unpack() + .map_err(arrow_error_to_status)? + .expect("unreachable"); + let record_count = self.do_put_statement_update(token, request).await?; let result = DoPutUpdateResult { record_count }; let output = futures::stream::iter(vec![Ok(super::super::gen::PutResult { - app_metadata: result.as_any().encode_to_vec(), + app_metadata: result.encode_to_vec(), })]); return Ok(Response::new(Box::pin(output))); } if any.is::<CommandPreparedStatementQuery>() { - return self - .do_put_prepared_statement_query( - any.unpack() - .map_err(arrow_error_to_status)? - .expect("unreachable"), - request, - ) - .await; + let token = any + .unpack() + .map_err(arrow_error_to_status)? + .expect("unreachable"); + return self.do_put_prepared_statement_query(token, request).await; } if any.is::<CommandPreparedStatementUpdate>() { + let handle = any + .unpack() + .map_err(arrow_error_to_status)? + .expect("unreachable"); let record_count = self - .do_put_prepared_statement_update( - any.unpack() - .map_err(arrow_error_to_status)?
- .expect("unreachable"), - request, - ) + .do_put_prepared_statement_update(handle, request) .await?; let result = DoPutUpdateResult { record_count }; let output = futures::stream::iter(vec![Ok(super::super::gen::PutResult { - app_metadata: result.as_any().encode_to_vec(), + app_metadata: result.encode_to_vec(), })]); return Ok(Response::new(Box::pin(output))); } @@ -614,11 +566,9 @@ where &self, request: Request, ) -> Result, Status> { - let request = request.into_inner(); - - if request.r#type == CREATE_PREPARED_STATEMENT { - let any: prost_types::Any = - prost::Message::decode(&*request.body).map_err(decode_error_to_status)?; + if request.get_ref().r#type == CREATE_PREPARED_STATEMENT { + let any: prost_types::Any = Message::decode(&*request.get_ref().body) + .map_err(decode_error_to_status)?; let cmd: ActionCreatePreparedStatementRequest = any .unpack() @@ -628,15 +578,17 @@ where "Unable to unpack ActionCreatePreparedStatementRequest.", ) })?; - let stmt = self.do_action_create_prepared_statement(cmd).await?; + let stmt = self + .do_action_create_prepared_statement(cmd, request) + .await?; let output = futures::stream::iter(vec![Ok(super::super::gen::Result { body: stmt.as_any().encode_to_vec(), })]); return Ok(Response::new(Box::pin(output))); } - if request.r#type == CLOSE_PREPARED_STATEMENT { - let any: prost_types::Any = - prost::Message::decode(&*request.body).map_err(decode_error_to_status)?; + if request.get_ref().r#type == CLOSE_PREPARED_STATEMENT { + let any: prost_types::Any = Message::decode(&*request.get_ref().body) + .map_err(decode_error_to_status)?; let cmd: ActionClosePreparedStatementRequest = any .unpack() @@ -646,13 +598,13 @@ where "Unable to unpack ActionClosePreparedStatementRequest.", ) })?; - self.do_action_close_prepared_statement(cmd).await; + self.do_action_close_prepared_statement(cmd, request).await; return Ok(Response::new(Box::pin(futures::stream::empty()))); } Err(Status::invalid_argument(format!( "do_action: The defined request is invalid: {:?}", - request.r#type + request.get_ref().r#type ))) } diff --git a/arrow-pyarrow-integration-testing/Cargo.toml b/arrow-pyarrow-integration-testing/Cargo.toml index 6139ff7702c5..19117ba5f03e 100644 --- a/arrow-pyarrow-integration-testing/Cargo.toml +++ b/arrow-pyarrow-integration-testing/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "arrow-pyarrow-integration-testing" description = "" -version = "19.0.0" +version = "20.0.0" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] @@ -32,7 +32,7 @@ name = "arrow_pyarrow_integration_testing" crate-type = ["cdylib"] [dependencies] -arrow = { path = "../arrow", version = "19.0.0", features = ["pyarrow"] } +arrow = { path = "../arrow", version = "20.0.0", features = ["pyarrow"] } pyo3 = { version = "0.16", features = ["extension-module"] } [package.metadata.maturin] diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 9f2c4940c400..bebaadcbc69f 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -17,13 +17,13 @@ [package] name = "arrow" -version = "19.0.0" +version = "20.0.0" description = "Rust implementation of Apache Arrow" homepage = "https://github.com/apache/arrow-rs" repository = "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] license = "Apache-2.0" -keywords = [ "arrow" ] +keywords = ["arrow"] include = [ "benches/*.rs", "src/**/*.rs", @@ -37,24 +37,29 @@ name = "arrow" path = "src/lib.rs" bench = false +[target.'cfg(target_arch = 
"wasm32")'.dependencies] +ahash = { version = "0.8", default-features = false, features=["compile-time-rng"] } + +[target.'cfg(not(target_arch = "wasm32"))'.dependencies] +ahash = { version = "0.8", default-features = false, features=["runtime-rng"] } + [dependencies] -lz4 = { version = "1.23", default-features = false, optional = true } -zstd = { version = "0.11.1", default-features = false, optional = true } -ahash = { version = "0.7", default-features = false } serde = { version = "1.0", default-features = false } serde_derive = { version = "1.0", default-features = false } serde_json = { version = "1.0", default-features = false, features = ["std"] } indexmap = { version = "1.9", default-features = false, features = ["std"] } -rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } +rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } num = { version = "0.4", default-features = false, features = ["std"] } half = { version = "2.0", default-features = false } hashbrown = { version = "0.12", default-features = false } -csv_crate = { version = "1.1", default-features = false, optional = true, package="csv" } +csv_crate = { version = "1.1", default-features = false, optional = true, package = "csv" } regex = { version = "1.5.6", default-features = false, features = ["std", "unicode"] } +regex-syntax = { version = "0.6.27", default-features = false, features = ["unicode"] } lazy_static = { version = "1.4", default-features = false } +lz4 = { version = "1.23", default-features = false, optional = true } packed_simd = { version = "0.3", default-features = false, optional = true, package = "packed_simd_2" } chrono = { version = "0.4", default-features = false, features = ["clock"] } -chrono-tz = {version = "0.6", default-features = false, optional = true} +chrono-tz = { version = "0.6", default-features = false, optional = true } flatbuffers = { version = "2.1.2", default-features = false, features = ["thiserror"], optional = true } hex = { version = "0.4", default-features = false, features = ["std"] } comfy-table = { version = "6.0", optional = true, default-features = false } @@ -62,10 +67,11 @@ pyo3 = { version = "0.16", default-features = false, optional = true } lexical-core = { version = "^0.8", default-features = false, features = ["write-integers", "write-floats", "parse-integers", "parse-floats"] } multiversion = { version = "0.6.1", default-features = false } bitflags = { version = "1.2.1", default-features = false } +zstd = { version = "0.11.1", default-features = false, optional = true } [features] -default = ["csv", "ipc", "test_utils"] -ipc_compression = ["zstd", "lz4"] +default = ["csv", "ipc"] +ipc_compression = ["ipc", "zstd", "lz4"] csv = ["csv_crate"] ipc = ["flatbuffers"] simd = ["packed_simd"] @@ -75,22 +81,27 @@ prettyprint = ["comfy-table"] # an optional dependency for supporting compile to wasm32-unknown-unknown # target without assuming an environment containing JavaScript. 
test_utils = ["rand"] -pyarrow = ["pyo3"] +pyarrow = ["pyo3", "ffi"] # force_validate runs full data validation for all arrays that are created # this is not enabled by default as it is too computationally expensive # but is run as part of our CI checks force_validate = [] +# Enable ffi support +ffi = [] [dev-dependencies] -rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } +rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } criterion = { version = "0.3", default-features = false } flate2 = { version = "1", default-features = false, features = ["rust_backend"] } tempfile = { version = "3", default-features = false } -lz4 = { version = "1.23", default-features = false } -zstd = { version = "0.11", default-features = false } [build-dependencies] +[[example]] +name = "dynamic_types" +required-features = ["prettyprint"] +path="./examples/dynamic_types.rs" + [[bench]] name = "aggregate_kernels" harness = false @@ -201,3 +212,7 @@ required-features = ["test_utils"] [[bench]] name = "array_data_validate" harness = false + +[[bench]] +name = "decimal_validate" +harness = false diff --git a/arrow/README.md b/arrow/README.md index d26a4f410c23..5e20a42538d5 100644 --- a/arrow/README.md +++ b/arrow/README.md @@ -22,7 +22,10 @@ [![crates.io](https://img.shields.io/crates/v/arrow.svg)](https://crates.io/crates/arrow) [![docs.rs](https://img.shields.io/docsrs/arrow.svg)](https://docs.rs/arrow/latest/arrow/) -This crate contains the official Native Rust implementation of [Apache Arrow][arrow] in memory format, governed by the Apache Software Foundation. Additional details can be found on [crates.io](https://crates.io/crates/arrow), [docs.rs](https://docs.rs/arrow/latest/arrow/) and [examples](https://github.com/apache/arrow-rs/tree/master/arrow/examples). +This crate contains the official Native Rust implementation of [Apache Arrow][arrow] in memory format, governed by the Apache Software Foundation. + +The [crate documentation](https://docs.rs/arrow/latest/arrow/) contains examples and full API. +There are several [examples](https://github.com/apache/arrow-rs/tree/master/arrow/examples) to start from as well. ## Rust Version Compatibility @@ -32,20 +35,27 @@ This crate is tested with the latest stable version of Rust. We do not currently The arrow crate follows the [SemVer standard](https://doc.rust-lang.org/cargo/reference/semver.html) defined by Cargo and works well within the Rust crate ecosystem. -However, for historical reasons, this crate uses versions with major numbers greater than `0.x` (e.g. `19.0.0`), unlike many other crates in the Rust ecosystem which spend extended time releasing versions `0.x` to signal planned ongoing API changes. Minor arrow releases contain only compatible changes, while major releases may contain breaking API changes. +However, for historical reasons, this crate uses versions with major numbers greater than `0.x` (e.g. `20.0.0`), unlike many other crates in the Rust ecosystem which spend extended time releasing versions `0.x` to signal planned ongoing API changes. Minor arrow releases contain only compatible changes, while major releases may contain breaking API changes. 
-## Features +## Feature Flags -The arrow crate provides the following features which may be enabled: +The `arrow` crate provides the following features which may be enabled in your `Cargo.toml`: - `csv` (default) - support for reading and writing Arrow arrays to/from csv files -- `ipc` (default) - support for the [arrow-flight](https://crates.io/crates/arrow-flight) IPC and wire format +- `ipc` (default) - support for reading [Arrow IPC Format](https://arrow.apache.org/docs/format/Columnar.html#serialization-and-interprocess-communication-ipc), also used as the wire protocol in [arrow-flight](https://crates.io/crates/arrow-flight) +- `ipc_compression` - Enables reading and writing compressed IPC streams (also enables `ipc`) - `prettyprint` - support for formatting record batches as textual columns - `js` - support for building arrow for WebAssembly / JavaScript -- `simd` - (_Requires Nightly Rust_) alternate optimized +- `simd` - (_Requires Nightly Rust_) Use alternate hand optimized implementations of some [compute](https://github.com/apache/arrow-rs/tree/master/arrow/src/compute/kernels) - kernels using explicit SIMD instructions available through [packed_simd_2](https://docs.rs/packed_simd_2/latest/packed_simd_2/). + kernels using explicit SIMD instructions via [packed_simd_2](https://docs.rs/packed_simd_2/latest/packed_simd_2/). - `chrono-tz` - support of parsing timezone using [chrono-tz](https://docs.rs/chrono-tz/0.6.0/chrono_tz/) +- `ffi` - bindings for the Arrow C [C Data Interface](https://arrow.apache.org/docs/format/CDataInterface.html) +- `pyarrow` - bindings for pyo3 to call arrow-rs from python + +## Arrow Feature Status + +The [Apache Arrow Status](https://arrow.apache.org/docs/status.html) page lists which features of Arrow this crate supports. ## Safety @@ -55,25 +65,25 @@ Arrow seeks to uphold the Rust Soundness Pledge as articulated eloquently [here] Where soundness in turn is defined as: -> Code is unable to trigger undefined behaviour using safe APIs +> Code is unable to trigger undefined behavior using safe APIs -One way to ensure this would be to not use `unsafe`, however, as described in the opening chapter of the [Rustonomicon](https://doc.rust-lang.org/nomicon/meet-safe-and-unsafe.html) this is not a requirement, and flexibility in this regard is actually one of Rust's great strengths. +One way to ensure this would be to not use `unsafe`, however, as described in the opening chapter of the [Rustonomicon](https://doc.rust-lang.org/nomicon/meet-safe-and-unsafe.html) this is not a requirement, and flexibility in this regard is one of Rust's great strengths. In particular there are a number of scenarios where `unsafe` is largely unavoidable: -* Invariants that cannot be statically verified by the compiler and unlock non-trivial performance wins, e.g. values in a StringArray are UTF-8, [TrustedLen](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html) iterators, etc... -* FFI -* SIMD +- Invariants that cannot be statically verified by the compiler and unlock non-trivial performance wins, e.g. values in a StringArray are UTF-8, [TrustedLen](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html) iterators, etc... +- FFI +- SIMD -Additionally, this crate exposes a number of `unsafe` APIs, allowing downstream crates to explicitly opt-out of potentially expensive invariant checking where appropriate. 
+Additionally, this crate exposes a number of `unsafe` APIs, allowing downstream crates to explicitly opt-out of potentially expensive invariant checking where appropriate. We have a number of strategies to help reduce this risk: -* Provide strongly-typed `Array` and `ArrayBuilder` APIs to safely and efficiently interact with arrays -* Extensive validation logic to safely construct `ArrayData` from untrusted sources -* All commits are verified using [MIRI](https://github.com/rust-lang/miri) to detect undefined behaviour -* We provide a `force_validate` feature that enables additional validation checks for use in test/debug builds -* There is ongoing work to reduce and better document the use of unsafe, and we welcome contributions in this space +- Provide strongly-typed `Array` and `ArrayBuilder` APIs to safely and efficiently interact with arrays +- Extensive validation logic to safely construct `ArrayData` from untrusted sources +- All commits are verified using [MIRI](https://github.com/rust-lang/miri) to detect undefined behaviour +- Use a `force_validate` feature that enables additional validation checks for use in test/debug builds +- There is ongoing work to reduce and better document the use of unsafe, and we welcome contributions in this space ## Building for WASM @@ -101,16 +111,38 @@ cargo run --example read_csv [arrow]: https://arrow.apache.org/ +## Performance Tips -## Performance +Arrow aims to be as fast as possible out of the box, whilst not compromising on safety. However, +it relies heavily on LLVM auto-vectorisation to achieve this. Unfortunately the LLVM defaults, +particularly for x86_64, favour portability over performance, and LLVM will consequently avoid +using more recent instructions that would result in errors on older CPUs. -Most of the compute kernels benefit a lot from being optimized for a specific CPU target. -This is especially so on x86-64 since without specifying a target the compiler can only assume support for SSE2 vector instructions. -One of the following values as `-Ctarget-cpu=value` in `RUSTFLAGS` can therefore improve performance significantly: +To address this it is recommended that you override the LLVM defaults either +by setting the `RUSTFLAGS` environment variable, or by setting `rustflags` in your +[Cargo configuration](https://doc.rust-lang.org/cargo/reference/config.html) - - `native`: Target the exact features of the cpu that the build is running on. - This should give the best performance when building and running locally, but should be used carefully for example when building in a CI pipeline or when shipping pre-compiled software. - - `x86-64-v3`: Includes AVX2 support and is close to the intel `haswell` architecture released in 2013 and should be supported by any recent Intel or Amd cpu. - - `x86-64-v4`: Includes AVX512 support available on intel `skylake` server and `icelake`/`tigerlake`/`rocketlake` laptop and desktop processors. +Enable all features supported by the current CPU -These flags should be used in addition to the `simd` feature, since they will also affect the code generated by the simd library. 
\ No newline at end of file +```ignore +RUSTFLAGS="-C target-cpu=native" +``` + +Enable all features supported by the current CPU, and enable full use of AVX512 + +```ignore +RUSTFLAGS="-C target-cpu=native -C target-feature=-prefer-256-bit" +``` + +Enable all features supported by CPUs more recent than haswell (2013) + +```ignore +RUSTFLAGS="-C target-cpu=haswell" +``` + +For a full list of features and target CPUs use + +```shell +$ rustc --print target-cpus +$ rustc --print target-features +``` diff --git a/arrow/benches/decimal_validate.rs b/arrow/benches/decimal_validate.rs new file mode 100644 index 000000000000..1c9406abc783 --- /dev/null +++ b/arrow/benches/decimal_validate.rs @@ -0,0 +1,83 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#[macro_use] +extern crate criterion; + +use arrow::array::{ + Array, Decimal128Array, Decimal128Builder, Decimal256Array, Decimal256Builder, +}; +use criterion::Criterion; +use num::BigInt; +use rand::Rng; + +extern crate arrow; + +use arrow::util::decimal::Decimal256; + +fn validate_decimal128_array(array: Decimal128Array) { + array.with_precision_and_scale(35, 0).unwrap(); +} + +fn validate_decimal256_array(array: Decimal256Array) { + array.with_precision_and_scale(35, 0).unwrap(); +} + +fn validate_decimal128_benchmark(c: &mut Criterion) { + let mut rng = rand::thread_rng(); + let size: i128 = 20000; + let mut decimal_builder = Decimal128Builder::new(size as usize, 38, 0); + for _ in 0..size { + decimal_builder + .append_value(rng.gen_range::(0..999999999999)) + .unwrap(); + } + let decimal_array = decimal_builder.finish(); + let data = decimal_array.into_data(); + c.bench_function("validate_decimal128_array 20000", |b| { + b.iter(|| { + let array = Decimal128Array::from(data.clone()); + validate_decimal128_array(array); + }) + }); +} + +fn validate_decimal256_benchmark(c: &mut Criterion) { + let mut rng = rand::thread_rng(); + let size: i128 = 20000; + let mut decimal_builder = Decimal256Builder::new(size as usize, 76, 0); + for _ in 0..size { + let v = rng.gen_range::(0..999999999999999); + let decimal = Decimal256::from_big_int(&BigInt::from(v), 76, 0).unwrap(); + decimal_builder.append_value(&decimal).unwrap(); + } + let decimal_array256_data = decimal_builder.finish(); + let data = decimal_array256_data.into_data(); + c.bench_function("validate_decimal256_array 20000", |b| { + b.iter(|| { + let array = Decimal256Array::from(data.clone()); + validate_decimal256_array(array); + }) + }); +} + +criterion_group!( + benches, + validate_decimal128_benchmark, + validate_decimal256_benchmark, +); +criterion_main!(benches); diff --git a/arrow/examples/dynamic_types.rs b/arrow/examples/dynamic_types.rs index f98596f2e777..eefbf6dcd4ff 100644 --- a/arrow/examples/dynamic_types.rs +++ 
b/arrow/examples/dynamic_types.rs @@ -65,10 +65,7 @@ fn main() -> Result<()> { let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(id), Arc::new(nested)])?; - #[cfg(feature = "prettyprint")] - { - print_batches(&[batch.clone()]).unwrap(); - } + print_batches(&[batch.clone()]).unwrap(); process(&batch); Ok(()) @@ -108,8 +105,5 @@ fn process(batch: &RecordBatch) { ) .unwrap(); - #[cfg(feature = "prettyprint")] - { - print_batches(&[projection]).unwrap(); - } + print_batches(&[projection]).unwrap(); } diff --git a/arrow/src/array/array.rs b/arrow/src/array/array.rs index 5c5231296316..9766f857c727 100644 --- a/arrow/src/array/array.rs +++ b/arrow/src/array/array.rs @@ -16,19 +16,16 @@ // under the License. use std::any::Any; -use std::convert::{From, TryFrom}; +use std::convert::From; use std::fmt; use std::sync::Arc; use super::*; -use crate::array::equal_json::JsonEqual; use crate::buffer::{Buffer, MutableBuffer}; -use crate::error::Result; -use crate::ffi; /// Trait for dealing with different types of array at runtime when the type of the /// array is not known in advance. -pub trait Array: fmt::Debug + Send + Sync + JsonEqual { +pub trait Array: fmt::Debug + Send + Sync { /// Returns the array as [`Any`](std::any::Any) so that it can be /// downcasted to a specific implementation. /// @@ -216,15 +213,6 @@ pub trait Array: fmt::Debug + Send + Sync + JsonEqual { self.data_ref().get_array_memory_size() + std::mem::size_of_val(self) - std::mem::size_of::() } - - /// returns two pointers that represent this array in the C Data Interface (FFI) - fn to_raw( - &self, - ) -> Result<(*const ffi::FFI_ArrowArray, *const ffi::FFI_ArrowSchema)> { - let data = self.data().clone(); - let array = ffi::ArrowArray::try_from(data)?; - Ok(ffi::ArrowArray::into_raw(array)) - } } /// A reference-counted reference to a generic `Array`. @@ -287,14 +275,6 @@ impl Array for ArrayRef { fn get_array_memory_size(&self) -> usize { self.as_ref().get_array_memory_size() } - - fn to_raw( - &self, - ) -> Result<(*const ffi::FFI_ArrowArray, *const ffi::FFI_ArrowSchema)> { - let data = self.data().clone(); - let array = ffi::ArrowArray::try_from(data)?; - Ok(ffi::ArrowArray::into_raw(array)) - } } impl<'a, T: Array> Array for &'a T { @@ -353,12 +333,6 @@ impl<'a, T: Array> Array for &'a T { fn get_array_memory_size(&self) -> usize { T::get_array_memory_size(self) } - - fn to_raw( - &self, - ) -> Result<(*const ffi::FFI_ArrowArray, *const ffi::FFI_ArrowSchema)> { - T::to_raw(self) - } } /// A generic trait for accessing the values of an [`Array`] @@ -482,7 +456,7 @@ pub fn make_array(data: ArrayData) -> ArrayRef { dt => panic!("Unexpected dictionary key type {:?}", dt), }, DataType::Null => Arc::new(NullArray::from(data)) as ArrayRef, - DataType::Decimal(_, _) => Arc::new(Decimal128Array::from(data)) as ArrayRef, + DataType::Decimal128(_, _) => Arc::new(Decimal128Array::from(data)) as ArrayRef, DataType::Decimal256(_, _) => Arc::new(Decimal256Array::from(data)) as ArrayRef, dt => panic!("Unexpected data type {:?}", dt), } @@ -647,7 +621,7 @@ pub fn new_null_array(data_type: &DataType, length: usize) -> ArrayRef { ) }) } - DataType::Decimal(_, _) => { + DataType::Decimal128(_, _) => { new_null_sized_decimal(data_type, length, std::mem::size_of::()) } DataType::Decimal256(_, _) => new_null_sized_decimal(data_type, length, 32), @@ -733,42 +707,6 @@ fn new_null_sized_decimal( }) } -/// Creates a new array from two FFI pointers. 
Used to import arrays from the C Data Interface -/// # Safety -/// Assumes that these pointers represent valid C Data Interfaces, both in memory -/// representation and lifetime via the `release` mechanism. -pub unsafe fn make_array_from_raw( - array: *const ffi::FFI_ArrowArray, - schema: *const ffi::FFI_ArrowSchema, -) -> Result { - let array = ffi::ArrowArray::try_from_raw(array, schema)?; - let data = ArrayData::try_from(array)?; - Ok(make_array(data)) -} - -/// Exports an array to raw pointers of the C Data Interface provided by the consumer. -/// # Safety -/// Assumes that these pointers represent valid C Data Interfaces, both in memory -/// representation and lifetime via the `release` mechanism. -/// -/// This function copies the content of two FFI structs [ffi::FFI_ArrowArray] and -/// [ffi::FFI_ArrowSchema] in the array to the location pointed by the raw pointers. -/// Usually the raw pointers are provided by the array data consumer. -pub unsafe fn export_array_into_raw( - src: ArrayRef, - out_array: *mut ffi::FFI_ArrowArray, - out_schema: *mut ffi::FFI_ArrowSchema, -) -> Result<()> { - let data = src.data(); - let array = ffi::FFI_ArrowArray::new(data); - let schema = ffi::FFI_ArrowSchema::try_from(data.data_type())?; - - std::ptr::write_unaligned(out_array, array); - std::ptr::write_unaligned(out_schema, schema); - - Ok(()) -} - // Helper function for printing potentially long arrays. pub(super) fn print_long_array( array: &A, diff --git a/arrow/src/array/array_binary.rs b/arrow/src/array/array_binary.rs index 4848a25a058d..12c6978107d1 100644 --- a/arrow/src/array/array_binary.rs +++ b/arrow/src/array/array_binary.rs @@ -20,12 +20,11 @@ use std::fmt; use std::{any::Any, iter::FromIterator}; use super::{ - array::print_long_array, raw_pointer::RawPtrBox, Array, ArrayData, - FixedSizeListArray, GenericBinaryIter, GenericListArray, OffsetSizeTrait, + array::print_long_array, raw_pointer::RawPtrBox, Array, ArrayData, GenericBinaryIter, + GenericListArray, OffsetSizeTrait, }; use crate::array::array::ArrayAccessor; use crate::buffer::Buffer; -use crate::error::{ArrowError, Result}; use crate::util::bit_util; use crate::{buffer::MutableBuffer, datatypes::DataType}; @@ -38,15 +37,17 @@ pub struct GenericBinaryArray { } impl GenericBinaryArray { + /// Data type of the array. + pub const DATA_TYPE: DataType = if OffsetSize::IS_LARGE { + DataType::LargeBinary + } else { + DataType::Binary + }; + /// Get the data type of the array. - // Declare this function as `pub const fn` after - // https://github.com/rust-lang/rust/issues/93706 is merged. - pub fn get_data_type() -> DataType { - if OffsetSize::IS_LARGE { - DataType::LargeBinary - } else { - DataType::Binary - } + #[deprecated(note = "please use `Self::DATA_TYPE` instead")] + pub const fn get_data_type() -> DataType { + Self::DATA_TYPE } /// Returns the length for value at index `i`. @@ -135,21 +136,35 @@ impl GenericBinaryArray { fn from_list(v: GenericListArray) -> Self { assert_eq!( - v.data_ref().child_data()[0].child_data().len(), + v.data_ref().child_data().len(), + 1, + "BinaryArray can only be created from list array of u8 values \ + (i.e. List>)." + ); + let child_data = &v.data_ref().child_data()[0]; + + assert_eq!( + child_data.child_data().len(), 0, "BinaryArray can only be created from list array of u8 values \ (i.e. List>)." ); assert_eq!( - v.data_ref().child_data()[0].data_type(), + child_data.data_type(), &DataType::UInt8, "BinaryArray can only be created from List arrays, mismatched data types." 
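// Construction from a list array now respects slicing: the rebuilt ArrayData
// carries the list's own offset and slices the child value buffer by the
// child's offset, and a child containing nulls is rejected by the assertion
// below.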
); + assert_eq!( + child_data.null_count(), + 0, + "The child array cannot contain null values." + ); - let builder = ArrayData::builder(Self::get_data_type()) + let builder = ArrayData::builder(Self::DATA_TYPE) .len(v.len()) + .offset(v.offset()) .add_buffer(v.data_ref().buffers()[0].clone()) - .add_buffer(v.data_ref().child_data()[0].buffers()[0].clone()) + .add_buffer(child_data.buffers()[0].slice(child_data.offset())) .null_bit_buffer(v.data_ref().null_buffer().cloned()); let data = unsafe { builder.build_unchecked() }; @@ -184,7 +199,7 @@ impl GenericBinaryArray { assert!(!offsets.is_empty()); // wrote at least one let actual_len = (offsets.len() / std::mem::size_of::()) - 1; - let array_data = ArrayData::builder(Self::get_data_type()) + let array_data = ArrayData::builder(Self::DATA_TYPE) .len(actual_len) .add_buffer(offsets.into()) .add_buffer(values.into()); @@ -221,7 +236,7 @@ impl<'a, T: OffsetSizeTrait> GenericBinaryArray { impl fmt::Debug for GenericBinaryArray { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - let prefix = if OffsetSize::IS_LARGE { "Large" } else { "" }; + let prefix = OffsetSize::PREFIX; write!(f, "{}BinaryArray\n[\n", prefix)?; print_long_array(self, f, |array, index, f| { @@ -263,7 +278,7 @@ impl From for GenericBinaryArray Self { assert_eq!( data.data_type(), - &Self::get_data_type(), + &Self::DATA_TYPE, "[Large]BinaryArray expects Datatype::[Large]Binary" ); assert_eq!( @@ -287,6 +302,26 @@ impl From> for Array } } +impl From>> + for GenericBinaryArray +{ + fn from(v: Vec>) -> Self { + Self::from_opt_vec(v) + } +} + +impl From> for GenericBinaryArray { + fn from(v: Vec<&[u8]>) -> Self { + Self::from_iter_values(v) + } +} + +impl From> for GenericBinaryArray { + fn from(v: GenericListArray) -> Self { + Self::from_list(v) + } +} + impl FromIterator> for GenericBinaryArray where @@ -320,7 +355,7 @@ where // calculate actual data_len, which may be different from the iterator's upper bound let data_len = offsets.len() - 1; - let array_data = ArrayData::builder(Self::get_data_type()) + let array_data = ArrayData::builder(Self::DATA_TYPE) .len(data_len) .add_buffer(Buffer::from_slice_ref(&offsets)) .add_buffer(Buffer::from_slice_ref(&values)) @@ -330,6 +365,15 @@ where } } +impl<'a, T: OffsetSizeTrait> IntoIterator for &'a GenericBinaryArray { + type Item = Option<&'a [u8]>; + type IntoIter = GenericBinaryIter<'a, T>; + + fn into_iter(self) -> Self::IntoIter { + GenericBinaryIter::<'a, T>::new(self) + } +} + /// An array where each element contains 0 or more bytes. /// The byte length of each element is represented by an i32. /// @@ -410,367 +454,10 @@ pub type BinaryArray = GenericBinaryArray; /// pub type LargeBinaryArray = GenericBinaryArray; -impl<'a, T: OffsetSizeTrait> IntoIterator for &'a GenericBinaryArray { - type Item = Option<&'a [u8]>; - type IntoIter = GenericBinaryIter<'a, T>; - - fn into_iter(self) -> Self::IntoIter { - GenericBinaryIter::<'a, T>::new(self) - } -} - -impl From>> - for GenericBinaryArray -{ - fn from(v: Vec>) -> Self { - Self::from_opt_vec(v) - } -} - -impl From> for GenericBinaryArray { - fn from(v: Vec<&[u8]>) -> Self { - Self::from_iter_values(v) - } -} - -impl From> for GenericBinaryArray { - fn from(v: GenericListArray) -> Self { - Self::from_list(v) - } -} - -/// An array where each element is a fixed-size sequence of bytes. -/// -/// # Examples -/// -/// Create an array from an iterable argument of byte slices. 
-/// -/// ``` -/// use arrow::array::{Array, FixedSizeBinaryArray}; -/// let input_arg = vec![ vec![1, 2], vec![3, 4], vec![5, 6] ]; -/// let arr = FixedSizeBinaryArray::try_from_iter(input_arg.into_iter()).unwrap(); -/// -/// assert_eq!(3, arr.len()); -/// -/// ``` -/// Create an array from an iterable argument of sparse byte slices. -/// Sparsity means that the input argument can contain `None` items. -/// ``` -/// use arrow::array::{Array, FixedSizeBinaryArray}; -/// let input_arg = vec![ None, Some(vec![7, 8]), Some(vec![9, 10]), None, Some(vec![13, 14]) ]; -/// let arr = FixedSizeBinaryArray::try_from_sparse_iter(input_arg.into_iter()).unwrap(); -/// assert_eq!(5, arr.len()) -/// -/// ``` -/// -pub struct FixedSizeBinaryArray { - data: ArrayData, - value_data: RawPtrBox, - length: i32, -} - -impl FixedSizeBinaryArray { - /// Returns the element at index `i` as a byte slice. - pub fn value(&self, i: usize) -> &[u8] { - assert!( - i < self.data.len(), - "FixedSizeBinaryArray out of bounds access" - ); - let offset = i + self.data.offset(); - unsafe { - let pos = self.value_offset_at(offset); - std::slice::from_raw_parts( - self.value_data.as_ptr().offset(pos as isize), - (self.value_offset_at(offset + 1) - pos) as usize, - ) - } - } - - /// Returns the element at index `i` as a byte slice. - /// # Safety - /// Caller is responsible for ensuring that the index is within the bounds of the array - pub unsafe fn value_unchecked(&self, i: usize) -> &[u8] { - let offset = i + self.data.offset(); - let pos = self.value_offset_at(offset); - std::slice::from_raw_parts( - self.value_data.as_ptr().offset(pos as isize), - (self.value_offset_at(offset + 1) - pos) as usize, - ) - } - - /// Returns the offset for the element at index `i`. - /// - /// Note this doesn't do any bound checking, for performance reason. - #[inline] - pub fn value_offset(&self, i: usize) -> i32 { - self.value_offset_at(self.data.offset() + i) - } - - /// Returns the length for an element. - /// - /// All elements have the same length as the array is a fixed size. - #[inline] - pub fn value_length(&self) -> i32 { - self.length - } - - /// Returns a clone of the value data buffer - pub fn value_data(&self) -> Buffer { - self.data.buffers()[0].clone() - } - - /// Create an array from an iterable argument of sparse byte slices. - /// Sparsity means that items returned by the iterator are optional, i.e input argument can - /// contain `None` items. - /// - /// # Examples - /// - /// ``` - /// use arrow::array::FixedSizeBinaryArray; - /// let input_arg = vec![ - /// None, - /// Some(vec![7, 8]), - /// Some(vec![9, 10]), - /// None, - /// Some(vec![13, 14]), - /// None, - /// ]; - /// let array = FixedSizeBinaryArray::try_from_sparse_iter(input_arg.into_iter()).unwrap(); - /// ``` - /// - /// # Errors - /// - /// Returns error if argument has length zero, or sizes of nested slices don't match. 
- pub fn try_from_sparse_iter(mut iter: T) -> Result - where - T: Iterator>, - U: AsRef<[u8]>, - { - let mut len = 0; - let mut size = None; - let mut byte = 0; - let mut null_buf = MutableBuffer::from_len_zeroed(0); - let mut buffer = MutableBuffer::from_len_zeroed(0); - let mut prepend = 0; - iter.try_for_each(|item| -> Result<()> { - // extend null bitmask by one byte per each 8 items - if byte == 0 { - null_buf.push(0u8); - byte = 8; - } - byte -= 1; - - if let Some(slice) = item { - let slice = slice.as_ref(); - if let Some(size) = size { - if size != slice.len() { - return Err(ArrowError::InvalidArgumentError(format!( - "Nested array size mismatch: one is {}, and the other is {}", - size, - slice.len() - ))); - } - } else { - size = Some(slice.len()); - buffer.extend_zeros(slice.len() * prepend); - } - bit_util::set_bit(null_buf.as_slice_mut(), len); - buffer.extend_from_slice(slice); - } else if let Some(size) = size { - buffer.extend_zeros(size); - } else { - prepend += 1; - } - - len += 1; - - Ok(()) - })?; - - if len == 0 { - return Err(ArrowError::InvalidArgumentError( - "Input iterable argument has no data".to_owned(), - )); - } - - let size = size.unwrap_or(0); - let array_data = unsafe { - ArrayData::new_unchecked( - DataType::FixedSizeBinary(size as i32), - len, - None, - Some(null_buf.into()), - 0, - vec![buffer.into()], - vec![], - ) - }; - Ok(FixedSizeBinaryArray::from(array_data)) - } - - /// Create an array from an iterable argument of byte slices. - /// - /// # Examples - /// - /// ``` - /// use arrow::array::FixedSizeBinaryArray; - /// let input_arg = vec![ - /// vec![1, 2], - /// vec![3, 4], - /// vec![5, 6], - /// ]; - /// let array = FixedSizeBinaryArray::try_from_iter(input_arg.into_iter()).unwrap(); - /// ``` - /// - /// # Errors - /// - /// Returns error if argument has length zero, or sizes of nested slices don't match. 
- pub fn try_from_iter(mut iter: T) -> Result - where - T: Iterator, - U: AsRef<[u8]>, - { - let mut len = 0; - let mut size = None; - let mut buffer = MutableBuffer::from_len_zeroed(0); - iter.try_for_each(|item| -> Result<()> { - let slice = item.as_ref(); - if let Some(size) = size { - if size != slice.len() { - return Err(ArrowError::InvalidArgumentError(format!( - "Nested array size mismatch: one is {}, and the other is {}", - size, - slice.len() - ))); - } - } else { - size = Some(slice.len()); - } - buffer.extend_from_slice(slice); - - len += 1; - - Ok(()) - })?; - - if len == 0 { - return Err(ArrowError::InvalidArgumentError( - "Input iterable argument has no data".to_owned(), - )); - } - - let size = size.unwrap_or(0); - let array_data = ArrayData::builder(DataType::FixedSizeBinary(size as i32)) - .len(len) - .add_buffer(buffer.into()); - let array_data = unsafe { array_data.build_unchecked() }; - Ok(FixedSizeBinaryArray::from(array_data)) - } - - #[inline] - fn value_offset_at(&self, i: usize) -> i32 { - self.length * i as i32 - } -} - -impl From for FixedSizeBinaryArray { - fn from(data: ArrayData) -> Self { - assert_eq!( - data.buffers().len(), - 1, - "FixedSizeBinaryArray data should contain 1 buffer only (values)" - ); - let value_data = data.buffers()[0].as_ptr(); - let length = match data.data_type() { - DataType::FixedSizeBinary(len) => *len, - _ => panic!("Expected data type to be FixedSizeBinary"), - }; - Self { - data, - value_data: unsafe { RawPtrBox::new(value_data) }, - length, - } - } -} - -impl From for ArrayData { - fn from(array: FixedSizeBinaryArray) -> Self { - array.data - } -} - -/// Creates a `FixedSizeBinaryArray` from `FixedSizeList` array -impl From for FixedSizeBinaryArray { - fn from(v: FixedSizeListArray) -> Self { - assert_eq!( - v.data_ref().child_data()[0].child_data().len(), - 0, - "FixedSizeBinaryArray can only be created from list array of u8 values \ - (i.e. FixedSizeList>)." - ); - assert_eq!( - v.data_ref().child_data()[0].data_type(), - &DataType::UInt8, - "FixedSizeBinaryArray can only be created from FixedSizeList arrays, mismatched data types." 
- ); - - let builder = ArrayData::builder(DataType::FixedSizeBinary(v.value_length())) - .len(v.len()) - .add_buffer(v.data_ref().child_data()[0].buffers()[0].clone()) - .null_bit_buffer(v.data_ref().null_buffer().cloned()); - - let data = unsafe { builder.build_unchecked() }; - Self::from(data) - } -} - -impl From>> for FixedSizeBinaryArray { - fn from(v: Vec>) -> Self { - Self::try_from_sparse_iter(v.into_iter()).unwrap() - } -} - -impl From> for FixedSizeBinaryArray { - fn from(v: Vec<&[u8]>) -> Self { - Self::try_from_iter(v.into_iter()).unwrap() - } -} - -impl fmt::Debug for FixedSizeBinaryArray { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "FixedSizeBinaryArray<{}>\n[\n", self.value_length())?; - print_long_array(self, f, |array, index, f| { - fmt::Debug::fmt(&array.value(index), f) - })?; - write!(f, "]") - } -} - -impl Array for FixedSizeBinaryArray { - fn as_any(&self) -> &dyn Any { - self - } - - fn data(&self) -> &ArrayData { - &self.data - } - - fn into_data(self) -> ArrayData { - self.into() - } -} - #[cfg(test)] mod tests { - use std::sync::Arc; - - use crate::{ - array::{LargeListArray, ListArray}, - datatypes::{Field, Schema}, - record_batch::RecordBatch, - }; - use super::*; + use crate::{array::ListArray, datatypes::Field}; #[test] fn test_binary_array() { @@ -903,37 +590,36 @@ mod tests { assert_eq!(7, binary_array.value_length(1)); } - #[test] - fn test_binary_array_from_list_array() { - let values: [u8; 12] = [ - b'h', b'e', b'l', b'l', b'o', b'p', b'a', b'r', b'q', b'u', b'e', b't', - ]; - let values_data = ArrayData::builder(DataType::UInt8) + fn _test_generic_binary_array_from_list_array() { + let values = b"helloparquet"; + let child_data = ArrayData::builder(DataType::UInt8) .len(12) .add_buffer(Buffer::from(&values[..])) .build() .unwrap(); - let offsets: [i32; 4] = [0, 5, 5, 12]; + let offsets = [0, 5, 5, 12].map(|n| O::from_usize(n).unwrap()); // Array data: ["hello", "", "parquet"] - let array_data1 = ArrayData::builder(DataType::Binary) + let array_data1 = ArrayData::builder(GenericBinaryArray::::DATA_TYPE) .len(3) .add_buffer(Buffer::from_slice_ref(&offsets)) .add_buffer(Buffer::from_slice_ref(&values)) .build() .unwrap(); - let binary_array1 = BinaryArray::from(array_data1); + let binary_array1 = GenericBinaryArray::::from(array_data1); + + let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Box::new( + Field::new("item", DataType::UInt8, false), + )); - let data_type = - DataType::List(Box::new(Field::new("item", DataType::UInt8, false))); let array_data2 = ArrayData::builder(data_type) .len(3) .add_buffer(Buffer::from_slice_ref(&offsets)) - .add_child_data(values_data) + .add_child_data(child_data) .build() .unwrap(); - let list_array = ListArray::from(array_data2); - let binary_array2 = BinaryArray::from(list_array); + let list_array = GenericListArray::::from(array_data2); + let binary_array2 = GenericBinaryArray::::from(list_array); assert_eq!(2, binary_array2.data().buffers().len()); assert_eq!(0, binary_array2.data().child_data().len()); @@ -950,51 +636,98 @@ mod tests { } } + #[test] + fn test_binary_array_from_list_array() { + _test_generic_binary_array_from_list_array::(); + } + #[test] fn test_large_binary_array_from_list_array() { - let values: [u8; 12] = [ - b'h', b'e', b'l', b'l', b'o', b'p', b'a', b'r', b'q', b'u', b'e', b't', - ]; - let values_data = ArrayData::builder(DataType::UInt8) - .len(12) + _test_generic_binary_array_from_list_array::(); + } + + fn 
_test_generic_binary_array_from_list_array_with_offset() { + let values = b"HelloArrowAndParquet"; + // b"ArrowAndParquet" + let child_data = ArrayData::builder(DataType::UInt8) + .len(15) + .offset(5) .add_buffer(Buffer::from(&values[..])) .build() .unwrap(); - let offsets: [i64; 4] = [0, 5, 5, 12]; - // Array data: ["hello", "", "parquet"] - let array_data1 = ArrayData::builder(DataType::LargeBinary) - .len(3) + let offsets = [0, 5, 8, 15].map(|n| O::from_usize(n).unwrap()); + let null_buffer = Buffer::from_slice_ref(&[0b101]); + let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Box::new( + Field::new("item", DataType::UInt8, false), + )); + + // [None, Some(b"Parquet")] + let array_data = ArrayData::builder(data_type) + .len(2) + .offset(1) .add_buffer(Buffer::from_slice_ref(&offsets)) - .add_buffer(Buffer::from_slice_ref(&values)) + .null_bit_buffer(Some(null_buffer)) + .add_child_data(child_data) .build() .unwrap(); - let binary_array1 = LargeBinaryArray::from(array_data1); + let list_array = GenericListArray::::from(array_data); + let binary_array = GenericBinaryArray::::from(list_array); - let data_type = - DataType::LargeList(Box::new(Field::new("item", DataType::UInt8, false))); - let array_data2 = ArrayData::builder(data_type) - .len(3) + assert_eq!(2, binary_array.len()); + assert_eq!(1, binary_array.null_count()); + assert!(binary_array.is_null(0)); + assert!(binary_array.is_valid(1)); + assert_eq!(b"Parquet", binary_array.value(1)); + } + + #[test] + fn test_binary_array_from_list_array_with_offset() { + _test_generic_binary_array_from_list_array_with_offset::(); + } + + #[test] + fn test_large_binary_array_from_list_array_with_offset() { + _test_generic_binary_array_from_list_array_with_offset::(); + } + + fn _test_generic_binary_array_from_list_array_with_child_nulls_failed< + O: OffsetSizeTrait, + >() { + let values = b"HelloArrow"; + let child_data = ArrayData::builder(DataType::UInt8) + .len(10) + .add_buffer(Buffer::from(&values[..])) + .null_bit_buffer(Some(Buffer::from_slice_ref(&[0b1010101010]))) + .build() + .unwrap(); + + let offsets = [0, 5, 10].map(|n| O::from_usize(n).unwrap()); + let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Box::new( + Field::new("item", DataType::UInt8, false), + )); + + // [None, Some(b"Parquet")] + let array_data = ArrayData::builder(data_type) + .len(2) .add_buffer(Buffer::from_slice_ref(&offsets)) - .add_child_data(values_data) + .add_child_data(child_data) .build() .unwrap(); - let list_array = LargeListArray::from(array_data2); - let binary_array2 = LargeBinaryArray::from(list_array); + let list_array = GenericListArray::::from(array_data); + drop(GenericBinaryArray::::from(list_array)); + } - assert_eq!(2, binary_array2.data().buffers().len()); - assert_eq!(0, binary_array2.data().child_data().len()); + #[test] + #[should_panic(expected = "The child array cannot contain null values.")] + fn test_binary_array_from_list_array_with_child_nulls_failed() { + _test_generic_binary_array_from_list_array_with_child_nulls_failed::(); + } - assert_eq!(binary_array1.len(), binary_array2.len()); - assert_eq!(binary_array1.null_count(), binary_array2.null_count()); - assert_eq!(binary_array1.value_offsets(), binary_array2.value_offsets()); - for i in 0..binary_array1.len() { - assert_eq!(binary_array1.value(i), binary_array2.value(i)); - assert_eq!(binary_array1.value(i), unsafe { - binary_array2.value_unchecked(i) - }); - assert_eq!(binary_array1.value_length(i), binary_array2.value_length(i)); - } + #[test] + 
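// Like the i32 variant above, this exercises the new guard: building a
// GenericBinaryArray from a list whose child data contains nulls must panic
// with "The child array cannot contain null values."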
#[should_panic(expected = "The child array cannot contain null values.")] + fn test_large_binary_array_from_list_array_with_child_nulls_failed() { + _test_generic_binary_array_from_list_array_with_child_nulls_failed::(); } fn test_generic_binary_array_from_opt_vec() { @@ -1074,87 +807,6 @@ mod tests { drop(BinaryArray::from(list_array)); } - #[test] - fn test_fixed_size_binary_array() { - let values: [u8; 15] = *b"hellotherearrow"; - - let array_data = ArrayData::builder(DataType::FixedSizeBinary(5)) - .len(3) - .add_buffer(Buffer::from(&values[..])) - .build() - .unwrap(); - let fixed_size_binary_array = FixedSizeBinaryArray::from(array_data); - assert_eq!(3, fixed_size_binary_array.len()); - assert_eq!(0, fixed_size_binary_array.null_count()); - assert_eq!( - [b'h', b'e', b'l', b'l', b'o'], - fixed_size_binary_array.value(0) - ); - assert_eq!( - [b't', b'h', b'e', b'r', b'e'], - fixed_size_binary_array.value(1) - ); - assert_eq!( - [b'a', b'r', b'r', b'o', b'w'], - fixed_size_binary_array.value(2) - ); - assert_eq!(5, fixed_size_binary_array.value_length()); - assert_eq!(10, fixed_size_binary_array.value_offset(2)); - for i in 0..3 { - assert!(fixed_size_binary_array.is_valid(i)); - assert!(!fixed_size_binary_array.is_null(i)); - } - - // Test binary array with offset - let array_data = ArrayData::builder(DataType::FixedSizeBinary(5)) - .len(2) - .offset(1) - .add_buffer(Buffer::from(&values[..])) - .build() - .unwrap(); - let fixed_size_binary_array = FixedSizeBinaryArray::from(array_data); - assert_eq!( - [b't', b'h', b'e', b'r', b'e'], - fixed_size_binary_array.value(0) - ); - assert_eq!( - [b'a', b'r', b'r', b'o', b'w'], - fixed_size_binary_array.value(1) - ); - assert_eq!(2, fixed_size_binary_array.len()); - assert_eq!(5, fixed_size_binary_array.value_offset(0)); - assert_eq!(5, fixed_size_binary_array.value_length()); - assert_eq!(10, fixed_size_binary_array.value_offset(1)); - } - - #[test] - #[should_panic( - expected = "FixedSizeBinaryArray can only be created from FixedSizeList arrays" - )] - // Different error messages, so skip for now - // https://github.com/apache/arrow-rs/issues/1545 - #[cfg(not(feature = "force_validate"))] - fn test_fixed_size_binary_array_from_incorrect_list_array() { - let values: [u32; 12] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]; - let values_data = ArrayData::builder(DataType::UInt32) - .len(12) - .add_buffer(Buffer::from_slice_ref(&values)) - .build() - .unwrap(); - - let array_data = unsafe { - ArrayData::builder(DataType::FixedSizeList( - Box::new(Field::new("item", DataType::Binary, false)), - 4, - )) - .len(3) - .add_child_data(values_data) - .build_unchecked() - }; - let list_array = FixedSizeListArray::from(array_data); - drop(FixedSizeBinaryArray::from(list_array)); - } - #[test] #[should_panic(expected = "BinaryArray out of bounds access")] fn test_binary_array_get_value_index_out_of_bound() { @@ -1171,114 +823,6 @@ mod tests { binary_array.value(4); } - #[test] - fn test_binary_array_fmt_debug() { - let values: [u8; 15] = *b"hellotherearrow"; - - let array_data = ArrayData::builder(DataType::FixedSizeBinary(5)) - .len(3) - .add_buffer(Buffer::from(&values[..])) - .build() - .unwrap(); - let arr = FixedSizeBinaryArray::from(array_data); - assert_eq!( - "FixedSizeBinaryArray<5>\n[\n [104, 101, 108, 108, 111],\n [116, 104, 101, 114, 101],\n [97, 114, 114, 111, 119],\n]", - format!("{:?}", arr) - ); - } - - #[test] - fn test_fixed_size_binary_array_from_iter() { - let input_arg = vec![vec![1, 2], vec![3, 4], vec![5, 6]]; - let arr = 
FixedSizeBinaryArray::try_from_iter(input_arg.into_iter()).unwrap(); - - assert_eq!(2, arr.value_length()); - assert_eq!(3, arr.len()) - } - - #[test] - fn test_all_none_fixed_size_binary_array_from_sparse_iter() { - let none_option: Option<[u8; 32]> = None; - let input_arg = vec![none_option, none_option, none_option]; - let arr = - FixedSizeBinaryArray::try_from_sparse_iter(input_arg.into_iter()).unwrap(); - assert_eq!(0, arr.value_length()); - assert_eq!(3, arr.len()) - } - - #[test] - fn test_fixed_size_binary_array_from_sparse_iter() { - let input_arg = vec![ - None, - Some(vec![7, 8]), - Some(vec![9, 10]), - None, - Some(vec![13, 14]), - ]; - let arr = - FixedSizeBinaryArray::try_from_sparse_iter(input_arg.into_iter()).unwrap(); - assert_eq!(2, arr.value_length()); - assert_eq!(5, arr.len()) - } - - #[test] - fn test_fixed_size_binary_array_from_vec() { - let values = vec!["one".as_bytes(), b"two", b"six", b"ten"]; - let array = FixedSizeBinaryArray::from(values); - assert_eq!(array.len(), 4); - assert_eq!(array.null_count(), 0); - assert_eq!(array.value(0), b"one"); - assert_eq!(array.value(1), b"two"); - assert_eq!(array.value(2), b"six"); - assert_eq!(array.value(3), b"ten"); - assert!(!array.is_null(0)); - assert!(!array.is_null(1)); - assert!(!array.is_null(2)); - assert!(!array.is_null(3)); - } - - #[test] - #[should_panic(expected = "Nested array size mismatch: one is 3, and the other is 5")] - fn test_fixed_size_binary_array_from_vec_incorrect_length() { - let values = vec!["one".as_bytes(), b"two", b"three", b"four"]; - let _ = FixedSizeBinaryArray::from(values); - } - - #[test] - fn test_fixed_size_binary_array_from_opt_vec() { - let values = vec![ - Some("one".as_bytes()), - Some(b"two"), - None, - Some(b"six"), - Some(b"ten"), - ]; - let array = FixedSizeBinaryArray::from(values); - assert_eq!(array.len(), 5); - assert_eq!(array.value(0), b"one"); - assert_eq!(array.value(1), b"two"); - assert_eq!(array.value(3), b"six"); - assert_eq!(array.value(4), b"ten"); - assert!(!array.is_null(0)); - assert!(!array.is_null(1)); - assert!(array.is_null(2)); - assert!(!array.is_null(3)); - assert!(!array.is_null(4)); - } - - #[test] - #[should_panic(expected = "Nested array size mismatch: one is 3, and the other is 5")] - fn test_fixed_size_binary_array_from_opt_vec_incorrect_length() { - let values = vec![ - Some("one".as_bytes()), - Some(b"two"), - None, - Some(b"three"), - Some(b"four"), - ]; - let _ = FixedSizeBinaryArray::from(values); - } - #[test] fn test_binary_array_all_null() { let data = vec![None]; @@ -1298,33 +842,4 @@ mod tests { .validate_full() .expect("All null array has valid array data"); } - - #[test] - fn fixed_size_binary_array_all_null() { - let data = vec![None] as Vec>; - let array = FixedSizeBinaryArray::try_from_sparse_iter(data.into_iter()).unwrap(); - array - .data() - .validate_full() - .expect("All null array has valid array data"); - } - - #[test] - // Test for https://github.com/apache/arrow-rs/issues/1390 - #[should_panic( - expected = "column types must match schema types, expected FixedSizeBinary(2) but found FixedSizeBinary(0) at column index 0" - )] - fn fixed_size_binary_array_all_null_in_batch_with_schema() { - let schema = - Schema::new(vec![Field::new("a", DataType::FixedSizeBinary(2), true)]); - - let none_option: Option<[u8; 2]> = None; - let item = FixedSizeBinaryArray::try_from_sparse_iter( - vec![none_option, none_option, none_option].into_iter(), - ) - .unwrap(); - - // Should not panic - RecordBatch::try_new(Arc::new(schema), 
vec![Arc::new(item)]).unwrap(); - } } diff --git a/arrow/src/array/array_decimal.rs b/arrow/src/array/array_decimal.rs index 473160858201..ed1d3102a13e 100644 --- a/arrow/src/array/array_decimal.rs +++ b/arrow/src/array/array_decimal.rs @@ -15,7 +15,8 @@ // specific language governing permissions and limitations // under the License. -use crate::array::{ArrayAccessor, Decimal128Iter, Decimal256Iter}; +use crate::array::ArrayAccessor; +use num::BigInt; use std::borrow::Borrow; use std::convert::From; use std::fmt; @@ -24,14 +25,16 @@ use std::{any::Any, iter::FromIterator}; use super::{ array::print_long_array, raw_pointer::RawPtrBox, Array, ArrayData, FixedSizeListArray, }; -use super::{BooleanBufferBuilder, FixedSizeBinaryArray}; +use super::{BasicDecimalIter, BooleanBufferBuilder, FixedSizeBinaryArray}; #[allow(deprecated)] pub use crate::array::DecimalIter; -use crate::buffer::Buffer; -use crate::datatypes::{validate_decimal_precision, DECIMAL_DEFAULT_SCALE}; -use crate::datatypes::{DataType, DECIMAL128_MAX_PRECISION, DECIMAL128_MAX_SCALE}; +use crate::buffer::{Buffer, MutableBuffer}; +use crate::datatypes::{validate_decimal256_precision_with_lt_bytes, DataType}; +use crate::datatypes::{ + validate_decimal_precision, DECIMAL256_MAX_PRECISION, DECIMAL_DEFAULT_SCALE, +}; use crate::error::{ArrowError, Result}; -use crate::util::decimal::{BasicDecimal, Decimal128, Decimal256}; +use crate::util::decimal::{BasicDecimal, Decimal256}; /// `Decimal128Array` stores fixed width decimal numbers, /// with a fixed precision and scale. @@ -56,7 +59,7 @@ use crate::util::decimal::{BasicDecimal, Decimal128, Decimal256}; /// .with_precision_and_scale(23, 6) /// .unwrap(); /// -/// assert_eq!(&DataType::Decimal(23, 6), decimal_array.data_type()); +/// assert_eq!(&DataType::Decimal128(23, 6), decimal_array.data_type()); /// assert_eq!(8_887_000_000_i128, decimal_array.value(0).as_i128()); /// assert_eq!("8887.000000", decimal_array.value_as_string(0)); /// assert_eq!(3, decimal_array.len()); @@ -67,41 +70,41 @@ use crate::util::decimal::{BasicDecimal, Decimal128, Decimal256}; /// assert_eq!(6, decimal_array.scale()); /// ``` /// -pub struct Decimal128Array { - data: ArrayData, - value_data: RawPtrBox, - precision: usize, - scale: usize, -} +pub type Decimal128Array = BasicDecimalArray<16>; + +pub type Decimal256Array = BasicDecimalArray<32>; -pub struct Decimal256Array { +pub struct BasicDecimalArray { data: ArrayData, value_data: RawPtrBox, precision: usize, scale: usize, } -mod private_decimal { - pub trait DecimalArrayPrivate { - fn raw_value_data_ptr(&self) -> *const u8; - } -} +impl BasicDecimalArray { + pub const VALUE_LENGTH: i32 = BYTE_WIDTH as i32; + const DEFAULT_TYPE: DataType = BasicDecimal::::DEFAULT_TYPE; + pub const MAX_PRECISION: usize = BasicDecimal::::MAX_PRECISION; + pub const MAX_SCALE: usize = BasicDecimal::::MAX_SCALE; + const TYPE_CONSTRUCTOR: fn(usize, usize) -> DataType = + BasicDecimal::::TYPE_CONSTRUCTOR; -pub trait BasicDecimalArray>: - private_decimal::DecimalArrayPrivate -{ - const VALUE_LENGTH: i32; - - fn data(&self) -> &ArrayData; + pub fn data(&self) -> &ArrayData { + &self.data + } /// Return the precision (total digits) that can be stored by this array - fn precision(&self) -> usize; + pub fn precision(&self) -> usize { + self.precision + } /// Return the scale (digits after the decimal) that can be stored by this array - fn scale(&self) -> usize; + pub fn scale(&self) -> usize { + self.scale + } /// Returns the element at index `i`. 
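// Decimal128Array and Decimal256Array are now aliases of the const-generic
// BasicDecimalArray<16> / BasicDecimalArray<32>, so accessors such as
// `value` below are written once, with the byte width supplied through the
// VALUE_LENGTH constant.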
- fn value(&self, i: usize) -> T { + pub fn value(&self, i: usize) -> BasicDecimal { assert!(i < self.data().len(), "Out of bounds access"); unsafe { self.value_unchecked(i) } @@ -110,7 +113,7 @@ pub trait BasicDecimalArray>: /// Returns the element at index `i`. /// # Safety /// Caller is responsible for ensuring that the index is within the bounds of the array - unsafe fn value_unchecked(&self, i: usize) -> T { + pub unsafe fn value_unchecked(&self, i: usize) -> BasicDecimal { let data = self.data(); let offset = i + data.offset(); let raw_val = { @@ -119,15 +122,17 @@ pub trait BasicDecimalArray>: self.raw_value_data_ptr().offset(pos as isize), Self::VALUE_LENGTH as usize, ) + .try_into() + .unwrap() }; - T::new(self.precision(), self.scale(), raw_val) + BasicDecimal::::new(self.precision(), self.scale(), raw_val) } /// Returns the offset for the element at index `i`. /// /// Note this doesn't do any bound checking, for performance reason. #[inline] - fn value_offset(&self, i: usize) -> i32 { + pub fn value_offset(&self, i: usize) -> i32 { self.value_offset_at(self.data().offset() + i) } @@ -135,22 +140,22 @@ pub trait BasicDecimalArray>: /// /// All elements have the same length as the array is a fixed size. #[inline] - fn value_length(&self) -> i32 { + pub fn value_length(&self) -> i32 { Self::VALUE_LENGTH } /// Returns a clone of the value data buffer - fn value_data(&self) -> Buffer { + pub fn value_data(&self) -> Buffer { self.data().buffers()[0].clone() } #[inline] - fn value_offset_at(&self, i: usize) -> i32 { + pub fn value_offset_at(&self, i: usize) -> i32 { Self::VALUE_LENGTH * i as i32 } #[inline] - fn value_as_string(&self, row: usize) -> String { + pub fn value_as_string(&self, row: usize) -> String { self.value(row).to_string() } @@ -158,11 +163,11 @@ pub trait BasicDecimalArray>: /// /// NB: This function does not validate that each value is in the permissible /// range for a decimal - fn from_fixed_size_binary_array( + pub fn from_fixed_size_binary_array( v: FixedSizeBinaryArray, precision: usize, scale: usize, - ) -> U { + ) -> Self { assert!( v.value_length() == Self::VALUE_LENGTH, "Value length of the array ({}) must equal to the byte width of the decimal ({})", @@ -170,43 +175,61 @@ pub trait BasicDecimalArray>: Self::VALUE_LENGTH, ); let data_type = if Self::VALUE_LENGTH == 16 { - DataType::Decimal(precision, scale) + DataType::Decimal128(precision, scale) } else { DataType::Decimal256(precision, scale) }; let builder = v.into_data().into_builder().data_type(data_type); let array_data = unsafe { builder.build_unchecked() }; - U::from(array_data) + Self::from(array_data) } /// Build a decimal array from [`FixedSizeListArray`]. /// /// NB: This function does not validate that each value is in the permissible - /// range for a decimal. And, the null buffer of the child array will be ignored. + /// range for a decimal. #[deprecated(note = "please use `from_fixed_size_binary_array` instead")] - fn from_fixed_size_list_array( + pub fn from_fixed_size_list_array( v: FixedSizeListArray, precision: usize, scale: usize, - ) -> U { + ) -> Self { + assert_eq!( + v.data_ref().child_data().len(), + 1, + "DecimalArray can only be created from list array of u8 values \ + (i.e. FixedSizeList>)." + ); let child_data = &v.data_ref().child_data()[0]; + assert_eq!( child_data.child_data().len(), 0, - "Decimal128Array can only be created from list array of u8 values \ + "DecimalArray can only be created from list array of u8 values \ (i.e. FixedSizeList>)." 
); assert_eq!( child_data.data_type(), &DataType::UInt8, - "Decimal128Array can only be created from FixedSizeList arrays, mismatched data types." + "DecimalArray can only be created from FixedSizeList arrays, mismatched data types." + ); + assert!( + v.value_length() == Self::VALUE_LENGTH, + "Value length of the array ({}) must equal to the byte width of the decimal ({})", + v.value_length(), + Self::VALUE_LENGTH, + ); + assert_eq!( + v.data_ref().child_data()[0].null_count(), + 0, + "The child array cannot contain null values." ); let list_offset = v.offset(); let child_offset = child_data.offset(); let data_type = if Self::VALUE_LENGTH == 16 { - DataType::Decimal(precision, scale) + DataType::Decimal128(precision, scale) } else { DataType::Decimal256(precision, scale) }; @@ -217,39 +240,16 @@ pub trait BasicDecimalArray>: .offset(list_offset); let array_data = unsafe { builder.build_unchecked() }; - U::from(array_data) - } -} - -impl BasicDecimalArray for Decimal128Array { - const VALUE_LENGTH: i32 = 16; - - fn data(&self) -> &ArrayData { - &self.data - } - - fn precision(&self) -> usize { - self.precision - } - - fn scale(&self) -> usize { - self.scale + Self::from(array_data) } -} - -impl BasicDecimalArray for Decimal256Array { - const VALUE_LENGTH: i32 = 32; - fn data(&self) -> &ArrayData { - &self.data - } - - fn precision(&self) -> usize { - self.precision + /// The default precision and scale used when not specified. + pub const fn default_type() -> DataType { + Self::DEFAULT_TYPE } - fn scale(&self) -> usize { - self.scale + fn raw_value_data_ptr(&self) -> *const u8 { + self.value_data.as_ptr() } } @@ -272,28 +272,42 @@ impl Decimal128Array { Decimal128Array::from(data) } - /// Returns a Decimal128Array with the same data as self, with the + // Validates decimal values in this array can be properly interpreted + // with the specified precision. + fn validate_decimal_precision(&self, precision: usize) -> Result<()> { + (0..self.len()).try_for_each(|idx| { + if self.is_valid(idx) { + let decimal = unsafe { self.value_unchecked(idx) }; + validate_decimal_precision(decimal.as_i128(), precision) + } else { + Ok(()) + } + }) + } + + /// Returns a Decimal array with the same data as self, with the /// specified precision. /// /// Returns an Error if: - /// 1. `precision` is larger than [`DECIMAL128_MAX_PRECISION`] - /// 2. `scale` is larger than [`DECIMAL128_MAX_SCALE`]; + /// 1. `precision` is larger than [`Self::MAX_PRECISION`] + /// 2. `scale` is larger than [`Self::MAX_SCALE`]; /// 3. `scale` is > `precision` - pub fn with_precision_and_scale( - mut self, - precision: usize, - scale: usize, - ) -> Result { - if precision > DECIMAL128_MAX_PRECISION { + pub fn with_precision_and_scale(self, precision: usize, scale: usize) -> Result + where + Self: Sized, + { + if precision > Self::MAX_PRECISION { return Err(ArrowError::InvalidArgumentError(format!( "precision {} is greater than max {}", - precision, DECIMAL128_MAX_PRECISION + precision, + Self::MAX_PRECISION ))); } - if scale > DECIMAL128_MAX_SCALE { + if scale > Self::MAX_SCALE { return Err(ArrowError::InvalidArgumentError(format!( "scale {} is greater than max {}", - scale, DECIMAL128_MAX_SCALE + scale, + Self::MAX_SCALE ))); } if scale > precision { @@ -307,63 +321,100 @@ impl Decimal128Array { // precision. 
For performance, only check if the precision is // decreased if precision < self.precision { - for v in self.iter().flatten() { - validate_decimal_precision(v.as_i128(), precision)?; - } + self.validate_decimal_precision(precision)?; } - assert_eq!( - self.data.data_type(), - &DataType::Decimal(self.precision, self.scale) - ); + let data_type = Self::TYPE_CONSTRUCTOR(self.precision, self.scale); + assert_eq!(self.data().data_type(), &data_type); // safety: self.data is valid DataType::Decimal as checked above - let new_data_type = DataType::Decimal(precision, scale); - self.precision = precision; - self.scale = scale; - self.data = self.data.with_data_type(new_data_type); - Ok(self) - } + let new_data_type = Self::TYPE_CONSTRUCTOR(precision, scale); - /// The default precision and scale used when not specified. - pub fn default_type() -> DataType { - // Keep maximum precision - DataType::Decimal(DECIMAL128_MAX_PRECISION, DECIMAL_DEFAULT_SCALE) + Ok(self.data().clone().with_data_type(new_data_type).into()) } } -impl From for Decimal128Array { - fn from(data: ArrayData) -> Self { - assert_eq!( - data.buffers().len(), - 1, - "Decimal128Array data should contain 1 buffer only (values)" - ); - let values = data.buffers()[0].as_ptr(); - let (precision, scale) = match data.data_type() { - DataType::Decimal(precision, scale) => (*precision, *scale), - _ => panic!("Expected data type to be Decimal"), - }; - Self { - data, - value_data: unsafe { RawPtrBox::new(values) }, - precision, - scale, +impl Decimal256Array { + // Validates decimal values in this array can be properly interpreted + // with the specified precision. + fn validate_decimal_precision(&self, precision: usize) -> Result<()> { + (0..self.len()).try_for_each(|idx| { + if self.is_valid(idx) { + let raw_val = unsafe { + let pos = self.value_offset(idx); + std::slice::from_raw_parts( + self.raw_value_data_ptr().offset(pos as isize), + Self::VALUE_LENGTH as usize, + ) + }; + validate_decimal256_precision_with_lt_bytes(raw_val, precision) + } else { + Ok(()) + } + }) + } + + /// Returns a Decimal array with the same data as self, with the + /// specified precision. + /// + /// Returns an Error if: + /// 1. `precision` is larger than [`Self::MAX_PRECISION`] + /// 2. `scale` is larger than [`Self::MAX_SCALE`]; + /// 3. `scale` is > `precision` + pub fn with_precision_and_scale(self, precision: usize, scale: usize) -> Result + where + Self: Sized, + { + if precision > Self::MAX_PRECISION { + return Err(ArrowError::InvalidArgumentError(format!( + "precision {} is greater than max {}", + precision, + Self::MAX_PRECISION + ))); } + if scale > Self::MAX_SCALE { + return Err(ArrowError::InvalidArgumentError(format!( + "scale {} is greater than max {}", + scale, + Self::MAX_SCALE + ))); + } + if scale > precision { + return Err(ArrowError::InvalidArgumentError(format!( + "scale {} is greater than precision {}", + scale, precision + ))); + } + + // Ensure that all values are within the requested + // precision. 
For performance, only check if the precision is + // decreased + if precision < self.precision { + self.validate_decimal_precision(precision)?; + } + + let data_type = Self::TYPE_CONSTRUCTOR(self.precision, self.scale); + assert_eq!(self.data().data_type(), &data_type); + + // safety: self.data is valid DataType::Decimal as checked above + let new_data_type = Self::TYPE_CONSTRUCTOR(precision, scale); + + Ok(self.data().clone().with_data_type(new_data_type).into()) } } -impl From for Decimal256Array { +impl From for BasicDecimalArray { fn from(data: ArrayData) -> Self { assert_eq!( data.buffers().len(), 1, - "Decimal256Array data should contain 1 buffer only (values)" + "DecimalArray data should contain 1 buffer only (values)" ); let values = data.buffers()[0].as_ptr(); - let (precision, scale) = match data.data_type() { - DataType::Decimal256(precision, scale) => (*precision, *scale), - _ => panic!("Expected data type to be Decimal256"), + let (precision, scale) = match (data.data_type(), BYTE_WIDTH) { + (DataType::Decimal128(precision, scale), 16) + | (DataType::Decimal256(precision, scale), 32) => (*precision, *scale), + _ => panic!("Expected data type to be Decimal"), }; Self { data, @@ -384,6 +435,55 @@ impl<'a> Decimal128Array { } } +impl From for Decimal256 { + fn from(bigint: BigInt) -> Self { + Decimal256::from_big_int(&bigint, DECIMAL256_MAX_PRECISION, DECIMAL_DEFAULT_SCALE) + .unwrap() + } +} + +fn build_decimal_array_from( + null_buf: BooleanBufferBuilder, + buffer: Buffer, +) -> BasicDecimalArray { + let data = unsafe { + ArrayData::new_unchecked( + BasicDecimalArray::::default_type(), + null_buf.len(), + None, + Some(null_buf.into()), + 0, + vec![buffer], + vec![], + ) + }; + BasicDecimalArray::::from(data) +} + +impl> FromIterator> for Decimal256Array { + fn from_iter>>(iter: I) -> Self { + let iter = iter.into_iter(); + let (lower, upper) = iter.size_hint(); + let size_hint = upper.unwrap_or(lower); + + let mut null_buf = BooleanBufferBuilder::new(size_hint); + + let mut buffer = MutableBuffer::with_capacity(size_hint); + + iter.for_each(|item| { + if let Some(a) = item { + null_buf.append(true); + buffer.extend_from_slice(Into::into(a).raw_value()); + } else { + null_buf.append(false); + buffer.extend_zeros(32); + } + }); + + build_decimal_array_from::<32>(null_buf, buffer.into()) + } +} + impl>> FromIterator for Decimal128Array { fn from_iter>(iter: I) -> Self { let iter = iter.into_iter(); @@ -405,107 +505,75 @@ impl>> FromIterator for Decimal128Array { }) .collect(); - let data = unsafe { - ArrayData::new_unchecked( - Self::default_type(), - null_buf.len(), - None, - Some(null_buf.into()), - 0, - vec![buffer], - vec![], - ) - }; - Decimal128Array::from(data) + build_decimal_array_from::<16>(null_buf, buffer) } } -macro_rules! 
def_decimal_array { - ($ty:ident, $array_name:expr, $decimal_ty:ident, $iter_ty:ident) => { - impl private_decimal::DecimalArrayPrivate for $ty { - fn raw_value_data_ptr(&self) -> *const u8 { - self.value_data.as_ptr() - } - } - - impl Array for $ty { - fn as_any(&self) -> &dyn Any { - self - } - - fn data(&self) -> &ArrayData { - &self.data - } +impl Array for BasicDecimalArray { + fn as_any(&self) -> &dyn Any { + self + } - fn into_data(self) -> ArrayData { - self.into() - } - } + fn data(&self) -> &ArrayData { + &self.data + } - impl From<$ty> for ArrayData { - fn from(array: $ty) -> Self { - array.data - } - } + fn into_data(self) -> ArrayData { + self.into() + } +} - impl fmt::Debug for $ty { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!( - f, - "{}<{}, {}>\n[\n", - $array_name, self.precision, self.scale - )?; - print_long_array(self, f, |array, index, f| { - let formatted_decimal = array.value_as_string(index); - - write!(f, "{}", formatted_decimal) - })?; - write!(f, "]") - } - } +impl From> for ArrayData { + fn from(array: BasicDecimalArray) -> Self { + array.data + } +} - impl<'a> ArrayAccessor for &'a $ty { - type Item = $decimal_ty; +impl fmt::Debug for BasicDecimalArray { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "Decimal{}Array<{}, {}>\n[\n", + BYTE_WIDTH * 8, + self.precision, + self.scale + )?; + print_long_array(self, f, |array, index, f| { + let formatted_decimal = array.value_as_string(index); + + write!(f, "{}", formatted_decimal) + })?; + write!(f, "]") + } +} - fn value(&self, index: usize) -> Self::Item { - $ty::value(self, index) - } +impl<'a, const BYTE_WIDTH: usize> ArrayAccessor for &'a BasicDecimalArray { + type Item = BasicDecimal; - unsafe fn value_unchecked(&self, index: usize) -> Self::Item { - $ty::value_unchecked(self, index) - } - } + fn value(&self, index: usize) -> Self::Item { + BasicDecimalArray::::value(self, index) + } - impl<'a> IntoIterator for &'a $ty { - type Item = Option<$decimal_ty>; - type IntoIter = $iter_ty<'a>; + unsafe fn value_unchecked(&self, index: usize) -> Self::Item { + BasicDecimalArray::::value_unchecked(self, index) + } +} - fn into_iter(self) -> Self::IntoIter { - $iter_ty::<'a>::new(self) - } - } +impl<'a, const BYTE_WIDTH: usize> IntoIterator for &'a BasicDecimalArray { + type Item = Option>; + type IntoIter = BasicDecimalIter<'a, BYTE_WIDTH>; - impl<'a> $ty { - /// constructs a new iterator - pub fn iter(&'a self) -> $iter_ty<'a> { - $iter_ty::<'a>::new(self) - } - } - }; + fn into_iter(self) -> Self::IntoIter { + BasicDecimalIter::<'a, BYTE_WIDTH>::new(self) + } } -def_decimal_array!( - Decimal128Array, - "Decimal128Array", - Decimal128, - Decimal128Iter -); -def_decimal_array!( - Decimal256Array, - "Decimal256Array", - Decimal256, - Decimal256Iter -); +impl<'a, const BYTE_WIDTH: usize> BasicDecimalArray { + /// constructs a new iterator + pub fn iter(&'a self) -> BasicDecimalIter<'a, BYTE_WIDTH> { + BasicDecimalIter::<'a, BYTE_WIDTH>::new(self) + } +} #[cfg(test)] mod tests { @@ -523,7 +591,7 @@ mod tests { 192, 219, 180, 17, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 64, 36, 75, 238, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]; - let array_data = ArrayData::builder(DataType::Decimal(38, 6)) + let array_data = ArrayData::builder(DataType::Decimal128(38, 6)) .len(2) .add_buffer(Buffer::from(&values[..])) .build() @@ -541,7 +609,7 @@ mod tests { let mut result = decimal_builder.append_value(123456); let mut error = result.unwrap_err(); assert_eq!( - "Invalid 
argument error: 123456 is too large to store in a Decimal of precision 5. Max is 99999", + "Invalid argument error: 123456 is too large to store in a Decimal128 of precision 5. Max is 99999", error.to_string() ); @@ -558,7 +626,7 @@ mod tests { result = decimal_builder.append_value(100); error = result.unwrap_err(); assert_eq!( - "Invalid argument error: 100 is too large to store in a Decimal of precision 2. Max is 99", + "Invalid argument error: 100 is too large to store in a Decimal128 of precision 2. Max is 99", error.to_string() ); @@ -580,7 +648,7 @@ mod tests { fn test_decimal_from_iter_values() { let array = Decimal128Array::from_iter_values(vec![-100, 0, 101].into_iter()); assert_eq!(array.len(), 3); - assert_eq!(array.data_type(), &DataType::Decimal(38, 10)); + assert_eq!(array.data_type(), &DataType::Decimal128(38, 10)); assert_eq!(-100_i128, array.value(0).into()); assert!(!array.is_null(0)); assert_eq!(0_i128, array.value(1).into()); @@ -594,7 +662,7 @@ mod tests { let array: Decimal128Array = vec![Some(-100), None, Some(101)].into_iter().collect(); assert_eq!(array.len(), 3); - assert_eq!(array.data_type(), &DataType::Decimal(38, 10)); + assert_eq!(array.data_type(), &DataType::Decimal128(38, 10)); assert_eq!(-100_i128, array.value(0).into()); assert!(!array.is_null(0)); assert!(array.is_null(1)); @@ -665,7 +733,7 @@ mod tests { .with_precision_and_scale(20, 2) .unwrap(); - assert_eq!(arr.data_type(), &DataType::Decimal(20, 2)); + assert_eq!(arr.data_type(), &DataType::Decimal128(20, 2)); assert_eq!(arr.precision(), 20); assert_eq!(arr.scale(), 2); @@ -677,7 +745,7 @@ mod tests { #[test] #[should_panic( - expected = "-123223423432432 is too small to store in a Decimal of precision 5. Min is -99999" + expected = "-123223423432432 is too small to store in a Decimal128 of precision 5. 
Min is -99999" )] fn test_decimal_array_with_precision_and_scale_out_of_range() { Decimal128Array::from_iter_values([12345, 456, 7890, -123223423432432]) @@ -792,9 +860,64 @@ mod tests { assert_eq!(decimal.value_as_string(1), "56".to_string()); } + #[test] + #[allow(deprecated)] + #[should_panic(expected = "The child array cannot contain null values.")] + fn test_decimal_array_from_fixed_size_list_with_child_nulls_failed() { + let value_data = ArrayData::builder(DataType::UInt8) + .len(16) + .add_buffer(Buffer::from_slice_ref(&[12_i128])) + .null_bit_buffer(Some(Buffer::from_slice_ref(&[0b1010101010101010]))) + .build() + .unwrap(); + + // Construct a list array from the above two + let list_data_type = DataType::FixedSizeList( + Box::new(Field::new("item", DataType::UInt8, false)), + 16, + ); + let list_data = ArrayData::builder(list_data_type) + .len(1) + .add_child_data(value_data) + .build() + .unwrap(); + let list_array = FixedSizeListArray::from(list_data); + drop(Decimal128Array::from_fixed_size_list_array( + list_array, 38, 0, + )); + } + + #[test] + #[allow(deprecated)] + #[should_panic( + expected = "Value length of the array (8) must equal to the byte width of the decimal (16)" + )] + fn test_decimal_array_from_fixed_size_list_with_wrong_length() { + let value_data = ArrayData::builder(DataType::UInt8) + .len(16) + .add_buffer(Buffer::from_slice_ref(&[12_i128])) + .null_bit_buffer(Some(Buffer::from_slice_ref(&[0b1010101010101010]))) + .build() + .unwrap(); + + // Construct a list array from the above two + let list_data_type = DataType::FixedSizeList( + Box::new(Field::new("item", DataType::UInt8, false)), + 8, + ); + let list_data = ArrayData::builder(list_data_type) + .len(2) + .add_child_data(value_data) + .build() + .unwrap(); + let list_array = FixedSizeListArray::from(list_data); + drop(Decimal128Array::from_fixed_size_list_array( + list_array, 38, 0, + )); + } + #[test] fn test_decimal256_iter() { - // TODO: Impl FromIterator for Decimal256Array let mut builder = Decimal256Builder::new(30, 76, 6); let value = BigInt::from_str_radix("12345", 10).unwrap(); let decimal1 = Decimal256::from_big_int(&value, 76, 6).unwrap(); @@ -811,4 +934,38 @@ mod tests { let collected: Vec<_> = array.iter().collect(); assert_eq!(vec![Some(decimal1), None, Some(decimal2)], collected); } + + #[test] + fn test_from_iter_decimal256array() { + let value1 = BigInt::from_str_radix("12345", 10).unwrap(); + let value2 = BigInt::from_str_radix("56789", 10).unwrap(); + + let array: Decimal256Array = + vec![Some(value1.clone()), None, Some(value2.clone())] + .into_iter() + .collect(); + assert_eq!(array.len(), 3); + assert_eq!(array.data_type(), &DataType::Decimal256(76, 10)); + assert_eq!( + Decimal256::from_big_int( + &value1, + DECIMAL256_MAX_PRECISION, + DECIMAL_DEFAULT_SCALE + ) + .unwrap(), + array.value(0) + ); + assert!(!array.is_null(0)); + assert!(array.is_null(1)); + assert_eq!( + Decimal256::from_big_int( + &value2, + DECIMAL256_MAX_PRECISION, + DECIMAL_DEFAULT_SCALE + ) + .unwrap(), + array.value(2) + ); + assert!(!array.is_null(2)); + } } diff --git a/arrow/src/array/array_dictionary.rs b/arrow/src/array/array_dictionary.rs index 4f7d5f9c147b..2acb51750d17 100644 --- a/arrow/src/array/array_dictionary.rs +++ b/arrow/src/array/array_dictionary.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. 
+use crate::array::{ArrayAccessor, ArrayIter}; use std::any::Any; use std::fmt; use std::iter::IntoIterator; @@ -234,6 +235,28 @@ impl DictionaryArray { .expect("Dictionary index not usize") }) } + + /// Downcast this dictionary to a [`TypedDictionaryArray`] + /// + /// ``` + /// use arrow::array::{Array, ArrayAccessor, DictionaryArray, StringArray}; + /// use arrow::datatypes::Int32Type; + /// + /// let orig = [Some("a"), Some("b"), None]; + /// let dictionary = DictionaryArray::::from_iter(orig); + /// let typed = dictionary.downcast_dict::().unwrap(); + /// assert_eq!(typed.value(0), "a"); + /// assert_eq!(typed.value(1), "b"); + /// assert!(typed.is_null(2)); + /// ``` + /// + pub fn downcast_dict(&self) -> Option> { + let values = self.values.as_any().downcast_ref()?; + Some(TypedDictionaryArray { + dictionary: self, + values, + }) + } } /// Constructs a `DictionaryArray` from an array data reference. @@ -302,9 +325,7 @@ impl From> for ArrayData { /// format!("{:?}", array) /// ); /// ``` -impl<'a, T: ArrowPrimitiveType + ArrowDictionaryKeyType> FromIterator> - for DictionaryArray -{ +impl<'a, T: ArrowDictionaryKeyType> FromIterator> for DictionaryArray { fn from_iter>>(iter: I) -> Self { let it = iter.into_iter(); let (lower, _) = it.size_hint(); @@ -342,9 +363,7 @@ impl<'a, T: ArrowPrimitiveType + ArrowDictionaryKeyType> FromIterator FromIterator<&'a str> - for DictionaryArray -{ +impl<'a, T: ArrowDictionaryKeyType> FromIterator<&'a str> for DictionaryArray { fn from_iter>(iter: I) -> Self { let it = iter.into_iter(); let (lower, _) = it.size_hint(); @@ -385,6 +404,111 @@ impl fmt::Debug for DictionaryArray { } } +/// A strongly-typed wrapper around a [`DictionaryArray`] that implements [`ArrayAccessor`] +/// allowing fast access to its elements +/// +/// ``` +/// use arrow::array::{ArrayIter, DictionaryArray, StringArray}; +/// use arrow::datatypes::Int32Type; +/// +/// let orig = ["a", "b", "a", "b"]; +/// let dictionary = DictionaryArray::::from_iter(orig); +/// +/// // `TypedDictionaryArray` allows you to access the values directly +/// let typed = dictionary.downcast_dict::().unwrap(); +/// +/// for (maybe_val, orig) in typed.into_iter().zip(orig) { +/// assert_eq!(maybe_val.unwrap(), orig) +/// } +/// ``` +pub struct TypedDictionaryArray<'a, K: ArrowPrimitiveType, V> { + /// The dictionary array + dictionary: &'a DictionaryArray, + /// The values of the dictionary + values: &'a V, +} + +// Manually implement `Clone` to avoid `V: Clone` type constraint +impl<'a, K: ArrowPrimitiveType, V> Clone for TypedDictionaryArray<'a, K, V> { + fn clone(&self) -> Self { + Self { + dictionary: self.dictionary, + values: self.values, + } + } +} + +impl<'a, K: ArrowPrimitiveType, V> Copy for TypedDictionaryArray<'a, K, V> {} + +impl<'a, K: ArrowPrimitiveType, V> fmt::Debug for TypedDictionaryArray<'a, K, V> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + writeln!(f, "TypedDictionaryArray({:?})", self.dictionary) + } +} + +impl<'a, K: ArrowPrimitiveType, V> TypedDictionaryArray<'a, K, V> { + /// Returns the keys of this [`TypedDictionaryArray`] + pub fn keys(&self) -> &'a PrimitiveArray { + self.dictionary.keys() + } + + /// Returns the values of this [`TypedDictionaryArray`] + pub fn values(&self) -> &'a V { + self.values + } +} + +impl<'a, K: ArrowPrimitiveType, V: Sync> Array for TypedDictionaryArray<'a, K, V> { + fn as_any(&self) -> &dyn Any { + self.dictionary + } + + fn data(&self) -> &ArrayData { + &self.dictionary.data + } + + fn into_data(self) -> ArrayData { + 
self.dictionary.into_data() + } +} + +impl<'a, K, V> IntoIterator for TypedDictionaryArray<'a, K, V> +where + K: ArrowPrimitiveType, + V: Sync + Send, + &'a V: ArrayAccessor, +{ + type Item = Option<::Item>; + type IntoIter = ArrayIter; + + fn into_iter(self) -> Self::IntoIter { + ArrayIter::new(self) + } +} + +impl<'a, K, V> ArrayAccessor for TypedDictionaryArray<'a, K, V> +where + K: ArrowPrimitiveType, + V: Sync + Send, + &'a V: ArrayAccessor, +{ + type Item = <&'a V as ArrayAccessor>::Item; + + fn value(&self, index: usize) -> Self::Item { + assert!(self.dictionary.is_valid(index), "{}", index); + let value_idx = self.dictionary.keys.value(index).to_usize().unwrap(); + // Dictionary indexes should be valid + unsafe { self.values.value_unchecked(value_idx) } + } + + unsafe fn value_unchecked(&self, index: usize) -> Self::Item { + let val = self.dictionary.keys.value_unchecked(index); + let value_idx = val.to_usize().unwrap(); + // Dictionary indexes should be valid + self.values.value_unchecked(value_idx) + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/arrow/src/array/array_fixed_size_binary.rs b/arrow/src/array/array_fixed_size_binary.rs new file mode 100644 index 000000000000..a811917c727c --- /dev/null +++ b/arrow/src/array/array_fixed_size_binary.rs @@ -0,0 +1,648 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; +use std::convert::From; +use std::fmt; + +use super::{ + array::print_long_array, raw_pointer::RawPtrBox, Array, ArrayData, FixedSizeListArray, +}; +use crate::buffer::Buffer; +use crate::error::{ArrowError, Result}; +use crate::util::bit_util; +use crate::{buffer::MutableBuffer, datatypes::DataType}; + +/// An array where each element is a fixed-size sequence of bytes. +/// +/// # Examples +/// +/// Create an array from an iterable argument of byte slices. +/// +/// ``` +/// use arrow::array::{Array, FixedSizeBinaryArray}; +/// let input_arg = vec![ vec![1, 2], vec![3, 4], vec![5, 6] ]; +/// let arr = FixedSizeBinaryArray::try_from_iter(input_arg.into_iter()).unwrap(); +/// +/// assert_eq!(3, arr.len()); +/// +/// ``` +/// Create an array from an iterable argument of sparse byte slices. +/// Sparsity means that the input argument can contain `None` items. +/// ``` +/// use arrow::array::{Array, FixedSizeBinaryArray}; +/// let input_arg = vec![ None, Some(vec![7, 8]), Some(vec![9, 10]), None, Some(vec![13, 14]) ]; +/// let arr = FixedSizeBinaryArray::try_from_sparse_iter(input_arg.into_iter()).unwrap(); +/// assert_eq!(5, arr.len()) +/// +/// ``` +/// +pub struct FixedSizeBinaryArray { + data: ArrayData, + value_data: RawPtrBox, + length: i32, +} + +impl FixedSizeBinaryArray { + /// Returns the element at index `i` as a byte slice. 
+    pub fn value(&self, i: usize) -> &[u8] {
+        assert!(
+            i < self.data.len(),
+            "FixedSizeBinaryArray out of bounds access"
+        );
+        let offset = i + self.data.offset();
+        unsafe {
+            let pos = self.value_offset_at(offset);
+            std::slice::from_raw_parts(
+                self.value_data.as_ptr().offset(pos as isize),
+                (self.value_offset_at(offset + 1) - pos) as usize,
+            )
+        }
+    }
+
+    /// Returns the element at index `i` as a byte slice.
+    /// # Safety
+    /// Caller is responsible for ensuring that the index is within the bounds of the array
+    pub unsafe fn value_unchecked(&self, i: usize) -> &[u8] {
+        let offset = i + self.data.offset();
+        let pos = self.value_offset_at(offset);
+        std::slice::from_raw_parts(
+            self.value_data.as_ptr().offset(pos as isize),
+            (self.value_offset_at(offset + 1) - pos) as usize,
+        )
+    }
+
+    /// Returns the offset for the element at index `i`.
+    ///
+    /// Note this doesn't do any bound checking, for performance reasons.
+    #[inline]
+    pub fn value_offset(&self, i: usize) -> i32 {
+        self.value_offset_at(self.data.offset() + i)
+    }
+
+    /// Returns the length for an element.
+    ///
+    /// All elements have the same length as the array is a fixed size.
+    #[inline]
+    pub fn value_length(&self) -> i32 {
+        self.length
+    }
+
+    /// Returns a clone of the value data buffer
+    pub fn value_data(&self) -> Buffer {
+        self.data.buffers()[0].clone()
+    }
+
+    /// Create an array from an iterable argument of sparse byte slices.
+    /// Sparsity means that items returned by the iterator are optional, i.e. the input
+    /// argument can contain `None` items.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use arrow::array::FixedSizeBinaryArray;
+    /// let input_arg = vec![
+    ///     None,
+    ///     Some(vec![7, 8]),
+    ///     Some(vec![9, 10]),
+    ///     None,
+    ///     Some(vec![13, 14]),
+    ///     None,
+    /// ];
+    /// let array = FixedSizeBinaryArray::try_from_sparse_iter(input_arg.into_iter()).unwrap();
+    /// ```
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the argument has length zero, or if the sizes of nested slices don't match.
+    pub fn try_from_sparse_iter<T, U>(mut iter: T) -> Result<Self>
+    where
+        T: Iterator<Item = Option<U>>,
+        U: AsRef<[u8]>,
+    {
+        let mut len = 0;
+        let mut size = None;
+        let mut byte = 0;
+        let mut null_buf = MutableBuffer::from_len_zeroed(0);
+        let mut buffer = MutableBuffer::from_len_zeroed(0);
+        let mut prepend = 0;
+        iter.try_for_each(|item| -> Result<()> {
+            // extend the null bitmask by one byte for every 8 items
+            if byte == 0 {
+                null_buf.push(0u8);
+                byte = 8;
+            }
+            byte -= 1;
+
+            if let Some(slice) = item {
+                let slice = slice.as_ref();
+                if let Some(size) = size {
+                    if size != slice.len() {
+                        return Err(ArrowError::InvalidArgumentError(format!(
+                            "Nested array size mismatch: one is {}, and the other is {}",
+                            size,
+                            slice.len()
+                        )));
+                    }
+                } else {
+                    size = Some(slice.len());
+                    buffer.extend_zeros(slice.len() * prepend);
+                }
+                bit_util::set_bit(null_buf.as_slice_mut(), len);
+                buffer.extend_from_slice(slice);
+            } else if let Some(size) = size {
+                buffer.extend_zeros(size);
+            } else {
+                prepend += 1;
+            }
+
+            len += 1;
+
+            Ok(())
+        })?;
+
+        if len == 0 {
+            return Err(ArrowError::InvalidArgumentError(
+                "Input iterable argument has no data".to_owned(),
+            ));
+        }
+
+        let size = size.unwrap_or(0);
+        let array_data = unsafe {
+            ArrayData::new_unchecked(
+                DataType::FixedSizeBinary(size as i32),
+                len,
+                None,
+                Some(null_buf.into()),
+                0,
+                vec![buffer.into()],
+                vec![],
+            )
+        };
+        Ok(FixedSizeBinaryArray::from(array_data))
+    }
+
+    /// Create an array from an iterable argument of byte slices.
+ /// + /// # Examples + /// + /// ``` + /// use arrow::array::FixedSizeBinaryArray; + /// let input_arg = vec![ + /// vec![1, 2], + /// vec![3, 4], + /// vec![5, 6], + /// ]; + /// let array = FixedSizeBinaryArray::try_from_iter(input_arg.into_iter()).unwrap(); + /// ``` + /// + /// # Errors + /// + /// Returns error if argument has length zero, or sizes of nested slices don't match. + pub fn try_from_iter(mut iter: T) -> Result + where + T: Iterator, + U: AsRef<[u8]>, + { + let mut len = 0; + let mut size = None; + let mut buffer = MutableBuffer::from_len_zeroed(0); + iter.try_for_each(|item| -> Result<()> { + let slice = item.as_ref(); + if let Some(size) = size { + if size != slice.len() { + return Err(ArrowError::InvalidArgumentError(format!( + "Nested array size mismatch: one is {}, and the other is {}", + size, + slice.len() + ))); + } + } else { + size = Some(slice.len()); + } + buffer.extend_from_slice(slice); + + len += 1; + + Ok(()) + })?; + + if len == 0 { + return Err(ArrowError::InvalidArgumentError( + "Input iterable argument has no data".to_owned(), + )); + } + + let size = size.unwrap_or(0); + let array_data = ArrayData::builder(DataType::FixedSizeBinary(size as i32)) + .len(len) + .add_buffer(buffer.into()); + let array_data = unsafe { array_data.build_unchecked() }; + Ok(FixedSizeBinaryArray::from(array_data)) + } + + #[inline] + fn value_offset_at(&self, i: usize) -> i32 { + self.length * i as i32 + } +} + +impl From for FixedSizeBinaryArray { + fn from(data: ArrayData) -> Self { + assert_eq!( + data.buffers().len(), + 1, + "FixedSizeBinaryArray data should contain 1 buffer only (values)" + ); + let value_data = data.buffers()[0].as_ptr(); + let length = match data.data_type() { + DataType::FixedSizeBinary(len) => *len, + _ => panic!("Expected data type to be FixedSizeBinary"), + }; + Self { + data, + value_data: unsafe { RawPtrBox::new(value_data) }, + length, + } + } +} + +impl From for ArrayData { + fn from(array: FixedSizeBinaryArray) -> Self { + array.data + } +} + +/// Creates a `FixedSizeBinaryArray` from `FixedSizeList` array +impl From for FixedSizeBinaryArray { + fn from(v: FixedSizeListArray) -> Self { + assert_eq!( + v.data_ref().child_data().len(), + 1, + "FixedSizeBinaryArray can only be created from list array of u8 values \ + (i.e. FixedSizeList>)." + ); + let child_data = &v.data_ref().child_data()[0]; + + assert_eq!( + child_data.child_data().len(), + 0, + "FixedSizeBinaryArray can only be created from list array of u8 values \ + (i.e. FixedSizeList>)." + ); + assert_eq!( + child_data.data_type(), + &DataType::UInt8, + "FixedSizeBinaryArray can only be created from FixedSizeList arrays, mismatched data types." + ); + assert_eq!( + child_data.null_count(), + 0, + "The child array cannot contain null values." 
+ ); + + let builder = ArrayData::builder(DataType::FixedSizeBinary(v.value_length())) + .len(v.len()) + .offset(v.offset()) + .add_buffer(child_data.buffers()[0].slice(child_data.offset())) + .null_bit_buffer(v.data_ref().null_buffer().cloned()); + + let data = unsafe { builder.build_unchecked() }; + Self::from(data) + } +} + +impl From>> for FixedSizeBinaryArray { + fn from(v: Vec>) -> Self { + Self::try_from_sparse_iter(v.into_iter()).unwrap() + } +} + +impl From> for FixedSizeBinaryArray { + fn from(v: Vec<&[u8]>) -> Self { + Self::try_from_iter(v.into_iter()).unwrap() + } +} + +impl fmt::Debug for FixedSizeBinaryArray { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "FixedSizeBinaryArray<{}>\n[\n", self.value_length())?; + print_long_array(self, f, |array, index, f| { + fmt::Debug::fmt(&array.value(index), f) + })?; + write!(f, "]") + } +} + +impl Array for FixedSizeBinaryArray { + fn as_any(&self) -> &dyn Any { + self + } + + fn data(&self) -> &ArrayData { + &self.data + } + + fn into_data(self) -> ArrayData { + self.into() + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use crate::{ + datatypes::{Field, Schema}, + record_batch::RecordBatch, + }; + + use super::*; + + #[test] + fn test_fixed_size_binary_array() { + let values: [u8; 15] = *b"hellotherearrow"; + + let array_data = ArrayData::builder(DataType::FixedSizeBinary(5)) + .len(3) + .add_buffer(Buffer::from(&values[..])) + .build() + .unwrap(); + let fixed_size_binary_array = FixedSizeBinaryArray::from(array_data); + assert_eq!(3, fixed_size_binary_array.len()); + assert_eq!(0, fixed_size_binary_array.null_count()); + assert_eq!( + [b'h', b'e', b'l', b'l', b'o'], + fixed_size_binary_array.value(0) + ); + assert_eq!( + [b't', b'h', b'e', b'r', b'e'], + fixed_size_binary_array.value(1) + ); + assert_eq!( + [b'a', b'r', b'r', b'o', b'w'], + fixed_size_binary_array.value(2) + ); + assert_eq!(5, fixed_size_binary_array.value_length()); + assert_eq!(10, fixed_size_binary_array.value_offset(2)); + for i in 0..3 { + assert!(fixed_size_binary_array.is_valid(i)); + assert!(!fixed_size_binary_array.is_null(i)); + } + + // Test binary array with offset + let array_data = ArrayData::builder(DataType::FixedSizeBinary(5)) + .len(2) + .offset(1) + .add_buffer(Buffer::from(&values[..])) + .build() + .unwrap(); + let fixed_size_binary_array = FixedSizeBinaryArray::from(array_data); + assert_eq!( + [b't', b'h', b'e', b'r', b'e'], + fixed_size_binary_array.value(0) + ); + assert_eq!( + [b'a', b'r', b'r', b'o', b'w'], + fixed_size_binary_array.value(1) + ); + assert_eq!(2, fixed_size_binary_array.len()); + assert_eq!(5, fixed_size_binary_array.value_offset(0)); + assert_eq!(5, fixed_size_binary_array.value_length()); + assert_eq!(10, fixed_size_binary_array.value_offset(1)); + } + + #[test] + fn test_fixed_size_binary_array_from_fixed_size_list_array() { + let values = [0_u8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]; + let values_data = ArrayData::builder(DataType::UInt8) + .len(12) + .offset(2) + .add_buffer(Buffer::from_slice_ref(&values)) + .build() + .unwrap(); + // [null, [10, 11, 12, 13]] + let array_data = unsafe { + ArrayData::builder(DataType::FixedSizeList( + Box::new(Field::new("item", DataType::UInt8, false)), + 4, + )) + .len(2) + .offset(1) + .add_child_data(values_data) + .null_bit_buffer(Some(Buffer::from_slice_ref(&[0b101]))) + .build_unchecked() + }; + let list_array = FixedSizeListArray::from(array_data); + let binary_array = FixedSizeBinaryArray::from(list_array); + + assert_eq!(2, 
binary_array.len()); + assert_eq!(1, binary_array.null_count()); + assert!(binary_array.is_null(0)); + assert!(binary_array.is_valid(1)); + assert_eq!(&[10, 11, 12, 13], binary_array.value(1)); + } + + #[test] + #[should_panic( + expected = "FixedSizeBinaryArray can only be created from FixedSizeList arrays" + )] + // Different error messages, so skip for now + // https://github.com/apache/arrow-rs/issues/1545 + #[cfg(not(feature = "force_validate"))] + fn test_fixed_size_binary_array_from_incorrect_fixed_size_list_array() { + let values: [u32; 12] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]; + let values_data = ArrayData::builder(DataType::UInt32) + .len(12) + .add_buffer(Buffer::from_slice_ref(&values)) + .build() + .unwrap(); + + let array_data = unsafe { + ArrayData::builder(DataType::FixedSizeList( + Box::new(Field::new("item", DataType::Binary, false)), + 4, + )) + .len(3) + .add_child_data(values_data) + .build_unchecked() + }; + let list_array = FixedSizeListArray::from(array_data); + drop(FixedSizeBinaryArray::from(list_array)); + } + + #[test] + #[should_panic(expected = "The child array cannot contain null values.")] + fn test_fixed_size_binary_array_from_fixed_size_list_array_with_child_nulls_failed() { + let values = [0_u8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]; + let values_data = ArrayData::builder(DataType::UInt8) + .len(12) + .add_buffer(Buffer::from_slice_ref(&values)) + .null_bit_buffer(Some(Buffer::from_slice_ref(&[0b101010101010]))) + .build() + .unwrap(); + + let array_data = unsafe { + ArrayData::builder(DataType::FixedSizeList( + Box::new(Field::new("item", DataType::UInt8, false)), + 4, + )) + .len(3) + .add_child_data(values_data) + .build_unchecked() + }; + let list_array = FixedSizeListArray::from(array_data); + drop(FixedSizeBinaryArray::from(list_array)); + } + + #[test] + fn test_fixed_size_binary_array_fmt_debug() { + let values: [u8; 15] = *b"hellotherearrow"; + + let array_data = ArrayData::builder(DataType::FixedSizeBinary(5)) + .len(3) + .add_buffer(Buffer::from(&values[..])) + .build() + .unwrap(); + let arr = FixedSizeBinaryArray::from(array_data); + assert_eq!( + "FixedSizeBinaryArray<5>\n[\n [104, 101, 108, 108, 111],\n [116, 104, 101, 114, 101],\n [97, 114, 114, 111, 119],\n]", + format!("{:?}", arr) + ); + } + + #[test] + fn test_fixed_size_binary_array_from_iter() { + let input_arg = vec![vec![1, 2], vec![3, 4], vec![5, 6]]; + let arr = FixedSizeBinaryArray::try_from_iter(input_arg.into_iter()).unwrap(); + + assert_eq!(2, arr.value_length()); + assert_eq!(3, arr.len()) + } + + #[test] + fn test_all_none_fixed_size_binary_array_from_sparse_iter() { + let none_option: Option<[u8; 32]> = None; + let input_arg = vec![none_option, none_option, none_option]; + let arr = + FixedSizeBinaryArray::try_from_sparse_iter(input_arg.into_iter()).unwrap(); + assert_eq!(0, arr.value_length()); + assert_eq!(3, arr.len()) + } + + #[test] + fn test_fixed_size_binary_array_from_sparse_iter() { + let input_arg = vec![ + None, + Some(vec![7, 8]), + Some(vec![9, 10]), + None, + Some(vec![13, 14]), + ]; + let arr = + FixedSizeBinaryArray::try_from_sparse_iter(input_arg.into_iter()).unwrap(); + assert_eq!(2, arr.value_length()); + assert_eq!(5, arr.len()) + } + + #[test] + fn test_fixed_size_binary_array_from_vec() { + let values = vec!["one".as_bytes(), b"two", b"six", b"ten"]; + let array = FixedSizeBinaryArray::from(values); + assert_eq!(array.len(), 4); + assert_eq!(array.null_count(), 0); + assert_eq!(array.value(0), b"one"); + assert_eq!(array.value(1), b"two"); + 
assert_eq!(array.value(2), b"six"); + assert_eq!(array.value(3), b"ten"); + assert!(!array.is_null(0)); + assert!(!array.is_null(1)); + assert!(!array.is_null(2)); + assert!(!array.is_null(3)); + } + + #[test] + #[should_panic(expected = "Nested array size mismatch: one is 3, and the other is 5")] + fn test_fixed_size_binary_array_from_vec_incorrect_length() { + let values = vec!["one".as_bytes(), b"two", b"three", b"four"]; + let _ = FixedSizeBinaryArray::from(values); + } + + #[test] + fn test_fixed_size_binary_array_from_opt_vec() { + let values = vec![ + Some("one".as_bytes()), + Some(b"two"), + None, + Some(b"six"), + Some(b"ten"), + ]; + let array = FixedSizeBinaryArray::from(values); + assert_eq!(array.len(), 5); + assert_eq!(array.value(0), b"one"); + assert_eq!(array.value(1), b"two"); + assert_eq!(array.value(3), b"six"); + assert_eq!(array.value(4), b"ten"); + assert!(!array.is_null(0)); + assert!(!array.is_null(1)); + assert!(array.is_null(2)); + assert!(!array.is_null(3)); + assert!(!array.is_null(4)); + } + + #[test] + #[should_panic(expected = "Nested array size mismatch: one is 3, and the other is 5")] + fn test_fixed_size_binary_array_from_opt_vec_incorrect_length() { + let values = vec![ + Some("one".as_bytes()), + Some(b"two"), + None, + Some(b"three"), + Some(b"four"), + ]; + let _ = FixedSizeBinaryArray::from(values); + } + + #[test] + fn fixed_size_binary_array_all_null() { + let data = vec![None] as Vec>; + let array = FixedSizeBinaryArray::try_from_sparse_iter(data.into_iter()).unwrap(); + array + .data() + .validate_full() + .expect("All null array has valid array data"); + } + + #[test] + // Test for https://github.com/apache/arrow-rs/issues/1390 + #[should_panic( + expected = "column types must match schema types, expected FixedSizeBinary(2) but found FixedSizeBinary(0) at column index 0" + )] + fn fixed_size_binary_array_all_null_in_batch_with_schema() { + let schema = + Schema::new(vec![Field::new("a", DataType::FixedSizeBinary(2), true)]); + + let none_option: Option<[u8; 2]> = None; + let item = FixedSizeBinaryArray::try_from_sparse_iter( + vec![none_option, none_option, none_option].into_iter(), + ) + .unwrap(); + + // Should not panic + RecordBatch::try_new(Arc::new(schema), vec![Arc::new(item)]).unwrap(); + } +} diff --git a/arrow/src/array/array_fixed_size_list.rs b/arrow/src/array/array_fixed_size_list.rs new file mode 100644 index 000000000000..fc568d54a831 --- /dev/null +++ b/arrow/src/array/array_fixed_size_list.rs @@ -0,0 +1,388 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
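A short usage sketch of try_from_sparse_iter from array_fixed_size_binary.rs above (paths as in this PR): the element width is inferred from the first Some item, and None entries become nulls backed by zeroed bytes.

use arrow::array::{Array, FixedSizeBinaryArray};

fn main() {
    let input = vec![None, Some(vec![7u8, 8]), Some(vec![9, 10])];
    let arr = FixedSizeBinaryArray::try_from_sparse_iter(input.into_iter()).unwrap();

    assert_eq!(arr.len(), 3);
    assert_eq!(arr.value_length(), 2); // width inferred from the first Some(..)
    assert!(arr.is_null(0));
    assert_eq!(arr.value(2), &[9, 10]);

    // Mismatched widths are rejected rather than silently padded.
    let bad = vec![Some(vec![1u8, 2]), Some(vec![3u8, 4, 5])];
    assert!(FixedSizeBinaryArray::try_from_sparse_iter(bad.into_iter()).is_err());
}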
+
+use std::any::Any;
+use std::fmt;
+
+use super::{array::print_long_array, make_array, Array, ArrayData, ArrayRef};
+use crate::array::array::ArrayAccessor;
+use crate::datatypes::DataType;
+
+/// A list array where each element is a fixed-size sequence of values with the same
+/// type whose maximum length is represented by an i32.
+///
+/// # Example
+///
+/// ```
+/// # use arrow::array::{Array, ArrayData, FixedSizeListArray, Int32Array};
+/// # use arrow::datatypes::{DataType, Field};
+/// # use arrow::buffer::Buffer;
+/// // Construct a value array
+/// let value_data = ArrayData::builder(DataType::Int32)
+///     .len(9)
+///     .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7, 8]))
+///     .build()
+///     .unwrap();
+/// let list_data_type = DataType::FixedSizeList(
+///     Box::new(Field::new("item", DataType::Int32, false)),
+///     3,
+/// );
+/// let list_data = ArrayData::builder(list_data_type.clone())
+///     .len(3)
+///     .add_child_data(value_data.clone())
+///     .build()
+///     .unwrap();
+/// let list_array = FixedSizeListArray::from(list_data);
+/// let list0 = list_array.value(0);
+/// let list1 = list_array.value(1);
+/// let list2 = list_array.value(2);
+///
+/// assert_eq!( &[0, 1, 2], list0.as_any().downcast_ref::<Int32Array>().unwrap().values());
+/// assert_eq!( &[3, 4, 5], list1.as_any().downcast_ref::<Int32Array>().unwrap().values());
+/// assert_eq!( &[6, 7, 8], list2.as_any().downcast_ref::<Int32Array>().unwrap().values());
+/// ```
+///
+/// For non-generic lists, you may wish to consider using
+/// [crate::array::FixedSizeBinaryArray]
+pub struct FixedSizeListArray {
+    data: ArrayData,
+    values: ArrayRef,
+    length: i32,
+}
+
+impl FixedSizeListArray {
+    /// Returns a reference to the values of this list.
+    pub fn values(&self) -> ArrayRef {
+        self.values.clone()
+    }
+
+    /// Returns a clone of the value type of this list.
+    pub fn value_type(&self) -> DataType {
+        self.values.data_ref().data_type().clone()
+    }
+
+    /// Returns the ith value of this list array.
+    pub fn value(&self, i: usize) -> ArrayRef {
+        self.values
+            .slice(self.value_offset(i) as usize, self.value_length() as usize)
+    }
+
+    /// Returns the offset for the value at index `i`.
+    ///
+    /// Note this doesn't do any bound checking, for performance reasons.
+    #[inline]
+    pub fn value_offset(&self, i: usize) -> i32 {
+        self.value_offset_at(self.data.offset() + i)
+    }
+
+    /// Returns the length for an element.
+    ///
+    /// All elements have the same length as the array is a fixed size.
+ #[inline] + pub const fn value_length(&self) -> i32 { + self.length + } + + #[inline] + const fn value_offset_at(&self, i: usize) -> i32 { + i as i32 * self.length + } +} + +impl From for FixedSizeListArray { + fn from(data: ArrayData) -> Self { + assert_eq!( + data.buffers().len(), + 0, + "FixedSizeListArray data should not contain a buffer for value offsets" + ); + assert_eq!( + data.child_data().len(), + 1, + "FixedSizeListArray should contain a single child array (values array)" + ); + let values = make_array(data.child_data()[0].clone()); + let length = match data.data_type() { + DataType::FixedSizeList(_, len) => { + if *len > 0 { + // check that child data is multiple of length + assert_eq!( + values.len() % *len as usize, + 0, + "FixedSizeListArray child array length should be a multiple of {}", + len + ); + } + + *len + } + _ => { + panic!("FixedSizeListArray data should contain a FixedSizeList data type") + } + }; + Self { + data, + values, + length, + } + } +} + +impl From for ArrayData { + fn from(array: FixedSizeListArray) -> Self { + array.data + } +} + +impl Array for FixedSizeListArray { + fn as_any(&self) -> &dyn Any { + self + } + + fn data(&self) -> &ArrayData { + &self.data + } + + fn into_data(self) -> ArrayData { + self.into() + } +} + +impl ArrayAccessor for FixedSizeListArray { + type Item = ArrayRef; + + fn value(&self, index: usize) -> Self::Item { + FixedSizeListArray::value(self, index) + } + + unsafe fn value_unchecked(&self, index: usize) -> Self::Item { + FixedSizeListArray::value(self, index) + } +} + +impl fmt::Debug for FixedSizeListArray { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "FixedSizeListArray<{}>\n[\n", self.value_length())?; + print_long_array(self, f, |array, index, f| { + fmt::Debug::fmt(&array.value(index), f) + })?; + write!(f, "]") + } +} + +#[cfg(test)] +mod tests { + use crate::{ + array::ArrayData, array::Int32Array, buffer::Buffer, datatypes::Field, + util::bit_util, + }; + + use super::*; + + #[test] + fn test_fixed_size_list_array() { + // Construct a value array + let value_data = ArrayData::builder(DataType::Int32) + .len(9) + .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7, 8])) + .build() + .unwrap(); + + // Construct a list array from the above two + let list_data_type = DataType::FixedSizeList( + Box::new(Field::new("item", DataType::Int32, false)), + 3, + ); + let list_data = ArrayData::builder(list_data_type.clone()) + .len(3) + .add_child_data(value_data.clone()) + .build() + .unwrap(); + let list_array = FixedSizeListArray::from(list_data); + + let values = list_array.values(); + assert_eq!(&value_data, values.data()); + assert_eq!(DataType::Int32, list_array.value_type()); + assert_eq!(3, list_array.len()); + assert_eq!(0, list_array.null_count()); + assert_eq!(6, list_array.value_offset(2)); + assert_eq!(3, list_array.value_length()); + assert_eq!( + 0, + list_array + .value(0) + .as_any() + .downcast_ref::() + .unwrap() + .value(0) + ); + for i in 0..3 { + assert!(list_array.is_valid(i)); + assert!(!list_array.is_null(i)); + } + + // Now test with a non-zero offset + let list_data = ArrayData::builder(list_data_type) + .len(3) + .offset(1) + .add_child_data(value_data.clone()) + .build() + .unwrap(); + let list_array = FixedSizeListArray::from(list_data); + + let values = list_array.values(); + assert_eq!(&value_data, values.data()); + assert_eq!(DataType::Int32, list_array.value_type()); + assert_eq!(3, list_array.len()); + assert_eq!(0, list_array.null_count()); + assert_eq!( + 3, 
+            list_array
+                .value(0)
+                .as_any()
+                .downcast_ref::<Int32Array>()
+                .unwrap()
+                .value(0)
+        );
+        assert_eq!(6, list_array.value_offset(1));
+        assert_eq!(3, list_array.value_length());
+    }
+
+    #[test]
+    #[should_panic(
+        expected = "FixedSizeListArray child array length should be a multiple of 3"
+    )]
+    // Different error messages, so skip for now
+    // https://github.com/apache/arrow-rs/issues/1545
+    #[cfg(not(feature = "force_validate"))]
+    fn test_fixed_size_list_array_unequal_children() {
+        // Construct a value array
+        let value_data = ArrayData::builder(DataType::Int32)
+            .len(8)
+            .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7]))
+            .build()
+            .unwrap();
+
+        // Construct a list array from the above two
+        let list_data_type = DataType::FixedSizeList(
+            Box::new(Field::new("item", DataType::Int32, false)),
+            3,
+        );
+        let list_data = unsafe {
+            ArrayData::builder(list_data_type)
+                .len(3)
+                .add_child_data(value_data)
+                .build_unchecked()
+        };
+        drop(FixedSizeListArray::from(list_data));
+    }
+
+    #[test]
+    fn test_fixed_size_list_array_slice() {
+        // Construct a value array
+        let value_data = ArrayData::builder(DataType::Int32)
+            .len(10)
+            .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))
+            .build()
+            .unwrap();
+
+        // Set null bits for the nested array:
+        // [[0, 1], null, null, [6, 7], [8, 9]]
+        // 01011001 00000001
+        let mut null_bits: [u8; 1] = [0; 1];
+        bit_util::set_bit(&mut null_bits, 0);
+        bit_util::set_bit(&mut null_bits, 3);
+        bit_util::set_bit(&mut null_bits, 4);
+
+        // Construct a fixed size list array from the above two
+        let list_data_type = DataType::FixedSizeList(
+            Box::new(Field::new("item", DataType::Int32, false)),
+            2,
+        );
+        let list_data = ArrayData::builder(list_data_type)
+            .len(5)
+            .add_child_data(value_data.clone())
+            .null_bit_buffer(Some(Buffer::from(null_bits)))
+            .build()
+            .unwrap();
+        let list_array = FixedSizeListArray::from(list_data);
+
+        let values = list_array.values();
+        assert_eq!(&value_data, values.data());
+        assert_eq!(DataType::Int32, list_array.value_type());
+        assert_eq!(5, list_array.len());
+        assert_eq!(2, list_array.null_count());
+        assert_eq!(6, list_array.value_offset(3));
+        assert_eq!(2, list_array.value_length());
+
+        let sliced_array = list_array.slice(1, 4);
+        assert_eq!(4, sliced_array.len());
+        assert_eq!(1, sliced_array.offset());
+        assert_eq!(2, sliced_array.null_count());
+
+        for i in 0..sliced_array.len() {
+            if bit_util::get_bit(&null_bits, sliced_array.offset() + i) {
+                assert!(sliced_array.is_valid(i));
+            } else {
+                assert!(sliced_array.is_null(i));
+            }
+        }
+
+        // Check offset and length for each non-null value.
+        let sliced_list_array = sliced_array
+            .as_any()
+            .downcast_ref::<FixedSizeListArray>()
+            .unwrap();
+        assert_eq!(2, sliced_list_array.value_length());
+        assert_eq!(6, sliced_list_array.value_offset(2));
+        assert_eq!(8, sliced_list_array.value_offset(3));
+    }
+
+    #[test]
+    #[should_panic(expected = "assertion failed: (offset + length) <= self.len()")]
+    fn test_fixed_size_list_array_index_out_of_bound() {
+        // Construct a value array
+        let value_data = ArrayData::builder(DataType::Int32)
+            .len(10)
+            .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))
+            .build()
+            .unwrap();
+
+        // Set null bits for the nested array:
+        // [[0, 1], null, null, [6, 7], [8, 9]]
+        // 01011001 00000001
+        let mut null_bits: [u8; 1] = [0; 1];
+        bit_util::set_bit(&mut null_bits, 0);
+        bit_util::set_bit(&mut null_bits, 3);
+        bit_util::set_bit(&mut null_bits, 4);
+
+        // Construct a fixed size list array from the above two
+        let list_data_type = DataType::FixedSizeList(
+            Box::new(Field::new("item", DataType::Int32, false)),
+            2,
+        );
+        let list_data = ArrayData::builder(list_data_type)
+            .len(5)
+            .add_child_data(value_data)
+            .null_bit_buffer(Some(Buffer::from(null_bits)))
+            .build()
+            .unwrap();
+        let list_array = FixedSizeListArray::from(list_data);
+
+        list_array.value(10);
+    }
+}
diff --git a/arrow/src/array/array_list.rs b/arrow/src/array/array_list.rs
index 22aa81ba783c..b9c05014c3f7 100644
--- a/arrow/src/array/array_list.rs
+++ b/arrow/src/array/array_list.rs
@@ -34,14 +34,17 @@ use crate::{
 /// trait declaring an offset size, relevant for i32 vs i64 array types.
 pub trait OffsetSizeTrait: ArrowNativeType + std::ops::AddAssign + Integer {
     const IS_LARGE: bool;
+    const PREFIX: &'static str;
 }
 
 impl OffsetSizeTrait for i32 {
     const IS_LARGE: bool = false;
+    const PREFIX: &'static str = "";
 }
 
 impl OffsetSizeTrait for i64 {
     const IS_LARGE: bool = true;
+    const PREFIX: &'static str = "Large";
 }
 
 /// Generic struct for a variable-size list array.
@@ -57,6 +60,16 @@ pub struct GenericListArray<OffsetSize> {
 }
 
 impl<OffsetSize: OffsetSizeTrait> GenericListArray<OffsetSize> {
+    /// The data type constructor of a list array.
+    /// The input is the schema of the child array and
+    /// the output is the [`DataType`], List or LargeList.
+    pub const DATA_TYPE_CONSTRUCTOR: fn(Box<Field>) -> DataType = if OffsetSize::IS_LARGE
+    {
+        DataType::LargeList
+    } else {
+        DataType::List
+    };
+
     /// Returns a reference to the values of this list.
     pub fn values(&self) -> ArrayRef {
         self.values.clone()
@@ -170,11 +183,7 @@ impl<OffsetSize: OffsetSizeTrait> GenericListArray<OffsetSize> {
             .collect();
 
         let field = Box::new(Field::new("item", T::DATA_TYPE, true));
-        let data_type = if OffsetSize::IS_LARGE {
-            DataType::LargeList(field)
-        } else {
-            DataType::List(field)
-        };
+        let data_type = Self::DATA_TYPE_CONSTRUCTOR(field);
         let array_data = ArrayData::builder(data_type)
             .len(null_buf.len())
             .add_buffer(offsets.into())
@@ -274,7 +283,7 @@ impl<'a, OffsetSize: OffsetSizeTrait> ArrayAccessor for &'a GenericListArray<OffsetSize> {
 
 impl<OffsetSize: OffsetSizeTrait> fmt::Debug for GenericListArray<OffsetSize> {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        let prefix = if OffsetSize::IS_LARGE { "Large" } else { "" };
+        let prefix = OffsetSize::PREFIX;
 
         write!(f, "{}ListArray\n[\n", prefix)?;
         print_long_array(self, f, |array, index, f| {
@@ -339,168 +348,6 @@ pub type ListArray = GenericListArray<i32>;
 /// ```
 pub type LargeListArray = GenericListArray<i64>;
 
-/// A list array where each element is a fixed-size sequence of values with the same
-/// type whose maximum length is represented by a i32.
-/// -/// # Example -/// -/// ``` -/// # use arrow::array::{Array, ArrayData, FixedSizeListArray, Int32Array}; -/// # use arrow::datatypes::{DataType, Field}; -/// # use arrow::buffer::Buffer; -/// // Construct a value array -/// let value_data = ArrayData::builder(DataType::Int32) -/// .len(9) -/// .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7, 8])) -/// .build() -/// .unwrap(); -/// let list_data_type = DataType::FixedSizeList( -/// Box::new(Field::new("item", DataType::Int32, false)), -/// 3, -/// ); -/// let list_data = ArrayData::builder(list_data_type.clone()) -/// .len(3) -/// .add_child_data(value_data.clone()) -/// .build() -/// .unwrap(); -/// let list_array = FixedSizeListArray::from(list_data); -/// let list0 = list_array.value(0); -/// let list1 = list_array.value(1); -/// let list2 = list_array.value(2); -/// -/// assert_eq!( &[0, 1, 2], list0.as_any().downcast_ref::().unwrap().values()); -/// assert_eq!( &[3, 4, 5], list1.as_any().downcast_ref::().unwrap().values()); -/// assert_eq!( &[6, 7, 8], list2.as_any().downcast_ref::().unwrap().values()); -/// ``` -/// -/// For non generic lists, you may wish to consider using -/// [crate::array::FixedSizeBinaryArray] -pub struct FixedSizeListArray { - data: ArrayData, - values: ArrayRef, - length: i32, -} - -impl FixedSizeListArray { - /// Returns a reference to the values of this list. - pub fn values(&self) -> ArrayRef { - self.values.clone() - } - - /// Returns a clone of the value type of this list. - pub fn value_type(&self) -> DataType { - self.values.data_ref().data_type().clone() - } - - /// Returns ith value of this list array. - pub fn value(&self, i: usize) -> ArrayRef { - self.values - .slice(self.value_offset(i) as usize, self.value_length() as usize) - } - - /// Returns the offset for value at index `i`. - /// - /// Note this doesn't do any bound checking, for performance reason. - #[inline] - pub fn value_offset(&self, i: usize) -> i32 { - self.value_offset_at(self.data.offset() + i) - } - - /// Returns the length for an element. - /// - /// All elements have the same length as the array is a fixed size. 
- #[inline] - pub const fn value_length(&self) -> i32 { - self.length - } - - #[inline] - const fn value_offset_at(&self, i: usize) -> i32 { - i as i32 * self.length - } -} - -impl From for FixedSizeListArray { - fn from(data: ArrayData) -> Self { - assert_eq!( - data.buffers().len(), - 0, - "FixedSizeListArray data should not contain a buffer for value offsets" - ); - assert_eq!( - data.child_data().len(), - 1, - "FixedSizeListArray should contain a single child array (values array)" - ); - let values = make_array(data.child_data()[0].clone()); - let length = match data.data_type() { - DataType::FixedSizeList(_, len) => { - if *len > 0 { - // check that child data is multiple of length - assert_eq!( - values.len() % *len as usize, - 0, - "FixedSizeListArray child array length should be a multiple of {}", - len - ); - } - - *len - } - _ => { - panic!("FixedSizeListArray data should contain a FixedSizeList data type") - } - }; - Self { - data, - values, - length, - } - } -} - -impl From for ArrayData { - fn from(array: FixedSizeListArray) -> Self { - array.data - } -} - -impl Array for FixedSizeListArray { - fn as_any(&self) -> &dyn Any { - self - } - - fn data(&self) -> &ArrayData { - &self.data - } - - fn into_data(self) -> ArrayData { - self.into() - } -} - -impl ArrayAccessor for FixedSizeListArray { - type Item = ArrayRef; - - fn value(&self, index: usize) -> Self::Item { - FixedSizeListArray::value(self, index) - } - - unsafe fn value_unchecked(&self, index: usize) -> Self::Item { - FixedSizeListArray::value(self, index) - } -} - -impl fmt::Debug for FixedSizeListArray { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "FixedSizeListArray<{}>\n[\n", self.value_length())?; - print_long_array(self, f, |array, index, f| { - fmt::Debug::fmt(&array.value(index), f) - })?; - write!(f, "]") - } -} - #[cfg(test)] mod tests { use crate::{ @@ -758,104 +605,6 @@ mod tests { ); } - #[test] - fn test_fixed_size_list_array() { - // Construct a value array - let value_data = ArrayData::builder(DataType::Int32) - .len(9) - .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7, 8])) - .build() - .unwrap(); - - // Construct a list array from the above two - let list_data_type = DataType::FixedSizeList( - Box::new(Field::new("item", DataType::Int32, false)), - 3, - ); - let list_data = ArrayData::builder(list_data_type.clone()) - .len(3) - .add_child_data(value_data.clone()) - .build() - .unwrap(); - let list_array = FixedSizeListArray::from(list_data); - - let values = list_array.values(); - assert_eq!(&value_data, values.data()); - assert_eq!(DataType::Int32, list_array.value_type()); - assert_eq!(3, list_array.len()); - assert_eq!(0, list_array.null_count()); - assert_eq!(6, list_array.value_offset(2)); - assert_eq!(3, list_array.value_length()); - assert_eq!( - 0, - list_array - .value(0) - .as_any() - .downcast_ref::() - .unwrap() - .value(0) - ); - for i in 0..3 { - assert!(list_array.is_valid(i)); - assert!(!list_array.is_null(i)); - } - - // Now test with a non-zero offset - let list_data = ArrayData::builder(list_data_type) - .len(3) - .offset(1) - .add_child_data(value_data.clone()) - .build() - .unwrap(); - let list_array = FixedSizeListArray::from(list_data); - - let values = list_array.values(); - assert_eq!(&value_data, values.data()); - assert_eq!(DataType::Int32, list_array.value_type()); - assert_eq!(3, list_array.len()); - assert_eq!(0, list_array.null_count()); - assert_eq!( - 3, - list_array - .value(0) - .as_any() - .downcast_ref::() - .unwrap() - .value(0) 
-        );
-        assert_eq!(6, list_array.value_offset(1));
-        assert_eq!(3, list_array.value_length());
-    }
-
-    #[test]
-    #[should_panic(
-        expected = "FixedSizeListArray child array length should be a multiple of 3"
-    )]
-    // Different error messages, so skip for now
-    // https://github.com/apache/arrow-rs/issues/1545
-    #[cfg(not(feature = "force_validate"))]
-    fn test_fixed_size_list_array_unequal_children() {
-        // Construct a value array
-        let value_data = ArrayData::builder(DataType::Int32)
-            .len(8)
-            .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7]))
-            .build()
-            .unwrap();
-
-        // Construct a list array from the above two
-        let list_data_type = DataType::FixedSizeList(
-            Box::new(Field::new("item", DataType::Int32, false)),
-            3,
-        );
-        let list_data = unsafe {
-            ArrayData::builder(list_data_type)
-                .len(3)
-                .add_child_data(value_data)
-                .build_unchecked()
-        };
-        drop(FixedSizeListArray::from(list_data));
-    }
-
     #[test]
     fn test_list_array_slice() {
         // Construct a value array
@@ -1022,102 +771,6 @@ mod tests {
         list_array.value(10);
     }
-
-    #[test]
-    fn test_fixed_size_list_array_slice() {
-        // Construct a value array
-        let value_data = ArrayData::builder(DataType::Int32)
-            .len(10)
-            .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))
-            .build()
-            .unwrap();
-
-        // Set null bits for the nested array:
-        //  [[0, 1], null, null, [6, 7], [8, 9]]
-        // 01011001 00000001
-        let mut null_bits: [u8; 1] = [0; 1];
-        bit_util::set_bit(&mut null_bits, 0);
-        bit_util::set_bit(&mut null_bits, 3);
-        bit_util::set_bit(&mut null_bits, 4);
-
-        // Construct a fixed size list array from the above two
-        let list_data_type = DataType::FixedSizeList(
-            Box::new(Field::new("item", DataType::Int32, false)),
-            2,
-        );
-        let list_data = ArrayData::builder(list_data_type)
-            .len(5)
-            .add_child_data(value_data.clone())
-            .null_bit_buffer(Some(Buffer::from(null_bits)))
-            .build()
-            .unwrap();
-        let list_array = FixedSizeListArray::from(list_data);
-
-        let values = list_array.values();
-        assert_eq!(&value_data, values.data());
-        assert_eq!(DataType::Int32, list_array.value_type());
-        assert_eq!(5, list_array.len());
-        assert_eq!(2, list_array.null_count());
-        assert_eq!(6, list_array.value_offset(3));
-        assert_eq!(2, list_array.value_length());
-
-        let sliced_array = list_array.slice(1, 4);
-        assert_eq!(4, sliced_array.len());
-        assert_eq!(1, sliced_array.offset());
-        assert_eq!(2, sliced_array.null_count());
-
-        for i in 0..sliced_array.len() {
-            if bit_util::get_bit(&null_bits, sliced_array.offset() + i) {
-                assert!(sliced_array.is_valid(i));
-            } else {
-                assert!(sliced_array.is_null(i));
-            }
-        }
-
-        // Check offset and length for each non-null value.
-        let sliced_list_array = sliced_array
-            .as_any()
-            .downcast_ref::<FixedSizeListArray>()
-            .unwrap();
-        assert_eq!(2, sliced_list_array.value_length());
-        assert_eq!(6, sliced_list_array.value_offset(2));
-        assert_eq!(8, sliced_list_array.value_offset(3));
-    }
-
-    #[test]
-    #[should_panic(expected = "assertion failed: (offset + length) <= self.len()")]
-    fn test_fixed_size_list_array_index_out_of_bound() {
-        // Construct a value array
-        let value_data = ArrayData::builder(DataType::Int32)
-            .len(10)
-            .add_buffer(Buffer::from_slice_ref(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))
-            .build()
-            .unwrap();
-
-        // Set null bits for the nested array:
-        //  [[0, 1], null, null, [6, 7], [8, 9]]
-        // 01011001 00000001
-        let mut null_bits: [u8; 1] = [0; 1];
-        bit_util::set_bit(&mut null_bits, 0);
-        bit_util::set_bit(&mut null_bits, 3);
-        bit_util::set_bit(&mut null_bits, 4);
-
-        // Construct a fixed size list array from the above two
-        let list_data_type = DataType::FixedSizeList(
-            Box::new(Field::new("item", DataType::Int32, false)),
-            2,
-        );
-        let list_data = ArrayData::builder(list_data_type)
-            .len(5)
-            .add_child_data(value_data)
-            .null_bit_buffer(Some(Buffer::from(null_bits)))
-            .build()
-            .unwrap();
-        let list_array = FixedSizeListArray::from(list_data);
-
-        list_array.value(10);
-    }
-
     #[test]
     #[should_panic(
         expected = "ListArray data should contain a single buffer only (value offsets)"
diff --git a/arrow/src/array/array_primitive.rs b/arrow/src/array/array_primitive.rs
index eb731a2b2f1e..a10104d980e1 100644
--- a/arrow/src/array/array_primitive.rs
+++ b/arrow/src/array/array_primitive.rs
@@ -549,6 +549,18 @@ impl<T: ArrowTimestampType> PrimitiveArray<T> {
         let array_data = unsafe { array_data.build_unchecked() };
         PrimitiveArray::from(array_data)
     }
+
+    /// Construct a timestamp array with new timezone
+    pub fn with_timezone(&self, timezone: String) -> Self {
+        let array_data = unsafe {
+            self.data
+                .clone()
+                .into_builder()
+                .data_type(DataType::Timestamp(T::get_time_unit(), Some(timezone)))
+                .build_unchecked()
+        };
+        PrimitiveArray::from(array_data)
+    }
 }
 
 impl PrimitiveArray {
@@ -1099,4 +1111,21 @@ mod tests {
             BooleanArray::from(vec![true, true, true, true, true])
         );
     }
+
+    #[cfg(feature = "chrono-tz")]
+    #[test]
+    fn test_with_timezone() {
+        use crate::compute::hour;
+        let a: TimestampMicrosecondArray = vec![37800000000, 86339000000].into();
+
+        let b = hour(&a).unwrap();
+        assert_eq!(10, b.value(0));
+        assert_eq!(23, b.value(1));
+
+        let a = a.with_timezone(String::from("America/Los_Angeles"));
+
+        let b = hour(&a).unwrap();
+        assert_eq!(2, b.value(0));
+        assert_eq!(15, b.value(1));
+    }
 }
diff --git a/arrow/src/array/array_string.rs b/arrow/src/array/array_string.rs
index 12a6b2f98b5f..b72152cc4acd 100644
--- a/arrow/src/array/array_string.rs
+++ b/arrow/src/array/array_string.rs
@@ -39,15 +39,17 @@ pub struct GenericStringArray<OffsetSize: OffsetSizeTrait> {
 }
 
 impl<OffsetSize: OffsetSizeTrait> GenericStringArray<OffsetSize> {
+    /// Data type of the array.
+    pub const DATA_TYPE: DataType = if OffsetSize::IS_LARGE {
+        DataType::LargeUtf8
+    } else {
+        DataType::Utf8
+    };
+
     /// Get the data type of the array.
-    // Declare this function as `pub const fn` after
-    // https://github.com/rust-lang/rust/issues/93706 is merged.
-    pub fn get_data_type() -> DataType {
-        if OffsetSize::IS_LARGE {
-            DataType::LargeUtf8
-        } else {
-            DataType::Utf8
-        }
+    #[deprecated(note = "please use `Self::DATA_TYPE` instead")]
+    pub const fn get_data_type() -> DataType {
+        Self::DATA_TYPE
     }
 
     /// Returns the length for the element at index `i`.
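A minimal usage sketch (not part of this diff) of the `with_timezone` helper and the `DATA_TYPE` associated constant introduced above; it assumes the `arrow` crate as patched by this change:

use arrow::array::{Array, GenericStringArray, TimestampSecondArray};
use arrow::datatypes::{DataType, TimeUnit};

fn sketch_new_apis() {
    // The associated constant replaces the now-deprecated `get_data_type()`
    // and, unlike the old function, is usable in const contexts.
    assert_eq!(GenericStringArray::<i32>::DATA_TYPE, DataType::Utf8);
    assert_eq!(GenericStringArray::<i64>::DATA_TYPE, DataType::LargeUtf8);

    // `with_timezone` rewrites only the data type; the stored i64 epoch
    // values stay untouched, so temporal kernels such as `hour` reinterpret
    // them in the new zone (see the `chrono-tz` test above).
    let utc: TimestampSecondArray = vec![0_i64].into();
    let tagged = utc.with_timezone(String::from("+00:00"));
    assert_eq!(
        tagged.data_type(),
        &DataType::Timestamp(TimeUnit::Second, Some("+00:00".to_string()))
    );
}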
@@ -119,23 +121,40 @@ impl<OffsetSize: OffsetSizeTrait> GenericStringArray<OffsetSize> {
         unsafe { self.value_unchecked(i) }
     }
 
+    /// Convert a list array to a string array.
+    /// This method is unsound because it does not check
+    /// that each element holds valid UTF-8.
     fn from_list(v: GenericListArray<OffsetSize>) -> Self {
         assert_eq!(
-            v.data().child_data()[0].child_data().len(),
+            v.data_ref().child_data().len(),
+            1,
+            "StringArray can only be created from list array of u8 values \
+             (i.e. List<PrimitiveArray<u8>>)."
+        );
+        let child_data = &v.data_ref().child_data()[0];
+
+        assert_eq!(
+            child_data.child_data().len(),
             0,
             "StringArray can only be created from list array of u8 values \
              (i.e. List<PrimitiveArray<u8>>)."
         );
         assert_eq!(
-            v.data().child_data()[0].data_type(),
+            child_data.data_type(),
             &DataType::UInt8,
             "StringArray can only be created from List arrays, mismatched data types."
         );
+        assert_eq!(
+            child_data.null_count(),
+            0,
+            "The child array cannot contain null values."
+        );
 
-        let builder = ArrayData::builder(Self::get_data_type())
+        let builder = ArrayData::builder(Self::DATA_TYPE)
             .len(v.len())
+            .offset(v.offset())
             .add_buffer(v.data().buffers()[0].clone())
-            .add_buffer(v.data().child_data()[0].buffers()[0].clone())
+            .add_buffer(child_data.buffers()[0].slice(child_data.offset()))
             .null_bit_buffer(v.data().null_buffer().cloned());
 
         let array_data = unsafe { builder.build_unchecked() };
@@ -170,7 +189,7 @@ impl<OffsetSize: OffsetSizeTrait> GenericStringArray<OffsetSize> {
         assert!(!offsets.is_empty()); // wrote at least one
         let actual_len = (offsets.len() / std::mem::size_of::<OffsetSize>()) - 1;
 
-        let array_data = ArrayData::builder(Self::get_data_type())
+        let array_data = ArrayData::builder(Self::DATA_TYPE)
             .len(actual_len)
             .add_buffer(offsets.into())
             .add_buffer(values.into());
@@ -247,7 +266,7 @@ where
         // calculate actual data_len, which may be different from the iterator's upper bound
         let data_len = (offsets.len() / offset_size) - 1;
 
-        let array_data = ArrayData::builder(Self::get_data_type())
+        let array_data = ArrayData::builder(Self::DATA_TYPE)
             .len(data_len)
             .add_buffer(offsets.into())
             .add_buffer(values.into())
@@ -275,7 +294,7 @@ impl<'a, T: OffsetSizeTrait> GenericStringArray<T> {
 impl<OffsetSize: OffsetSizeTrait> fmt::Debug for GenericStringArray<OffsetSize> {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        let prefix = if OffsetSize::IS_LARGE { "Large" } else { "" };
+        let prefix = OffsetSize::PREFIX;
 
         write!(f, "{}StringArray\n[\n", prefix)?;
         print_long_array(self, f, |array, index, f| {
@@ -325,10 +344,7 @@ impl<OffsetSize: OffsetSizeTrait> From<GenericBinaryArray<OffsetSize>> for GenericStringArray<OffsetSize> {
     fn from(v: GenericBinaryArray<OffsetSize>) -> Self {
-        let builder = v
-            .into_data()
-            .into_builder()
-            .data_type(Self::get_data_type());
+        let builder = v.into_data().into_builder().data_type(Self::DATA_TYPE);
         let data = unsafe { builder.build_unchecked() };
         Self::from(data)
     }
@@ -338,7 +354,7 @@ impl<OffsetSize: OffsetSizeTrait> From<ArrayData> for GenericStringArray<OffsetSize> {
     fn from(data: ArrayData) -> Self {
         assert_eq!(
             data.data_type(),
-            &Self::get_data_type(),
+            &Self::DATA_TYPE,
             "[Large]StringArray expects Datatype::[Large]Utf8"
         );
         assert_eq!(
@@ -409,7 +425,10 @@ pub type LargeStringArray = GenericStringArray<i64>;
 
 #[cfg(test)]
 mod tests {
-    use crate::array::{ListBuilder, StringBuilder};
+    use crate::{
+        array::{ListBuilder, StringBuilder},
+        datatypes::Field,
+    };
 
     use super::*;
 
@@ -675,4 +694,127 @@ mod tests {
             LargeStringArray::from_iter_values(BadIterator::new(3, 1, data.clone()));
         assert_eq!(expected, arr);
     }
+
+    fn _test_generic_string_array_from_list_array<O: OffsetSizeTrait>() {
+        let values = b"HelloArrowAndParquet";
+        // "ArrowAndParquet"
+        let child_data = ArrayData::builder(DataType::UInt8)
+            .len(15)
+            .offset(5)
+            .add_buffer(Buffer::from(&values[..]))
+            .build()
+            .unwrap();
+
+        let offsets = [0, 5, 8, 15].map(|n| O::from_usize(n).unwrap());
+        let null_buffer = Buffer::from_slice_ref(&[0b101]);
+        let data_type = GenericListArray::<O>::DATA_TYPE_CONSTRUCTOR(Box::new(
+            Field::new("item", DataType::UInt8, false),
+        ));
+
+        // [None, Some("Parquet")]
+        let array_data = ArrayData::builder(data_type)
+            .len(2)
+            .offset(1)
+            .add_buffer(Buffer::from_slice_ref(&offsets))
+            .null_bit_buffer(Some(null_buffer))
+            .add_child_data(child_data)
+            .build()
+            .unwrap();
+        let list_array = GenericListArray::<O>::from(array_data);
+        let string_array = GenericStringArray::<O>::from(list_array);
+
+        assert_eq!(2, string_array.len());
+        assert_eq!(1, string_array.null_count());
+        assert!(string_array.is_null(0));
+        assert!(string_array.is_valid(1));
+        assert_eq!("Parquet", string_array.value(1));
+    }
+
+    #[test]
+    fn test_string_array_from_list_array() {
+        _test_generic_string_array_from_list_array::<i32>();
+    }
+
+    #[test]
+    fn test_large_string_array_from_list_array() {
+        _test_generic_string_array_from_list_array::<i64>();
+    }
+
+    fn _test_generic_string_array_from_list_array_with_child_nulls_failed<
+        O: OffsetSizeTrait,
+    >() {
+        let values = b"HelloArrow";
+        let child_data = ArrayData::builder(DataType::UInt8)
+            .len(10)
+            .add_buffer(Buffer::from(&values[..]))
+            .null_bit_buffer(Some(Buffer::from_slice_ref(&[0b1010101010])))
+            .build()
+            .unwrap();
+
+        let offsets = [0, 5, 10].map(|n| O::from_usize(n).unwrap());
+        let data_type = GenericListArray::<O>::DATA_TYPE_CONSTRUCTOR(Box::new(
+            Field::new("item", DataType::UInt8, false),
+        ));
+
+        // [Some(b"Hello"), Some(b"Arrow")], with nulls in the child data
+        let array_data = ArrayData::builder(data_type)
+            .len(2)
+            .add_buffer(Buffer::from_slice_ref(&offsets))
+            .add_child_data(child_data)
+            .build()
+            .unwrap();
+        let list_array = GenericListArray::<O>::from(array_data);
+        drop(GenericStringArray::<O>::from(list_array));
+    }
+
+    #[test]
+    #[should_panic(expected = "The child array cannot contain null values.")]
+    fn test_string_array_from_list_array_with_child_nulls_failed() {
+        _test_generic_string_array_from_list_array_with_child_nulls_failed::<i32>();
+    }
+
+    #[test]
+    #[should_panic(expected = "The child array cannot contain null values.")]
+    fn test_large_string_array_from_list_array_with_child_nulls_failed() {
+        _test_generic_string_array_from_list_array_with_child_nulls_failed::<i64>();
+    }
+
+    fn _test_generic_string_array_from_list_array_wrong_type<O: OffsetSizeTrait>() {
+        let values = b"HelloArrow";
+        let child_data = ArrayData::builder(DataType::UInt16)
+            .len(5)
+            .add_buffer(Buffer::from(&values[..]))
+            .build()
+            .unwrap();
+
+        let offsets = [0, 2, 3].map(|n| O::from_usize(n).unwrap());
+        let data_type = GenericListArray::<O>::DATA_TYPE_CONSTRUCTOR(Box::new(
+            Field::new("item", DataType::UInt16, false),
+        ));
+
+        let array_data = ArrayData::builder(data_type)
+            .len(2)
+            .add_buffer(Buffer::from_slice_ref(&offsets))
+            .add_child_data(child_data)
+            .build()
+            .unwrap();
+        let list_array = GenericListArray::<O>::from(array_data);
+        drop(GenericStringArray::<O>::from(list_array));
+    }
+
+    #[test]
+    #[should_panic(
+        expected = "StringArray can only be created from List arrays, mismatched data types."
+ )] + fn test_large_string_array_from_list_array_wrong_type() { + _test_generic_string_array_from_list_array_wrong_type::(); + } } diff --git a/arrow/src/array/builder/decimal_builder.rs b/arrow/src/array/builder/decimal_builder.rs index d015d3dcecda..9a76a31dd523 100644 --- a/arrow/src/array/builder/decimal_builder.rs +++ b/arrow/src/array/builder/decimal_builder.rs @@ -15,19 +15,20 @@ // specific language governing permissions and limitations // under the License. -use num::BigInt; use std::any::Any; use std::sync::Arc; -use crate::array::array_decimal::{BasicDecimalArray, Decimal256Array}; +use crate::array::array_decimal::Decimal256Array; use crate::array::ArrayRef; use crate::array::Decimal128Array; use crate::array::{ArrayBuilder, FixedSizeBinaryBuilder}; use crate::error::{ArrowError, Result}; -use crate::datatypes::{validate_decimal256_precision, validate_decimal_precision}; -use crate::util::decimal::{BasicDecimal, Decimal256}; +use crate::datatypes::{ + validate_decimal256_precision_with_lt_bytes, validate_decimal_precision, +}; +use crate::util::decimal::Decimal256; /// Array Builder for [`Decimal128Array`] /// @@ -84,39 +85,32 @@ impl Decimal128Builder { /// Appends a decimal value into the builder. #[inline] pub fn append_value(&mut self, value: impl Into) -> Result<()> { - let value = if self.value_validation { - validate_decimal_precision(value.into(), self.precision)? - } else { - value.into() - }; - - let value_as_bytes = - Self::from_i128_to_fixed_size_bytes(value, Self::BYTE_LENGTH as usize)?; - if Self::BYTE_LENGTH != value_as_bytes.len() as i32 { - return Err(ArrowError::InvalidArgumentError( - "Byte slice does not have the same length as Decimal128Builder value lengths".to_string() - )); + let value = value.into(); + if self.value_validation { + validate_decimal_precision(value, self.precision)? } + let value_as_bytes: [u8; 16] = value.to_le_bytes(); self.builder.append_value(value_as_bytes.as_slice()) } - pub(crate) fn from_i128_to_fixed_size_bytes(v: i128, size: usize) -> Result> { - if size > 16 { - return Err(ArrowError::InvalidArgumentError( - "Decimal128Builder only supports values up to 16 bytes.".to_string(), - )); - } - let res = v.to_le_bytes(); - let start_byte = 16 - size; - Ok(res[start_byte..16].to_vec()) - } - /// Append a null value to the array. #[inline] pub fn append_null(&mut self) { self.builder.append_null() } + /// Appends an `Option>` into the builder. + #[inline] + pub fn append_option(&mut self, value: Option>) -> Result<()> { + match value { + None => { + self.append_null(); + Ok(()) + } + Some(value) => self.append_value(value), + } + } + /// Builds the `Decimal128Array` and reset this builder. pub fn finish(&mut self) -> Decimal128Array { Decimal128Array::from_fixed_size_binary_array( @@ -189,9 +183,7 @@ impl Decimal256Builder { pub fn append_value(&mut self, value: &Decimal256) -> Result<()> { let value = if self.value_validation { let raw_bytes = value.raw_value(); - let integer = BigInt::from_signed_bytes_le(raw_bytes); - let value_str = integer.to_string(); - validate_decimal256_precision(&value_str, self.precision)?; + validate_decimal256_precision_with_lt_bytes(raw_bytes, self.precision)?; value } else { value @@ -219,6 +211,18 @@ impl Decimal256Builder { self.builder.append_null() } + /// Appends an `Option<&Decimal256>` into the builder. 
+ #[inline] + pub fn append_option(&mut self, value: Option<&Decimal256>) -> Result<()> { + match value { + None => { + self.append_null(); + Ok(()) + } + Some(value) => self.append_value(value), + } + } + /// Builds the [`Decimal256Array`] and reset this builder. pub fn finish(&mut self) -> Decimal256Array { Decimal256Array::from_fixed_size_binary_array( @@ -232,12 +236,12 @@ impl Decimal256Builder { #[cfg(test)] mod tests { use super::*; - use num::Num; + use num::{BigInt, Num}; - use crate::array::array_decimal::{BasicDecimalArray, Decimal128Array}; + use crate::array::array_decimal::Decimal128Array; use crate::array::{array_decimal, Array}; use crate::datatypes::DataType; - use crate::util::decimal::Decimal128; + use crate::util::decimal::{Decimal128, Decimal256}; #[test] fn test_decimal_builder() { @@ -246,11 +250,13 @@ mod tests { builder.append_value(8_887_000_000_i128).unwrap(); builder.append_null(); builder.append_value(-8_887_000_000_i128).unwrap(); + builder.append_option(None::).unwrap(); + builder.append_option(Some(8_887_000_000_i128)).unwrap(); let decimal_array: Decimal128Array = builder.finish(); - assert_eq!(&DataType::Decimal(38, 6), decimal_array.data_type()); - assert_eq!(3, decimal_array.len()); - assert_eq!(1, decimal_array.null_count()); + assert_eq!(&DataType::Decimal128(38, 6), decimal_array.data_type()); + assert_eq!(5, decimal_array.len()); + assert_eq!(2, decimal_array.null_count()); assert_eq!(32, decimal_array.value_offset(2)); assert_eq!(16, decimal_array.value_length()); } @@ -268,7 +274,7 @@ mod tests { .unwrap(); let decimal_array: Decimal128Array = builder.finish(); - assert_eq!(&DataType::Decimal(38, 6), decimal_array.data_type()); + assert_eq!(&DataType::Decimal128(38, 6), decimal_array.data_type()); assert_eq!(3, decimal_array.len()); assert_eq!(1, decimal_array.null_count()); assert_eq!(32, decimal_array.value_offset(2)); @@ -279,28 +285,31 @@ mod tests { fn test_decimal256_builder() { let mut builder = Decimal256Builder::new(30, 40, 6); - let mut bytes = vec![0; 32]; + let mut bytes = [0_u8; 32]; bytes[0..16].clone_from_slice(&8_887_000_000_i128.to_le_bytes()); - let value = Decimal256::try_new_from_bytes(40, 6, bytes.as_slice()).unwrap(); + let value = Decimal256::try_new_from_bytes(40, 6, &bytes).unwrap(); builder.append_value(&value).unwrap(); builder.append_null(); - bytes = vec![255; 32]; - let value = Decimal256::try_new_from_bytes(40, 6, bytes.as_slice()).unwrap(); + bytes = [255; 32]; + let value = Decimal256::try_new_from_bytes(40, 6, &bytes).unwrap(); builder.append_value(&value).unwrap(); - bytes = vec![0; 32]; + bytes = [0; 32]; bytes[0..16].clone_from_slice(&0_i128.to_le_bytes()); bytes[15] = 128; - let value = Decimal256::try_new_from_bytes(40, 6, bytes.as_slice()).unwrap(); + let value = Decimal256::try_new_from_bytes(40, 6, &bytes).unwrap(); builder.append_value(&value).unwrap(); + builder.append_option(None::<&Decimal256>).unwrap(); + builder.append_option(Some(&value)).unwrap(); + let decimal_array: Decimal256Array = builder.finish(); assert_eq!(&DataType::Decimal256(40, 6), decimal_array.data_type()); - assert_eq!(4, decimal_array.len()); - assert_eq!(1, decimal_array.null_count()); + assert_eq!(6, decimal_array.len()); + assert_eq!(2, decimal_array.null_count()); assert_eq!(64, decimal_array.value_offset(2)); assert_eq!(32, decimal_array.value_length()); @@ -320,39 +329,37 @@ mod tests { fn test_decimal256_builder_unmatched_precision_scale() { let mut builder = Decimal256Builder::new(30, 10, 6); - let mut bytes = vec![0; 32]; + 
let mut bytes = [0_u8; 32]; bytes[0..16].clone_from_slice(&8_887_000_000_i128.to_le_bytes()); - let value = Decimal256::try_new_from_bytes(40, 6, bytes.as_slice()).unwrap(); + let value = Decimal256::try_new_from_bytes(40, 6, &bytes).unwrap(); builder.append_value(&value).unwrap(); } #[test] #[should_panic( - expected = "9999999999999999999999999999999999999999999999999999999999999999999999999999 is too large to store in a Decimal256 of precision 76. Max is 999999999999999999999999999999999999999999999999999999999999999999999999999" + expected = "9999999999999999999999999999999999999999999999999999999999999999999999999999 is too large to store in a Decimal256 of precision 75. Max is 999999999999999999999999999999999999999999999999999999999999999999999999999" )] fn test_decimal256_builder_out_of_range_precision_scale() { - let mut builder = Decimal256Builder::new(30, 76, 6); + let mut builder = Decimal256Builder::new(30, 75, 6); let big_value = BigInt::from_str_radix("9999999999999999999999999999999999999999999999999999999999999999999999999999", 10).unwrap(); - let bytes = big_value.to_signed_bytes_le(); - let value = Decimal256::try_new_from_bytes(76, 6, &bytes).unwrap(); + let value = Decimal256::from_big_int(&big_value, 75, 6).unwrap(); builder.append_value(&value).unwrap(); } #[test] #[should_panic( - expected = "9999999999999999999999999999999999999999999999999999999999999999999999999999 is too large to store in a Decimal256 of precision 76. Max is 999999999999999999999999999999999999999999999999999999999999999999999999999" + expected = "9999999999999999999999999999999999999999999999999999999999999999999999999999 is too large to store in a Decimal256 of precision 75. Max is 999999999999999999999999999999999999999999999999999999999999999999999999999" )] fn test_decimal256_data_validation() { - let mut builder = Decimal256Builder::new(30, 76, 6); + let mut builder = Decimal256Builder::new(30, 75, 6); // Disable validation at builder unsafe { builder.disable_value_validation(); } let big_value = BigInt::from_str_radix("9999999999999999999999999999999999999999999999999999999999999999999999999999", 10).unwrap(); - let bytes = big_value.to_signed_bytes_le(); - let value = Decimal256::try_new_from_bytes(76, 6, &bytes).unwrap(); + let value = Decimal256::from_big_int(&big_value, 75, 6).unwrap(); builder .append_value(&value) .expect("should not validate invalid value at builder"); diff --git a/arrow/src/array/builder/generic_binary_builder.rs b/arrow/src/array/builder/generic_binary_builder.rs index 8f242243cd7c..aca2e1d9694e 100644 --- a/arrow/src/array/builder/generic_binary_builder.rs +++ b/arrow/src/array/builder/generic_binary_builder.rs @@ -15,12 +15,9 @@ // specific language governing permissions and limitations // under the License. -use crate::{ - array::{ - ArrayBuilder, ArrayDataBuilder, ArrayRef, GenericBinaryArray, OffsetSizeTrait, - UInt8BufferBuilder, - }, - datatypes::DataType, +use crate::array::{ + ArrayBuilder, ArrayDataBuilder, ArrayRef, GenericBinaryArray, OffsetSizeTrait, + UInt8BufferBuilder, }; use std::any::Any; use std::sync::Arc; @@ -80,11 +77,7 @@ impl GenericBinaryBuilder { /// Builds the [`GenericBinaryArray`] and reset this builder. 
pub fn finish(&mut self) -> GenericBinaryArray { - let array_type = if OffsetSize::IS_LARGE { - DataType::LargeBinary - } else { - DataType::Binary - }; + let array_type = GenericBinaryArray::::DATA_TYPE; let array_builder = ArrayDataBuilder::new(array_type) .len(self.len()) .add_buffer(self.offsets_builder.finish()) diff --git a/arrow/src/array/builder/generic_list_builder.rs b/arrow/src/array/builder/generic_list_builder.rs index 911182f6571d..686156df13bc 100644 --- a/arrow/src/array/builder/generic_list_builder.rs +++ b/arrow/src/array/builder/generic_list_builder.rs @@ -22,7 +22,6 @@ use crate::array::ArrayData; use crate::array::ArrayRef; use crate::array::GenericListArray; use crate::array::OffsetSizeTrait; -use crate::datatypes::DataType; use crate::datatypes::Field; use super::{ArrayBuilder, BufferBuilder, NullBufferBuilder}; @@ -135,11 +134,7 @@ where values_data.data_type().clone(), true, // TODO: find a consistent way of getting this )); - let data_type = if OffsetSize::IS_LARGE { - DataType::LargeList(field) - } else { - DataType::List(field) - }; + let data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(field); let array_data_builder = ArrayData::builder(data_type) .len(len) .add_buffer(offset_buffer) @@ -163,6 +158,7 @@ mod tests { use crate::array::builder::ListBuilder; use crate::array::{Array, Int32Array, Int32Builder}; use crate::buffer::Buffer; + use crate::datatypes::DataType; fn _test_generic_list_array_builder() { let values_builder = Int32Builder::new(10); diff --git a/arrow/src/array/builder/string_dictionary_builder.rs b/arrow/src/array/builder/string_dictionary_builder.rs index cfbda38c0b28..15a36a64c14e 100644 --- a/arrow/src/array/builder/string_dictionary_builder.rs +++ b/arrow/src/array/builder/string_dictionary_builder.rs @@ -137,7 +137,7 @@ where for (idx, maybe_value) in dictionary_values.iter().enumerate() { match maybe_value { Some(value) => { - let hash = compute_hash(&state, value.as_bytes()); + let hash = state.hash_one(value.as_bytes()); let key = K::Native::from_usize(idx) .ok_or(ArrowError::DictionaryKeyOverflowError)?; @@ -149,7 +149,7 @@ where if let RawEntryMut::Vacant(v) = entry { v.insert_with_hasher(hash, key, (), |key| { - compute_hash(&state, get_bytes(&values_builder, key)) + state.hash_one(get_bytes(&values_builder, key)) }); } @@ -217,7 +217,7 @@ where let state = &self.state; let storage = &mut self.values_builder; - let hash = compute_hash(state, value.as_bytes()); + let hash = state.hash_one(value.as_bytes()); let entry = self .dedup @@ -234,7 +234,7 @@ where *entry .insert_with_hasher(hash, key, (), |key| { - compute_hash(state, get_bytes(storage, key)) + state.hash_one(get_bytes(storage, key)) }) .0 } @@ -268,13 +268,6 @@ where } } -fn compute_hash(hasher: &ahash::RandomState, value: &[u8]) -> u64 { - use std::hash::{BuildHasher, Hash, Hasher}; - let mut state = hasher.build_hasher(); - value.hash(&mut state); - state.finish() -} - fn get_bytes<'a, K: ArrowNativeType>(values: &'a StringBuilder, key: &K) -> &'a [u8] { let offsets = values.offsets_slice(); let values = values.values_slice(); diff --git a/arrow/src/array/builder/struct_builder.rs b/arrow/src/array/builder/struct_builder.rs index 373a84582831..554e3c553db5 100644 --- a/arrow/src/array/builder/struct_builder.rs +++ b/arrow/src/array/builder/struct_builder.rs @@ -112,7 +112,7 @@ pub fn make_builder(datatype: &DataType, capacity: usize) -> Box { Box::new(FixedSizeBinaryBuilder::new(capacity, *len)) } - DataType::Decimal(precision, scale) => { + 
DataType::Decimal128(precision, scale) => { Box::new(Decimal128Builder::new(capacity, *precision, *scale)) } DataType::Utf8 => Box::new(StringBuilder::new(capacity)), diff --git a/arrow/src/array/data.rs b/arrow/src/array/data.rs index 17b51cd3b7bd..3993d51d9b87 100644 --- a/arrow/src/array/data.rs +++ b/arrow/src/array/data.rs @@ -19,8 +19,8 @@ //! common attributes and operations for Arrow array. use crate::datatypes::{ - validate_decimal256_precision, validate_decimal_precision, DataType, IntervalUnit, - UnionMode, + validate_decimal256_precision_with_lt_bytes, validate_decimal_precision, DataType, + IntervalUnit, UnionMode, }; use crate::error::{ArrowError, Result}; use crate::util::bit_iterator::BitSliceIterator; @@ -30,7 +30,6 @@ use crate::{ util::bit_util, }; use half::f16; -use num::BigInt; use std::convert::TryInto; use std::mem; use std::ops::Range; @@ -209,7 +208,7 @@ pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuff DataType::FixedSizeList(_, _) | DataType::Struct(_) => { [empty_buffer, MutableBuffer::new(0)] } - DataType::Decimal(_, _) | DataType::Decimal256(_, _) => [ + DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => [ MutableBuffer::new(capacity * mem::size_of::()), empty_buffer, ], @@ -396,18 +395,24 @@ impl ArrayData { /// panic's if the new DataType is not compatible with the /// existing type. /// - /// Note: currently only changing a [DataType::Decimal]s precision - /// and scale are supported + /// Note: currently only changing a [DataType::Decimal128]s or + /// [DataType::Decimal256]s precision and scale are supported #[inline] pub(crate) fn with_data_type(mut self, new_data_type: DataType) -> Self { - assert!( - matches!(self.data_type, DataType::Decimal(_, _)), - "only DecimalType is supported for existing type" - ); - assert!( - matches!(new_data_type, DataType::Decimal(_, _)), - "only DecimalType is supported for new datatype" - ); + if matches!(self.data_type, DataType::Decimal128(_, _)) { + assert!( + matches!(new_data_type, DataType::Decimal128(_, _)), + "only 128-bit DecimalType is supported for new datatype" + ); + } else if matches!(self.data_type, DataType::Decimal256(_, _)) { + assert!( + matches!(new_data_type, DataType::Decimal256(_, _)), + "only 256-bit DecimalType is supported for new datatype" + ); + } else { + panic!("only DecimalType is supported.") + } + self.data_type = new_data_type; self } @@ -598,7 +603,7 @@ impl ArrayData { | DataType::LargeBinary | DataType::Interval(_) | DataType::FixedSizeBinary(_) - | DataType::Decimal(_, _) + | DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => vec![], DataType::List(field) => { vec![Self::new_empty(field.data_type())] @@ -1031,7 +1036,7 @@ impl ArrayData { pub fn validate_values(&self) -> Result<()> { match &self.data_type { - DataType::Decimal(p, _) => { + DataType::Decimal128(p, _) => { let values_buffer: &[i128] = self.typed_buffer(0, self.len)?; for value in values_buffer { validate_decimal_precision(*value, *p)?; @@ -1043,9 +1048,7 @@ impl ArrayData { for pos in 0..self.len() { let offset = pos * 32; let raw_bytes = &values[offset..offset + 32]; - let integer = BigInt::from_signed_bytes_le(raw_bytes); - let value_str = integer.to_string(); - validate_decimal256_precision(&value_str, *p)?; + validate_decimal256_precision_with_lt_bytes(raw_bytes, *p)?; } Ok(()) } @@ -1361,7 +1364,7 @@ pub(crate) fn layout(data_type: &DataType) -> DataTypeLayout { } } DataType::Dictionary(key_type, _value_type) => layout(key_type), - DataType::Decimal(_, _) => { + 
DataType::Decimal128(_, _) => { // Decimals are always some fixed width; The rust implementation // always uses 16 bytes / size of i128 DataTypeLayout::new_fixed_width(size_of::()) @@ -2822,11 +2825,7 @@ mod tests { let byte_width = 16; let mut fixed_size_builder = FixedSizeListBuilder::new(values_builder, byte_width); - let value_as_bytes = Decimal128Builder::from_i128_to_fixed_size_bytes( - 123456, - fixed_size_builder.value_length() as usize, - ) - .unwrap(); + let value_as_bytes = 123456_i128.to_le_bytes(); fixed_size_builder .values() .append_slice(value_as_bytes.as_slice()); @@ -2834,14 +2833,14 @@ mod tests { let fixed_size_array = fixed_size_builder.finish(); // Build ArrayData for Decimal - let builder = ArrayData::builder(DataType::Decimal(5, 3)) + let builder = ArrayData::builder(DataType::Decimal128(5, 3)) .len(fixed_size_array.len()) .add_buffer(fixed_size_array.data_ref().child_data()[0].buffers()[0].clone()); let array_data = unsafe { builder.build_unchecked() }; let validation_result = array_data.validate_full(); let error = validation_result.unwrap_err(); assert_eq!( - "Invalid argument error: 123456 is too large to store in a Decimal of precision 5. Max is 99999", + "Invalid argument error: 123456 is too large to store in a Decimal128 of precision 5. Max is 99999", error.to_string() ); } diff --git a/arrow/src/array/equal/decimal.rs b/arrow/src/array/equal/decimal.rs index 7c44037be398..42a7d29e27d2 100644 --- a/arrow/src/array/equal/decimal.rs +++ b/arrow/src/array/equal/decimal.rs @@ -29,7 +29,7 @@ pub(super) fn decimal_equal( len: usize, ) -> bool { let size = match lhs.data_type() { - DataType::Decimal(_, _) => 16, + DataType::Decimal128(_, _) => 16, DataType::Decimal256(_, _) => 32, _ => unreachable!(), }; diff --git a/arrow/src/array/equal/mod.rs b/arrow/src/array/equal/mod.rs index 270147eaeec3..6fdc06f837c0 100644 --- a/arrow/src/array/equal/mod.rs +++ b/arrow/src/array/equal/mod.rs @@ -187,7 +187,7 @@ fn equal_values( DataType::FixedSizeBinary(_) => { fixed_binary_equal(lhs, rhs, lhs_start, rhs_start, len) } - DataType::Decimal(_, _) | DataType::Decimal256(_, _) => { + DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => { decimal_equal(lhs, rhs, lhs_start, rhs_start, len) } DataType::List(_) => list_equal::(lhs, rhs, lhs_start, rhs_start, len), diff --git a/arrow/src/array/equal_json.rs b/arrow/src/array/equal_json.rs deleted file mode 100644 index e7d14aae81a8..000000000000 --- a/arrow/src/array/equal_json.rs +++ /dev/null @@ -1,1170 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -use super::*; -use crate::array::BasicDecimalArray; -use crate::datatypes::*; -use crate::util::decimal::BasicDecimal; -use array::Array; -use hex::FromHex; -use serde_json::value::Value::{Null as JNull, Object, String as JString}; -use serde_json::Value; - -/// Trait for comparing arrow array with json array -pub trait JsonEqual { - /// Checks whether arrow array equals to json array. - fn equals_json(&self, json: &[&Value]) -> bool; - - /// Checks whether arrow array equals to json array. - fn equals_json_values(&self, json: &[Value]) -> bool { - let refs = json.iter().collect::>(); - - self.equals_json(&refs) - } -} - -impl<'a, T: JsonEqual> JsonEqual for &'a T { - fn equals_json(&self, json: &[&Value]) -> bool { - T::equals_json(self, json) - } - - fn equals_json_values(&self, json: &[Value]) -> bool { - T::equals_json_values(self, json) - } -} - -/// Implement array equals for numeric type -impl JsonEqual for PrimitiveArray { - fn equals_json(&self, json: &[&Value]) -> bool { - self.len() == json.len() - && (0..self.len()).all(|i| match json[i] { - Value::Null => self.is_null(i), - v => { - self.is_valid(i) - && Some(v) == self.value(i).into_json_value().as_ref() - } - }) - } -} - -/// Implement array equals for numeric type -impl JsonEqual for BooleanArray { - fn equals_json(&self, json: &[&Value]) -> bool { - self.len() == json.len() - && (0..self.len()).all(|i| match json[i] { - Value::Null => self.is_null(i), - v => { - self.is_valid(i) - && Some(v) == self.value(i).into_json_value().as_ref() - } - }) - } -} - -impl PartialEq for PrimitiveArray { - fn eq(&self, json: &Value) -> bool { - match json { - Value::Array(array) => self.equals_json_values(array), - _ => false, - } - } -} - -impl PartialEq> for Value { - fn eq(&self, arrow: &PrimitiveArray) -> bool { - match self { - Value::Array(array) => arrow.equals_json_values(array), - _ => false, - } - } -} - -impl JsonEqual for GenericListArray { - fn equals_json(&self, json: &[&Value]) -> bool { - if self.len() != json.len() { - return false; - } - - (0..self.len()).all(|i| match json[i] { - Value::Array(v) => self.is_valid(i) && self.value(i).equals_json_values(v), - Value::Null => self.is_null(i) || self.value_length(i).is_zero(), - _ => false, - }) - } -} - -impl PartialEq for GenericListArray { - fn eq(&self, json: &Value) -> bool { - match json { - Value::Array(json_array) => self.equals_json_values(json_array), - _ => false, - } - } -} - -impl PartialEq> for Value { - fn eq(&self, arrow: &GenericListArray) -> bool { - match self { - Value::Array(json_array) => arrow.equals_json_values(json_array), - _ => false, - } - } -} - -impl JsonEqual for DictionaryArray { - fn equals_json(&self, json: &[&Value]) -> bool { - // todo: this is wrong: we must test the values also - self.keys().equals_json(json) - } -} - -impl PartialEq for DictionaryArray { - fn eq(&self, json: &Value) -> bool { - match json { - Value::Array(json_array) => self.equals_json_values(json_array), - _ => false, - } - } -} - -impl PartialEq> for Value { - fn eq(&self, arrow: &DictionaryArray) -> bool { - match self { - Value::Array(json_array) => arrow.equals_json_values(json_array), - _ => false, - } - } -} - -impl JsonEqual for FixedSizeListArray { - fn equals_json(&self, json: &[&Value]) -> bool { - if self.len() != json.len() { - return false; - } - - (0..self.len()).all(|i| match json[i] { - Value::Array(v) => self.is_valid(i) && self.value(i).equals_json_values(v), - Value::Null => self.is_null(i) || self.value_length() == 0, - _ => false, - }) - } -} - 
-impl PartialEq for FixedSizeListArray { - fn eq(&self, json: &Value) -> bool { - match json { - Value::Array(json_array) => self.equals_json_values(json_array), - _ => false, - } - } -} - -impl PartialEq for Value { - fn eq(&self, arrow: &FixedSizeListArray) -> bool { - match self { - Value::Array(json_array) => arrow.equals_json_values(json_array), - _ => false, - } - } -} - -impl JsonEqual for StructArray { - fn equals_json(&self, json: &[&Value]) -> bool { - if self.len() != json.len() { - return false; - } - - let all_object = json.iter().all(|v| matches!(v, Object(_) | JNull)); - - if !all_object { - return false; - } - - for column_name in self.column_names() { - let json_values = json - .iter() - .map(|obj| obj.get(column_name).unwrap_or(&Value::Null)) - .collect::>(); - - if !self - .column_by_name(column_name) - .map(|arr| arr.equals_json(&json_values)) - .unwrap_or(false) - { - return false; - } - } - - true - } -} - -impl PartialEq for StructArray { - fn eq(&self, json: &Value) -> bool { - match json { - Value::Array(json_array) => self.equals_json_values(json_array), - _ => false, - } - } -} - -impl PartialEq for Value { - fn eq(&self, arrow: &StructArray) -> bool { - match self { - Value::Array(json_array) => arrow.equals_json_values(json_array), - _ => false, - } - } -} - -impl JsonEqual for MapArray { - fn equals_json(&self, json: &[&Value]) -> bool { - if self.len() != json.len() { - return false; - } - - (0..self.len()).all(|i| match json[i] { - Value::Array(v) => self.is_valid(i) && self.value(i).equals_json_values(v), - Value::Null => self.is_null(i) || self.value_length(i).eq(&0), - _ => false, - }) - } -} - -impl PartialEq for MapArray { - fn eq(&self, json: &Value) -> bool { - match json { - Value::Array(json_array) => self.equals_json_values(json_array), - _ => false, - } - } -} - -impl PartialEq for Value { - fn eq(&self, arrow: &MapArray) -> bool { - match self { - Value::Array(json_array) => arrow.equals_json_values(json_array), - _ => false, - } - } -} - -impl JsonEqual for GenericBinaryArray { - fn equals_json(&self, json: &[&Value]) -> bool { - if self.len() != json.len() { - return false; - } - - (0..self.len()).all(|i| match json[i] { - JString(s) => { - // binary data is sometimes hex encoded, this checks if bytes are equal, - // and if not converting to hex is attempted - self.is_valid(i) - && (s.as_str().as_bytes() == self.value(i) - || Vec::from_hex(s.as_str()) == Ok(self.value(i).to_vec())) - } - JNull => self.is_null(i), - _ => false, - }) - } -} - -impl PartialEq for GenericBinaryArray { - fn eq(&self, json: &Value) -> bool { - match json { - Value::Array(json_array) => self.equals_json_values(json_array), - _ => false, - } - } -} - -impl PartialEq> for Value { - fn eq(&self, arrow: &GenericBinaryArray) -> bool { - match self { - Value::Array(json_array) => arrow.equals_json_values(json_array), - _ => false, - } - } -} - -impl JsonEqual for GenericStringArray { - fn equals_json(&self, json: &[&Value]) -> bool { - if self.len() != json.len() { - return false; - } - - (0..self.len()).all(|i| match json[i] { - JString(s) => self.is_valid(i) && s.as_str() == self.value(i), - JNull => self.is_null(i), - _ => false, - }) - } -} - -impl PartialEq for GenericStringArray { - fn eq(&self, json: &Value) -> bool { - match json { - Value::Array(json_array) => self.equals_json_values(json_array), - _ => false, - } - } -} - -impl PartialEq> for Value { - fn eq(&self, arrow: &GenericStringArray) -> bool { - match self { - Value::Array(json_array) => 
arrow.equals_json_values(json_array), - _ => false, - } - } -} - -impl JsonEqual for FixedSizeBinaryArray { - fn equals_json(&self, json: &[&Value]) -> bool { - if self.len() != json.len() { - return false; - } - - (0..self.len()).all(|i| match json[i] { - JString(s) => { - // binary data is sometimes hex encoded, this checks if bytes are equal, - // and if not converting to hex is attempted - self.is_valid(i) - && (s.as_str().as_bytes() == self.value(i) - || Vec::from_hex(s.as_str()) == Ok(self.value(i).to_vec())) - } - JNull => self.is_null(i), - _ => false, - }) - } -} - -impl PartialEq for FixedSizeBinaryArray { - fn eq(&self, json: &Value) -> bool { - match json { - Value::Array(json_array) => self.equals_json_values(json_array), - _ => false, - } - } -} - -impl PartialEq for Value { - fn eq(&self, arrow: &FixedSizeBinaryArray) -> bool { - match self { - Value::Array(json_array) => arrow.equals_json_values(json_array), - _ => false, - } - } -} - -impl JsonEqual for Decimal128Array { - fn equals_json(&self, json: &[&Value]) -> bool { - if self.len() != json.len() { - return false; - } - - (0..self.len()).all(|i| match json[i] { - JString(s) => { - self.is_valid(i) - && (s - .parse::() - .map_or_else(|_| false, |v| v == self.value(i).as_i128())) - } - JNull => self.is_null(i), - _ => false, - }) - } -} - -impl JsonEqual for Decimal256Array { - fn equals_json(&self, json: &[&Value]) -> bool { - if self.len() != json.len() { - return false; - } - - (0..self.len()).all(|i| match json[i] { - JString(s) => self.is_valid(i) && (s == &self.value(i).to_string()), - JNull => self.is_null(i), - _ => false, - }) - } -} - -impl PartialEq for Decimal128Array { - fn eq(&self, json: &Value) -> bool { - match json { - Value::Array(json_array) => self.equals_json_values(json_array), - _ => false, - } - } -} - -impl PartialEq for Value { - fn eq(&self, arrow: &Decimal128Array) -> bool { - match self { - Value::Array(json_array) => arrow.equals_json_values(json_array), - _ => false, - } - } -} - -impl JsonEqual for UnionArray { - fn equals_json(&self, _json: &[&Value]) -> bool { - unimplemented!( - "Added to allow UnionArray to implement the Array trait: see ARROW-8547" - ) - } -} - -impl JsonEqual for NullArray { - fn equals_json(&self, json: &[&Value]) -> bool { - if self.len() != json.len() { - return false; - } - - // all JSON values must be nulls - json.iter().all(|&v| v == &JNull) - } -} - -impl PartialEq for Value { - fn eq(&self, arrow: &NullArray) -> bool { - match self { - Value::Array(json_array) => arrow.equals_json_values(json_array), - _ => false, - } - } -} - -impl PartialEq for NullArray { - fn eq(&self, json: &Value) -> bool { - match json { - Value::Array(json_array) => self.equals_json_values(json_array), - _ => false, - } - } -} - -impl JsonEqual for ArrayRef { - fn equals_json(&self, json: &[&Value]) -> bool { - self.as_ref().equals_json(json) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - use crate::error::Result; - use std::{convert::TryFrom, sync::Arc}; - - fn create_list_array, T: AsRef<[Option]>>( - builder: &mut ListBuilder, - data: T, - ) -> Result { - for d in data.as_ref() { - if let Some(v) = d { - builder.values().append_slice(v.as_ref()); - builder.append(true); - } else { - builder.append(false); - } - } - Ok(builder.finish()) - } - - /// Create a fixed size list of 2 value lengths - fn create_fixed_size_list_array, T: AsRef<[Option]>>( - builder: &mut FixedSizeListBuilder, - data: T, - ) -> Result { - for d in data.as_ref() { - if let Some(v) = d { - 
builder.values().append_slice(v.as_ref()); - builder.append(true); - } else { - for _ in 0..builder.value_length() { - builder.values().append_null(); - } - builder.append(false); - } - } - Ok(builder.finish()) - } - - #[test] - fn test_primitive_json_equal() { - // Test equaled array - let arrow_array = Int32Array::from(vec![Some(1), None, Some(2), Some(3)]); - let json_array: Value = serde_json::from_str( - r#" - [ - 1, null, 2, 3 - ] - "#, - ) - .unwrap(); - assert!(arrow_array.eq(&json_array)); - assert!(json_array.eq(&arrow_array)); - - // Test unequaled array - let arrow_array = Int32Array::from(vec![Some(1), None, Some(2), Some(3)]); - let json_array: Value = serde_json::from_str( - r#" - [ - 1, 1, 2, 3 - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - - // Test unequal length case - let arrow_array = Int32Array::from(vec![Some(1), None, Some(2), Some(3)]); - let json_array: Value = serde_json::from_str( - r#" - [ - 1, 1 - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - - // Test not json array type case - let arrow_array = Int32Array::from(vec![Some(1), None, Some(2), Some(3)]); - let json_array: Value = serde_json::from_str( - r#" - { - "a": 1 - } - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - } - - #[test] - fn test_list_json_equal() { - // Test equal case - let arrow_array = create_list_array( - &mut ListBuilder::new(Int32Builder::new(10)), - &[Some(&[1, 2, 3]), None, Some(&[4, 5, 6])], - ) - .unwrap(); - let json_array: Value = serde_json::from_str( - r#" - [ - [1, 2, 3], - null, - [4, 5, 6] - ] - "#, - ) - .unwrap(); - assert!(arrow_array.eq(&json_array)); - assert!(json_array.eq(&arrow_array)); - - // Test unequal case - let arrow_array = create_list_array( - &mut ListBuilder::new(Int32Builder::new(10)), - &[Some(&[1, 2, 3]), None, Some(&[4, 5, 6])], - ) - .unwrap(); - let json_array: Value = serde_json::from_str( - r#" - [ - [1, 2, 3], - [7, 8], - [4, 5, 6] - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - - // Test incorrect type case - let arrow_array = create_list_array( - &mut ListBuilder::new(Int32Builder::new(10)), - &[Some(&[1, 2, 3]), None, Some(&[4, 5, 6])], - ) - .unwrap(); - let json_array: Value = serde_json::from_str( - r#" - { - "a": 1 - } - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - } - - #[test] - fn test_fixed_size_list_json_equal() { - // Test equal case - let arrow_array = create_fixed_size_list_array( - &mut FixedSizeListBuilder::new(Int32Builder::new(10), 3), - &[Some(&[1, 2, 3]), None, Some(&[4, 5, 6])], - ) - .unwrap(); - let json_array: Value = serde_json::from_str( - r#" - [ - [1, 2, 3], - null, - [4, 5, 6] - ] - "#, - ) - .unwrap(); - println!("{:?}", arrow_array); - println!("{:?}", json_array); - assert!(arrow_array.eq(&json_array)); - assert!(json_array.eq(&arrow_array)); - - // Test unequal case - let arrow_array = create_fixed_size_list_array( - &mut FixedSizeListBuilder::new(Int32Builder::new(10), 3), - &[Some(&[1, 2, 3]), None, Some(&[4, 5, 6])], - ) - .unwrap(); - let json_array: Value = serde_json::from_str( - r#" - [ - [1, 2, 3], - [7, 8, 9], - [4, 5, 6] - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - - // Test incorrect type case - let arrow_array = create_fixed_size_list_array( - &mut 
FixedSizeListBuilder::new(Int32Builder::new(10), 3), - &[Some(&[1, 2, 3]), None, Some(&[4, 5, 6])], - ) - .unwrap(); - let json_array: Value = serde_json::from_str( - r#" - { - "a": 1 - } - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - } - - #[test] - fn test_string_json_equal() { - // Test the equal case - let arrow_array = - StringArray::from(vec![Some("hello"), None, None, Some("world"), None, None]); - let json_array: Value = serde_json::from_str( - r#" - [ - "hello", - null, - null, - "world", - null, - null - ] - "#, - ) - .unwrap(); - assert!(arrow_array.eq(&json_array)); - assert!(json_array.eq(&arrow_array)); - - // Test unequal case - let arrow_array = - StringArray::from(vec![Some("hello"), None, None, Some("world"), None, None]); - let json_array: Value = serde_json::from_str( - r#" - [ - "hello", - null, - null, - "arrow", - null, - null - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - - // Test unequal length case - let arrow_array = - StringArray::from(vec![Some("hello"), None, None, Some("world"), None]); - let json_array: Value = serde_json::from_str( - r#" - [ - "hello", - null, - null, - "arrow", - null, - null - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - - // Test incorrect type case - let arrow_array = - StringArray::from(vec![Some("hello"), None, None, Some("world"), None]); - let json_array: Value = serde_json::from_str( - r#" - { - "a": 1 - } - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - - // Test incorrect value type case - let arrow_array = - StringArray::from(vec![Some("hello"), None, None, Some("world"), None]); - let json_array: Value = serde_json::from_str( - r#" - [ - "hello", - null, - null, - 1, - null, - null - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - } - - #[test] - fn test_binary_json_equal() { - // Test the equal case - let mut builder = BinaryBuilder::new(6); - builder.append_value(b"hello"); - builder.append_null(); - builder.append_null(); - builder.append_value(b"world"); - builder.append_null(); - builder.append_null(); - let arrow_array = builder.finish(); - let json_array: Value = serde_json::from_str( - r#" - [ - "hello", - null, - null, - "world", - null, - null - ] - "#, - ) - .unwrap(); - assert!(arrow_array.eq(&json_array)); - assert!(json_array.eq(&arrow_array)); - - // Test unequal case - let arrow_array = - StringArray::from(vec![Some("hello"), None, None, Some("world"), None, None]); - let json_array: Value = serde_json::from_str( - r#" - [ - "hello", - null, - null, - "arrow", - null, - null - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - - // Test unequal length case - let arrow_array = - StringArray::from(vec![Some("hello"), None, None, Some("world"), None]); - let json_array: Value = serde_json::from_str( - r#" - [ - "hello", - null, - null, - "arrow", - null, - null - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - - // Test incorrect type case - let arrow_array = - StringArray::from(vec![Some("hello"), None, None, Some("world"), None]); - let json_array: Value = serde_json::from_str( - r#" - { - "a": 1 - } - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - - // 
Test incorrect value type case - let arrow_array = - StringArray::from(vec![Some("hello"), None, None, Some("world"), None]); - let json_array: Value = serde_json::from_str( - r#" - [ - "hello", - null, - null, - 1, - null, - null - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - } - - #[test] - fn test_fixed_size_binary_json_equal() { - // Test the equal case - let mut builder = FixedSizeBinaryBuilder::new(15, 5); - builder.append_value(b"hello").unwrap(); - builder.append_null(); - builder.append_value(b"world").unwrap(); - let arrow_array: FixedSizeBinaryArray = builder.finish(); - let json_array: Value = serde_json::from_str( - r#" - [ - "hello", - null, - "world" - ] - "#, - ) - .unwrap(); - assert!(arrow_array.eq(&json_array)); - assert!(json_array.eq(&arrow_array)); - - // Test unequal case - builder.append_value(b"hello").unwrap(); - builder.append_null(); - builder.append_value(b"world").unwrap(); - let arrow_array: FixedSizeBinaryArray = builder.finish(); - let json_array: Value = serde_json::from_str( - r#" - [ - "hello", - null, - "arrow" - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - - // Test unequal length case - let json_array: Value = serde_json::from_str( - r#" - [ - "hello", - null, - null, - "world" - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - - // Test incorrect type case - let json_array: Value = serde_json::from_str( - r#" - { - "a": 1 - } - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - - // Test incorrect value type case - let json_array: Value = serde_json::from_str( - r#" - [ - "hello", - null, - 1 - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - } - - #[test] - fn test_decimal_json_equal() { - // Test the equal case - let arrow_array = [Some(1_000), None, Some(-250)] - .iter() - .collect::() - .with_precision_and_scale(23, 6) - .unwrap(); - let json_array: Value = serde_json::from_str( - r#" - [ - "1000", - null, - "-250" - ] - "#, - ) - .unwrap(); - assert!(arrow_array.eq(&json_array)); - assert!(json_array.eq(&arrow_array)); - - // Test unequal case - let arrow_array = [Some(1_000), None, Some(55)] - .iter() - .collect::() - .with_precision_and_scale(23, 6) - .unwrap(); - let json_array: Value = serde_json::from_str( - r#" - [ - "1000", - null, - "-250" - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - - // Test unequal length case - let json_array: Value = serde_json::from_str( - r#" - [ - "1000", - null, - null, - "55" - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - - // Test incorrect type case - let json_array: Value = serde_json::from_str( - r#" - { - "a": 1 - } - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - - // Test incorrect value type case - let json_array: Value = serde_json::from_str( - r#" - [ - "hello", - null, - 1 - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - } - - #[test] - fn test_struct_json_equal() { - let strings: ArrayRef = Arc::new(StringArray::from(vec![ - Some("joe"), - None, - None, - Some("mark"), - Some("doe"), - ])); - let ints: ArrayRef = Arc::new(Int32Array::from(vec![ - Some(1), - Some(2), - None, - 
Some(4), - Some(5), - ])); - - let arrow_array = - StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]) - .unwrap(); - - let json_array: Value = serde_json::from_str( - r#" - [ - { - "f1": "joe", - "f2": 1 - }, - { - "f2": 2 - }, - null, - { - "f1": "mark", - "f2": 4 - }, - { - "f1": "doe", - "f2": 5 - } - ] - "#, - ) - .unwrap(); - assert!(arrow_array.eq(&json_array)); - assert!(json_array.eq(&arrow_array)); - - // Test unequal length case - let json_array: Value = serde_json::from_str( - r#" - [ - { - "f1": "joe", - "f2": 1 - }, - { - "f2": 2 - }, - null, - { - "f1": "mark", - "f2": 4 - } - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - - // Test incorrect type case - let json_array: Value = serde_json::from_str( - r#" - { - "f1": "joe", - "f2": 1 - } - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - - // Test not all object case - let json_array: Value = serde_json::from_str( - r#" - [ - { - "f1": "joe", - "f2": 1 - }, - 2, - null, - { - "f1": "mark", - "f2": 4 - } - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - } - - #[test] - fn test_null_json_equal() { - // Test equaled array - let arrow_array = NullArray::new(4); - let json_array: Value = serde_json::from_str( - r#" - [ - null, null, null, null - ] - "#, - ) - .unwrap(); - assert!(arrow_array.eq(&json_array)); - assert!(json_array.eq(&arrow_array)); - - // Test unequaled array - let arrow_array = NullArray::new(2); - let json_array: Value = serde_json::from_str( - r#" - [ - null, null, null - ] - "#, - ) - .unwrap(); - assert!(arrow_array.ne(&json_array)); - assert!(json_array.ne(&arrow_array)); - } -} diff --git a/arrow/src/array/ffi.rs b/arrow/src/array/ffi.rs index 12d6f440b78d..72030f900a4e 100644 --- a/arrow/src/array/ffi.rs +++ b/arrow/src/array/ffi.rs @@ -25,7 +25,7 @@ use crate::{ ffi::ArrowArrayRef, }; -use super::ArrayData; +use super::{make_array, ArrayData, ArrayRef}; impl TryFrom for ArrayData { type Error = ArrowError; @@ -39,10 +39,46 @@ impl TryFrom for ffi::ArrowArray { type Error = ArrowError; fn try_from(value: ArrayData) -> Result { - unsafe { ffi::ArrowArray::try_new(value) } + ffi::ArrowArray::try_new(value) } } +/// Creates a new array from two FFI pointers. Used to import arrays from the C Data Interface +/// # Safety +/// Assumes that these pointers represent valid C Data Interfaces, both in memory +/// representation and lifetime via the `release` mechanism. +pub unsafe fn make_array_from_raw( + array: *const ffi::FFI_ArrowArray, + schema: *const ffi::FFI_ArrowSchema, +) -> Result { + let array = ffi::ArrowArray::try_from_raw(array, schema)?; + let data = ArrayData::try_from(array)?; + Ok(make_array(data)) +} + +/// Exports an array to raw pointers of the C Data Interface provided by the consumer. +/// # Safety +/// Assumes that these pointers represent valid C Data Interfaces, both in memory +/// representation and lifetime via the `release` mechanism. +/// +/// This function copies the content of two FFI structs [ffi::FFI_ArrowArray] and +/// [ffi::FFI_ArrowSchema] in the array to the location pointed by the raw pointers. +/// Usually the raw pointers are provided by the array data consumer. 
+pub unsafe fn export_array_into_raw( + src: ArrayRef, + out_array: *mut ffi::FFI_ArrowArray, + out_schema: *mut ffi::FFI_ArrowSchema, +) -> Result<()> { + let data = src.data(); + let array = ffi::FFI_ArrowArray::new(data); + let schema = ffi::FFI_ArrowSchema::try_from(data.data_type())?; + + std::ptr::write_unaligned(out_array, array); + std::ptr::write_unaligned(out_schema, schema); + + Ok(()) +} + #[cfg(test)] mod tests { use crate::array::{DictionaryArray, FixedSizeListArray, Int32Array, StringArray}; diff --git a/arrow/src/array/iterator.rs b/arrow/src/array/iterator.rs index 8ee9f25447d3..7cc9bde6b4c5 100644 --- a/arrow/src/array/iterator.rs +++ b/arrow/src/array/iterator.rs @@ -16,7 +16,7 @@ // under the License. use crate::array::array::ArrayAccessor; -use crate::array::{BasicDecimalArray, Decimal256Array}; +use crate::array::BasicDecimalArray; use super::{ Array, BooleanArray, Decimal128Array, GenericBinaryArray, GenericListArray, @@ -104,14 +104,15 @@ pub type GenericStringIter<'a, T> = ArrayIter<&'a GenericStringArray>; pub type GenericBinaryIter<'a, T> = ArrayIter<&'a GenericBinaryArray>; pub type GenericListArrayIter<'a, O> = ArrayIter<&'a GenericListArray>; +pub type BasicDecimalIter<'a, const BYTE_WIDTH: usize> = + ArrayIter<&'a BasicDecimalArray>; /// an iterator that returns `Some(Decimal128)` or `None`, that can be used on a /// [`Decimal128Array`] -pub type Decimal128Iter<'a> = ArrayIter<&'a Decimal128Array>; +pub type Decimal128Iter<'a> = BasicDecimalIter<'a, 16>; /// an iterator that returns `Some(Decimal256)` or `None`, that can be used on a -/// [`Decimal256Array`] -pub type Decimal256Iter<'a> = ArrayIter<&'a Decimal256Array>; - +/// [`super::Decimal256Array`] +pub type Decimal256Iter<'a> = BasicDecimalIter<'a, 32>; /// an iterator that returns `Some(i128)` or `None`, that can be used on a /// [`Decimal128Array`] #[derive(Debug)] diff --git a/arrow/src/array/mod.rs b/arrow/src/array/mod.rs index 8acc33c7b879..4a7667741597 100644 --- a/arrow/src/array/mod.rs +++ b/arrow/src/array/mod.rs @@ -163,6 +163,8 @@ mod array_binary; mod array_boolean; mod array_decimal; mod array_dictionary; +mod array_fixed_size_binary; +mod array_fixed_size_list; mod array_list; mod array_map; mod array_primitive; @@ -173,7 +175,7 @@ mod builder; mod cast; mod data; mod equal; -mod equal_json; +#[cfg(feature = "ffi")] mod ffi; mod iterator; mod null; @@ -195,18 +197,18 @@ pub use self::data::ArrayDataRef; pub(crate) use self::data::BufferSpec; pub use self::array_binary::BinaryArray; -pub use self::array_binary::FixedSizeBinaryArray; pub use self::array_binary::LargeBinaryArray; pub use self::array_boolean::BooleanArray; pub use self::array_decimal::BasicDecimalArray; pub use self::array_decimal::Decimal128Array; pub use self::array_decimal::Decimal256Array; +pub use self::array_fixed_size_binary::FixedSizeBinaryArray; +pub use self::array_fixed_size_list::FixedSizeListArray; #[deprecated(note = "Please use `Decimal128Array` instead")] pub type DecimalArray = Decimal128Array; -pub use self::array_dictionary::DictionaryArray; -pub use self::array_list::FixedSizeListArray; +pub use self::array_dictionary::{DictionaryArray, TypedDictionaryArray}; pub use self::array_list::LargeListArray; pub use self::array_list::ListArray; pub use self::array_map::MapArray; @@ -594,10 +596,6 @@ pub use self::transform::{Capacities, MutableArrayData}; pub use self::iterator::*; -// --------------------- Array Equality --------------------- - -pub use self::equal_json::JsonEqual; - // --------------------- 
Array's values comparison --------------------- pub use self::ord::{build_compare, DynComparator}; @@ -613,7 +611,8 @@ pub use self::cast::{ // ------------------------------ C Data Interface --------------------------- -pub use self::array::{export_array_into_raw, make_array_from_raw}; +#[cfg(feature = "ffi")] +pub use self::ffi::{export_array_into_raw, make_array_from_raw}; #[cfg(test)] mod tests { diff --git a/arrow/src/array/ord.rs b/arrow/src/array/ord.rs index 888c31c5d955..47173aa7d927 100644 --- a/arrow/src/array/ord.rs +++ b/arrow/src/array/ord.rs @@ -19,7 +19,6 @@ use std::cmp::Ordering; -use crate::array::BasicDecimalArray; use crate::array::*; use crate::datatypes::TimeUnit; use crate::datatypes::*; @@ -226,7 +225,7 @@ pub fn build_compare(left: &dyn Array, right: &dyn Array) -> Result { + (Decimal128(_, _), Decimal128(_, _)) => { let left: Decimal128Array = Decimal128Array::from(left.data().clone()); let right: Decimal128Array = Decimal128Array::from(right.data().clone()); Box::new(move |i, j| left.value(i).cmp(&right.value(j))) diff --git a/arrow/src/array/transform/mod.rs b/arrow/src/array/transform/mod.rs index 570be29ed336..f0fccef14fd7 100644 --- a/arrow/src/array/transform/mod.rs +++ b/arrow/src/array/transform/mod.rs @@ -205,7 +205,7 @@ fn build_extend_dictionary( fn build_extend(array: &ArrayData) -> Extend { use crate::datatypes::*; match array.data_type() { - DataType::Decimal(_, _) => primitive::build_extend::(array), + DataType::Decimal128(_, _) => primitive::build_extend::(array), DataType::Null => null::build_extend(array), DataType::Boolean => boolean::build_extend(array), DataType::UInt8 => primitive::build_extend::(array), @@ -256,7 +256,7 @@ fn build_extend(array: &ArrayData) -> Extend { fn build_extend_nulls(data_type: &DataType) -> ExtendNulls { use crate::datatypes::*; Box::new(match data_type { - DataType::Decimal(_, _) => primitive::extend_nulls::, + DataType::Decimal128(_, _) => primitive::extend_nulls::, DataType::Null => null::extend_nulls, DataType::Boolean => boolean::extend_nulls, DataType::UInt8 => primitive::extend_nulls::, @@ -313,11 +313,7 @@ fn preallocate_offset_and_binary_buffer( // offsets let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::()); // safety: `unsafe` code assumes that this buffer is initialized with one element - if Offset::IS_LARGE { - buffer.push(0i64); - } else { - buffer.push(0i32) - } + buffer.push(Offset::zero()); [ buffer, @@ -410,7 +406,7 @@ impl<'a> MutableArrayData<'a> { }; let child_data = match &data_type { - DataType::Decimal(_, _) + DataType::Decimal128(_, _) | DataType::Decimal256(_, _) | DataType::Null | DataType::Boolean @@ -674,7 +670,6 @@ mod tests { use std::{convert::TryFrom, sync::Arc}; use super::*; - use crate::array::Decimal128Array; use crate::{ array::{ @@ -708,7 +703,7 @@ mod tests { fn test_decimal() { let decimal_array = create_decimal_array(&[Some(1), Some(2), None, Some(3)], 10, 3); - let arrays = vec![decimal_array.data()]; + let arrays = vec![Array::data(&decimal_array)]; let mut a = MutableArrayData::new(arrays, true, 3); a.extend(0, 0, 3); a.extend(0, 2, 3); diff --git a/arrow/src/buffer/immutable.rs b/arrow/src/buffer/immutable.rs index cb686bd8441c..8ec5a4554208 100644 --- a/arrow/src/buffer/immutable.rs +++ b/arrow/src/buffer/immutable.rs @@ -22,7 +22,6 @@ use std::sync::Arc; use std::{convert::AsRef, usize}; use crate::alloc::{Allocation, Deallocation}; -use crate::ffi::FFI_ArrowArray; use crate::util::bit_chunk_iterator::{BitChunks, UnalignedBitChunk}; use 
crate::{bytes::Bytes, datatypes::ArrowNativeType}; @@ -77,30 +76,6 @@ impl Buffer { Buffer::build_with_arguments(ptr, len, Deallocation::Arrow(capacity)) } - /// Creates a buffer from an existing memory region (must already be byte-aligned), this - /// `Buffer` **does not** free this piece of memory when dropped. - /// - /// # Arguments - /// - /// * `ptr` - Pointer to raw parts - /// * `len` - Length of raw parts in **bytes** - /// * `data` - An [crate::ffi::FFI_ArrowArray] with the data - /// - /// # Safety - /// - /// This function is unsafe as there is no guarantee that the given pointer is valid for `len` - /// bytes and that the foreign deallocator frees the region. - #[deprecated( - note = "use from_custom_allocation instead which makes it clearer that the allocation is in fact owned" - )] - pub unsafe fn from_unowned( - ptr: NonNull, - len: usize, - data: Arc, - ) -> Self { - Self::from_custom_allocation(ptr, len, data) - } - /// Creates a buffer from an existing memory region. Ownership of the memory is tracked via reference counting /// and the memory will be freed using the `drop` method of [crate::alloc::Allocation] when the reference count reaches zero. /// diff --git a/arrow/src/compute/README.md b/arrow/src/compute/README.md index 761713a531b4..a5d15a83046f 100644 --- a/arrow/src/compute/README.md +++ b/arrow/src/compute/README.md @@ -33,16 +33,16 @@ We use the term "kernel" to refer to particular general operation that contains Types of functions -* Scalar functions: elementwise functions that perform scalar operations in a +- Scalar functions: elementwise functions that perform scalar operations in a vectorized manner. These functions are generally valid for SQL-like context. These are called "scalar" in that the functions executed consider each value in an array independently, and the output array or arrays have the same length as the input arrays. The result for each array cell is generally independent of its position in the array. -* Vector functions, which produce a result whose output is generally dependent +- Vector functions, which produce a result whose output is generally dependent on the entire contents of the input arrays. These functions **are generally not valid** for SQL-like processing because the output size may be different than the input size, and the result may change based on the order of the values in the array. This includes things like array subselection, sorting, hashing, and more. -* Scalar aggregate functions of which can be used in a SQL-like context \ No newline at end of file +- Scalar aggregate functions, which can be used in a SQL-like context diff --git a/arrow/src/compute/kernels/cast.rs b/arrow/src/compute/kernels/cast.rs index 25aa525b4520..ddca0c2e9351 100644 --- a/arrow/src/compute/kernels/cast.rs +++ b/arrow/src/compute/kernels/cast.rs @@ -35,6 +35,8 @@ //! assert_eq!(7.0, c.value(2)); //!
``` +use chrono::Timelike; +use std::ops::{Div, Mul}; use std::str; use std::sync::Arc; @@ -45,10 +47,14 @@ use crate::compute::kernels::arity::unary; use crate::compute::kernels::cast_utils::string_to_timestamp_nanos; use crate::datatypes::*; use crate::error::{ArrowError, Result}; +use crate::temporal_conversions::{ + EPOCH_DAYS_FROM_CE, MICROSECONDS, MILLISECONDS, MILLISECONDS_IN_DAY, NANOSECONDS, + SECONDS_IN_DAY, +}; use crate::{array::*, compute::take}; use crate::{buffer::Buffer, util::serialization::lexical_to_string}; use num::cast::AsPrimitive; -use num::{NumCast, ToPrimitive}; +use num::{BigInt, NumCast, ToPrimitive}; /// CastOptions provides a way to override the default cast behaviors #[derive(Debug)] @@ -72,11 +78,14 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { match (from_type, to_type) { // TODO UTF8/unsigned numeric to decimal // cast one decimal type to another decimal type - (Decimal(_, _), Decimal(_, _)) => true, + (Decimal128(_, _), Decimal128(_, _)) => true, + (Decimal256(_, _), Decimal256(_, _)) => true, + (Decimal128(_, _), Decimal256(_, _)) => true, + (Decimal256(_, _), Decimal128(_, _)) => true, // signed numeric to decimal - (Null | Int8 | Int16 | Int32 | Int64 | Float32 | Float64, Decimal(_, _)) | + (Null | Int8 | Int16 | Int32 | Int64 | Float32 | Float64, Decimal128(_, _)) | // decimal to signed numeric - (Decimal(_, _), Null | Int8 | Int16 | Int32 | Int64 | Float32 | Float64) + (Decimal128(_, _), Null | Int8 | Int16 | Int32 | Int64 | Float32 | Float64) | ( Null, Boolean @@ -109,8 +118,8 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { | Map(_, _) | Dictionary(_, _) ) => true, - (Decimal(_, _), _) => false, - (_, Decimal(_, _)) => false, + (Decimal128(_, _), _) => false, + (_, Decimal128(_, _)) => false, (Struct(_), _) => false, (_, Struct(_)) => false, (LargeList(list_from), LargeList(list_to)) => { @@ -136,9 +145,25 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (Utf8, LargeUtf8) => true, (LargeUtf8, Utf8) => true, - (Utf8, Date32 | Date64 | Timestamp(TimeUnit::Nanosecond, None)) => true, + (Utf8, + Date32 + | Date64 + | Time32(TimeUnit::Second) + | Time32(TimeUnit::Millisecond) + | Time64(TimeUnit::Microsecond) + | Time64(TimeUnit::Nanosecond) + | Timestamp(TimeUnit::Nanosecond, None) + ) => true, (Utf8, _) => DataType::is_numeric(to_type), - (LargeUtf8, Date32 | Date64 | Timestamp(TimeUnit::Nanosecond, None)) => true, + (LargeUtf8, + Date32 + | Date64 + | Time32(TimeUnit::Second) + | Time32(TimeUnit::Millisecond) + | Time64(TimeUnit::Microsecond) + | Time64(TimeUnit::Nanosecond) + | Timestamp(TimeUnit::Nanosecond, None) + ) => true, (LargeUtf8, _) => DataType::is_numeric(to_type), (Timestamp(_, _), Utf8) | (Timestamp(_, _), LargeUtf8) => true, (Date32, Utf8) | (Date32, LargeUtf8) => true, @@ -410,8 +435,19 @@ pub fn cast_with_options( return Ok(array.clone()); } match (from_type, to_type) { - (Decimal(_, s1), Decimal(p2, s2)) => cast_decimal_to_decimal(array, s1, p2, s2), - (Decimal(_, scale), _) => { + (Decimal128(_, s1), Decimal128(p2, s2)) => { + cast_decimal_to_decimal::<16, 16>(array, s1, p2, s2) + } + (Decimal256(_, s1), Decimal256(p2, s2)) => { + cast_decimal_to_decimal::<32, 32>(array, s1, p2, s2) + } + (Decimal128(_, s1), Decimal256(p2, s2)) => { + cast_decimal_to_decimal::<16, 32>(array, s1, p2, s2) + } + (Decimal256(_, s1), Decimal128(p2, s2)) => { + cast_decimal_to_decimal::<32, 16>(array, s1, p2, s2) + } + (Decimal128(_, scale), _) => { // cast decimal to 
other type match to_type { Int8 => { @@ -439,7 +475,7 @@ pub fn cast_with_options( ))), } } - (_, Decimal(precision, scale)) => { + (_, Decimal128(precision, scale)) => { // cast data to decimal match from_type { // TODO now just support signed numeric to decimal, support decimal to numeric later @@ -657,6 +693,18 @@ pub fn cast_with_options( Float64 => cast_string_to_numeric::(array, cast_options), Date32 => cast_string_to_date32::(&**array, cast_options), Date64 => cast_string_to_date64::(&**array, cast_options), + Time32(TimeUnit::Second) => { + cast_string_to_time32second::(&**array, cast_options) + } + Time32(TimeUnit::Millisecond) => { + cast_string_to_time32millisecond::(&**array, cast_options) + } + Time64(TimeUnit::Microsecond) => { + cast_string_to_time64microsecond::(&**array, cast_options) + } + Time64(TimeUnit::Nanosecond) => { + cast_string_to_time64nanosecond::(&**array, cast_options) + } Timestamp(TimeUnit::Nanosecond, None) => { cast_string_to_timestamp_ns::(&**array, cast_options) } @@ -791,6 +839,18 @@ pub fn cast_with_options( Float64 => cast_string_to_numeric::(array, cast_options), Date32 => cast_string_to_date32::(&**array, cast_options), Date64 => cast_string_to_date64::(&**array, cast_options), + Time32(TimeUnit::Second) => { + cast_string_to_time32second::(&**array, cast_options) + } + Time32(TimeUnit::Millisecond) => { + cast_string_to_time32millisecond::(&**array, cast_options) + } + Time64(TimeUnit::Microsecond) => { + cast_string_to_time64microsecond::(&**array, cast_options) + } + Time64(TimeUnit::Nanosecond) => { + cast_string_to_time64nanosecond::(&**array, cast_options) + } Timestamp(TimeUnit::Nanosecond, None) => { cast_string_to_timestamp_ns::(&**array, cast_options) } @@ -1204,48 +1264,124 @@ const fn time_unit_multiple(unit: &TimeUnit) -> i64 { } } -/// Number of seconds in a day -const SECONDS_IN_DAY: i64 = 86_400; -/// Number of milliseconds in a second -const MILLISECONDS: i64 = 1_000; -/// Number of microseconds in a second -const MICROSECONDS: i64 = 1_000_000; -/// Number of nanoseconds in a second -const NANOSECONDS: i64 = 1_000_000_000; -/// Number of milliseconds in a day -const MILLISECONDS_IN_DAY: i64 = SECONDS_IN_DAY * MILLISECONDS; -/// Number of days between 0001-01-01 and 1970-01-01 -const EPOCH_DAYS_FROM_CE: i32 = 719_163; - /// Cast one type of decimal array to another type of decimal array -fn cast_decimal_to_decimal( +fn cast_decimal_to_decimal( array: &ArrayRef, input_scale: &usize, output_precision: &usize, output_scale: &usize, ) -> Result { - let array = array.as_any().downcast_ref::().unwrap(); - - let output_array = if input_scale > output_scale { + if input_scale > output_scale { // For example, input_scale is 4 and output_scale is 3; // Original value is 11234_i128, and will be cast to 1123_i128. 
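// In general, reducing the scale divides the value by 10^(input_scale - output_scale),
// truncating toward zero: 11234 (1.1234) / 10^1 = 1123 (1.123).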
let div = 10_i128.pow((input_scale - output_scale) as u32); - array - .iter() - .map(|v| v.map(|v| v.as_i128() / div)) - .collect::() + if BYTE_WIDTH1 == 16 { + let array = array.as_any().downcast_ref::().unwrap(); + let iter = array.iter().map(|v| v.map(|v| v.as_i128() / div)); + if BYTE_WIDTH2 == 16 { + let output_array = iter + .collect::() + .with_precision_and_scale(*output_precision, *output_scale)?; + + Ok(Arc::new(output_array)) + } else { + let output_array = iter + .map(|v| v.map(BigInt::from)) + .collect::() + .with_precision_and_scale(*output_precision, *output_scale)?; + + Ok(Arc::new(output_array)) + } + } else { + let array = array.as_any().downcast_ref::().unwrap(); + let iter = array.iter().map(|v| v.map(|v| v.to_big_int().div(div))); + if BYTE_WIDTH2 == 16 { + let values = iter + .map(|v| { + if v.is_none() { + Ok(None) + } else { + v.as_ref().and_then(|v| v.to_i128()) + .ok_or_else(|| { + ArrowError::InvalidArgumentError( + format!("{:?} cannot be cast to 128-bit integer for Decimal128", v), + ) + }) + .map(Some) + } + }) + .collect::>>()?; + + let output_array = values + .into_iter() + .collect::() + .with_precision_and_scale(*output_precision, *output_scale)?; + + Ok(Arc::new(output_array)) + } else { + let output_array = iter + .collect::() + .with_precision_and_scale(*output_precision, *output_scale)?; + + Ok(Arc::new(output_array)) + } + } } else { // For example, input_scale is 3 and output_scale is 4; // Original value is 1123_i128, and will be cast to 11230_i128. let mul = 10_i128.pow((output_scale - input_scale) as u32); - array - .iter() - .map(|v| v.map(|v| v.as_i128() * mul)) - .collect::() - } - .with_precision_and_scale(*output_precision, *output_scale)?; + if BYTE_WIDTH1 == 16 { + let array = array.as_any().downcast_ref::().unwrap(); + let iter = array.iter().map(|v| v.map(|v| v.as_i128() * mul)); + if BYTE_WIDTH2 == 16 { + let output_array = iter + .collect::() + .with_precision_and_scale(*output_precision, *output_scale)?; + + Ok(Arc::new(output_array)) + } else { + let output_array = iter + .map(|v| v.map(BigInt::from)) + .collect::() + .with_precision_and_scale(*output_precision, *output_scale)?; - Ok(Arc::new(output_array)) + Ok(Arc::new(output_array)) + } + } else { + let array = array.as_any().downcast_ref::().unwrap(); + let iter = array.iter().map(|v| v.map(|v| v.to_big_int().mul(mul))); + if BYTE_WIDTH2 == 16 { + let values = iter + .map(|v| { + if v.is_none() { + Ok(None) + } else { + v.as_ref().and_then(|v| v.to_i128()) + .ok_or_else(|| { + ArrowError::InvalidArgumentError( + format!("{:?} cannot be cast to 128-bit integer for Decimal128", v), + ) + }) + .map(Some) + } + }) + .collect::>>()?; + + let output_array = values + .into_iter() + .collect::() + .with_precision_and_scale(*output_precision, *output_scale)?; + + Ok(Arc::new(output_array)) + } else { + let output_array = iter + .collect::() + .with_precision_and_scale(*output_precision, *output_scale)?; + + Ok(Arc::new(output_array)) + } + } + } } /// Cast an array by changing its array_data type to the desired type @@ -1420,35 +1556,28 @@ where ::Native: lexical_core::FromLexical, { if cast_options.safe { - let iter = (0..from.len()).map(|i| { - if from.is_null(i) { - None - } else { - lexical_core::parse(from.value(i).as_bytes()).ok() - } - }); + let iter = from + .iter() + .map(|v| v.and_then(|v| lexical_core::parse(v.as_bytes()).ok())); // Benefit: // 20% performance improvement // Soundness: // The iterator is trustedLen because it comes from an `StringArray`.
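// (`from_trusted_len_iter` relies on that exact length to allocate the value
// and validity buffers once, without per-element capacity checks.)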
Ok(unsafe { PrimitiveArray::::from_trusted_len_iter(iter) }) } else { - let vec = (0..from.len()) - .map(|i| { - if from.is_null(i) { - Ok(None) - } else { - let string = from.value(i); - let result = lexical_core::parse(string.as_bytes()); - Some(result.map_err(|_| { + let vec = from + .iter() + .map(|v| { + v.map(|v| { + lexical_core::parse(v.as_bytes()).map_err(|_| { ArrowError::CastError(format!( - "Cannot cast string '{}' to value of {} type", - string, - std::any::type_name::() + "Cannot cast string '{}' to value of {:?} type", + v, + T::DATA_TYPE, )) - })) - .transpose() - } + }) + }) + .transpose() }) .collect::>>()?; // Benefit: @@ -1471,16 +1600,12 @@ fn cast_string_to_date32( .unwrap(); let array = if cast_options.safe { - let iter = (0..string_array.len()).map(|i| { - if string_array.is_null(i) { - None - } else { - string_array - .value(i) - .parse::() + let iter = string_array.iter().map(|v| { + v.and_then(|v| { + v.parse::() .map(|date| date.num_days_from_ce() - EPOCH_DAYS_FROM_CE) .ok() - } + }) }); // Benefit: @@ -1489,25 +1614,21 @@ fn cast_string_to_date32( // The iterator is trustedLen because it comes from an `StringArray`. unsafe { Date32Array::from_trusted_len_iter(iter) } } else { - let vec = (0..string_array.len()) - .map(|i| { - if string_array.is_null(i) { - Ok(None) - } else { - let string = string_array - .value(i); - - let result = string - .parse::() - .map(|date| date.num_days_from_ce() - EPOCH_DAYS_FROM_CE); - - Some(result.map_err(|_| { - ArrowError::CastError( - format!("Cannot cast string '{}' to value of arrow::datatypes::types::Date32Type type", string), - ) - })) - .transpose() - } + let vec = string_array + .iter() + .map(|v| { + v.map(|v| { + v.parse::() + .map(|date| date.num_days_from_ce() - EPOCH_DAYS_FROM_CE) + .map_err(|_| { + ArrowError::CastError(format!( + "Cannot cast string '{}' to value of {:?} type", + v, + DataType::Date32 + )) + }) + }) + .transpose() }) .collect::>>>()?; @@ -1532,16 +1653,12 @@ fn cast_string_to_date64( .unwrap(); let array = if cast_options.safe { - let iter = (0..string_array.len()).map(|i| { - if string_array.is_null(i) { - None - } else { - string_array - .value(i) - .parse::() + let iter = string_array.iter().map(|v| { + v.and_then(|v| { + v.parse::() .map(|datetime| datetime.timestamp_millis()) .ok() - } + }) }); // Benefit: @@ -1550,25 +1667,21 @@ fn cast_string_to_date64( // The iterator is trustedLen because it comes from an `StringArray`. unsafe { Date64Array::from_trusted_len_iter(iter) } } else { - let vec = (0..string_array.len()) - .map(|i| { - if string_array.is_null(i) { - Ok(None) - } else { - let string = string_array - .value(i); - - let result = string - .parse::() - .map(|datetime| datetime.timestamp_millis()); - - Some(result.map_err(|_| { - ArrowError::CastError( - format!("Cannot cast string '{}' to value of arrow::datatypes::types::Date64Type type", string), - ) - })) - .transpose() - } + let vec = string_array + .iter() + .map(|v| { + v.map(|v| { + v.parse::() + .map(|datetime| datetime.timestamp_millis()) + .map_err(|_| { + ArrowError::CastError(format!( + "Cannot cast string '{}' to value of {:?} type", + v, + DataType::Date64 + )) + }) + }) + .transpose() }) .collect::>>>()?; @@ -1582,6 +1695,262 @@ fn cast_string_to_date64( Ok(Arc::new(array) as ArrayRef) } +/// Casts generic string arrays to `Time32SecondArray` +fn cast_string_to_time32second( + array: &dyn Array, + cast_options: &CastOptions, +) -> Result { + /// The number of nanoseconds per second.
+ const NANOS_PER_SEC: u32 = 1_000_000_000; + + let string_array = array + .as_any() + .downcast_ref::>() + .unwrap(); + + let array = if cast_options.safe { + let iter = string_array.iter().map(|v| { + v.and_then(|v| { + v.parse::() + .map(|time| { + (time.num_seconds_from_midnight() + + time.nanosecond() / NANOS_PER_SEC) + as i32 + }) + .ok() + }) + }); + + // Benefit: + // 20% performance improvement + // Soundness: + // The iterator is trustedLen because it comes from an `StringArray`. + unsafe { Time32SecondArray::from_trusted_len_iter(iter) } + } else { + let vec = string_array + .iter() + .map(|v| { + v.map(|v| { + v.parse::() + .map(|time| { + (time.num_seconds_from_midnight() + + time.nanosecond() / NANOS_PER_SEC) + as i32 + }) + .map_err(|_| { + ArrowError::CastError(format!( + "Cannot cast string '{}' to value of {:?} type", + v, + DataType::Time32(TimeUnit::Second) + )) + }) + }) + .transpose() + }) + .collect::>>>()?; + + // Benefit: + // 20% performance improvement + // Soundness: + // The iterator is trustedLen because it comes from an `StringArray`. + unsafe { Time32SecondArray::from_trusted_len_iter(vec.iter()) } + }; + + Ok(Arc::new(array) as ArrayRef) +} + +/// Casts generic string arrays to `Time32MillisecondArray` +fn cast_string_to_time32millisecond( + array: &dyn Array, + cast_options: &CastOptions, +) -> Result { + /// The number of nanoseconds per millisecond. + const NANOS_PER_MILLI: u32 = 1_000_000; + /// The number of milliseconds per second. + const MILLIS_PER_SEC: u32 = 1_000; + + let string_array = array + .as_any() + .downcast_ref::>() + .unwrap(); + + let array = if cast_options.safe { + let iter = string_array.iter().map(|v| { + v.and_then(|v| { + v.parse::() + .map(|time| { + (time.num_seconds_from_midnight() * MILLIS_PER_SEC + + time.nanosecond() / NANOS_PER_MILLI) + as i32 + }) + .ok() + }) + }); + + // Benefit: + // 20% performance improvement + // Soundness: + // The iterator is trustedLen because it comes from an `StringArray`. + unsafe { Time32MillisecondArray::from_trusted_len_iter(iter) } + } else { + let vec = string_array + .iter() + .map(|v| { + v.map(|v| { + v.parse::() + .map(|time| { + (time.num_seconds_from_midnight() * MILLIS_PER_SEC + + time.nanosecond() / NANOS_PER_MILLI) + as i32 + }) + .map_err(|_| { + ArrowError::CastError(format!( + "Cannot cast string '{}' to value of {:?} type", + v, + DataType::Time32(TimeUnit::Millisecond) + )) + }) + }) + .transpose() + }) + .collect::>>>()?; + + // Benefit: + // 20% performance improvement + // Soundness: + // The iterator is trustedLen because it comes from an `StringArray`. + unsafe { Time32MillisecondArray::from_trusted_len_iter(vec.iter()) } + }; + + Ok(Arc::new(array) as ArrayRef) +} + +/// Casts generic string arrays to `Time64MicrosecondArray` +fn cast_string_to_time64microsecond( + array: &dyn Array, + cast_options: &CastOptions, +) -> Result { + /// The number of nanoseconds per microsecond. + const NANOS_PER_MICRO: i64 = 1_000; + /// The number of microseconds per second. + const MICROS_PER_SEC: i64 = 1_000_000; + + let string_array = array + .as_any() + .downcast_ref::>() + .unwrap(); + + let array = if cast_options.safe { + let iter = string_array.iter().map(|v| { + v.and_then(|v| { + v.parse::() + .map(|time| { + time.num_seconds_from_midnight() as i64 * MICROS_PER_SEC + + time.nanosecond() as i64 / NANOS_PER_MICRO + }) + .ok() + }) + }); + + // Benefit: + // 20% performance improvement + // Soundness: + // The iterator is trustedLen because it comes from an `StringArray`. 
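+ // For example "08:08:35.091323414" maps to
+ // 29_315 s * 1_000_000 + 91_323 us = 29_315_091_323 us.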
+ unsafe { Time64MicrosecondArray::from_trusted_len_iter(iter) } + } else { + let vec = string_array + .iter() + .map(|v| { + v.map(|v| { + v.parse::() + .map(|time| { + time.num_seconds_from_midnight() as i64 * MICROS_PER_SEC + + time.nanosecond() as i64 / NANOS_PER_MICRO + }) + .map_err(|_| { + ArrowError::CastError(format!( + "Cannot cast string '{}' to value of {:?} type", + v, + DataType::Time64(TimeUnit::Microsecond) + )) + }) + }) + .transpose() + }) + .collect::>>>()?; + + // Benefit: + // 20% performance improvement + // Soundness: + // The iterator is trustedLen because it comes from an `StringArray`. + unsafe { Time64MicrosecondArray::from_trusted_len_iter(vec.iter()) } + }; + + Ok(Arc::new(array) as ArrayRef) +} + +/// Casts generic string arrays to `Time64NanosecondArray` +fn cast_string_to_time64nanosecond( + array: &dyn Array, + cast_options: &CastOptions, +) -> Result { + /// The number of nanoseconds per second. + const NANOS_PER_SEC: i64 = 1_000_000_000; + + let string_array = array + .as_any() + .downcast_ref::>() + .unwrap(); + + let array = if cast_options.safe { + let iter = string_array.iter().map(|v| { + v.and_then(|v| { + v.parse::() + .map(|time| { + time.num_seconds_from_midnight() as i64 * NANOS_PER_SEC + + time.nanosecond() as i64 + }) + .ok() + }) + }); + + // Benefit: + // 20% performance improvement + // Soundness: + // The iterator is trustedLen because it comes from an `StringArray`. + unsafe { Time64NanosecondArray::from_trusted_len_iter(iter) } + } else { + let vec = string_array + .iter() + .map(|v| { + v.map(|v| { + v.parse::() + .map(|time| { + time.num_seconds_from_midnight() as i64 * NANOS_PER_SEC + + time.nanosecond() as i64 + }) + .map_err(|_| { + ArrowError::CastError(format!( + "Cannot cast string '{}' to value of {:?} type", + v, + DataType::Time64(TimeUnit::Nanosecond) + )) + }) + }) + .transpose() + }) + .collect::>>>()?; + + // Benefit: + // 20% performance improvement + // Soundness: + // The iterator is trustedLen because it comes from an `StringArray`. + unsafe { Time64NanosecondArray::from_trusted_len_iter(vec.iter()) } + }; + + Ok(Arc::new(array) as ArrayRef) +} + /// Casts generic string arrays to TimeStampNanosecondArray fn cast_string_to_timestamp_ns( array: &dyn Array, @@ -1593,28 +1962,18 @@ fn cast_string_to_timestamp_ns( .unwrap(); let array = if cast_options.safe { - let iter = (0..string_array.len()).map(|i| { - if string_array.is_null(i) { - None - } else { - string_to_timestamp_nanos(string_array.value(i)).ok() - } - }); + let iter = string_array + .iter() + .map(|v| v.and_then(|v| string_to_timestamp_nanos(v).ok())); // Benefit: // 20% performance improvement // Soundness: // The iterator is trustedLen because it comes from an `StringArray`. unsafe { TimestampNanosecondArray::from_trusted_len_iter(iter) } } else { - let vec = (0..string_array.len()) - .map(|i| { - if string_array.is_null(i) { - Ok(None) - } else { - let result = string_to_timestamp_nanos(string_array.value(i)); - Some(result).transpose() - } - }) + let vec = string_array + .iter() + .map(|v| v.map(string_to_timestamp_nanos).transpose()) .collect::>>>()?; // Benefit: @@ -2163,8 +2522,8 @@ where #[cfg(test)] mod tests { use super::*; - use crate::array::BasicDecimalArray; - use crate::util::decimal::Decimal128; + use crate::datatypes::TimeUnit; + use crate::util::decimal::{Decimal128, Decimal256}; use crate::{buffer::Buffer, util::display::array_value_to_string}; macro_rules! 
generate_cast_test_case { @@ -2203,10 +2562,21 @@ mod tests { .with_precision_and_scale(precision, scale) } + fn create_decimal256_array( + array: Vec>, + precision: usize, + scale: usize, + ) -> Result { + array + .into_iter() + .collect::() + .with_precision_and_scale(precision, scale) + } + #[test] - fn test_cast_decimal_to_decimal() { - let input_type = DataType::Decimal(20, 3); - let output_type = DataType::Decimal(20, 4); + fn test_cast_decimal128_to_decimal128() { + let input_type = DataType::Decimal128(20, 3); + let output_type = DataType::Decimal128(20, 4); assert!(can_cast_types(&input_type, &output_type)); let array = vec![Some(1123456), Some(2123456), Some(3123456), None]; let input_decimal_array = create_decimal_array(&array, 20, 3).unwrap(); @@ -2226,15 +2596,106 @@ mod tests { let array = vec![Some(123456), None]; let input_decimal_array = create_decimal_array(&array, 10, 0).unwrap(); let array = Arc::new(input_decimal_array) as ArrayRef; - let result = cast(&array, &DataType::Decimal(2, 2)); + let result = cast(&array, &DataType::Decimal128(2, 2)); assert!(result.is_err()); - assert_eq!("Invalid argument error: 12345600 is too large to store in a Decimal of precision 2. Max is 99", + assert_eq!("Invalid argument error: 12345600 is too large to store in a Decimal128 of precision 2. Max is 99", result.unwrap_err().to_string()); } + #[test] + fn test_cast_decimal128_to_decimal256() { + let input_type = DataType::Decimal128(20, 3); + let output_type = DataType::Decimal256(20, 4); + assert!(can_cast_types(&input_type, &output_type)); + let array = vec![Some(1123456), Some(2123456), Some(3123456), None]; + let input_decimal_array = create_decimal_array(&array, 20, 3).unwrap(); + let array = Arc::new(input_decimal_array) as ArrayRef; + generate_cast_test_case!( + &array, + Decimal256Array, + &output_type, + vec![ + Some( + Decimal256::from_big_int(&BigInt::from(11234560_i128), 20, 4) + .unwrap() + ), + Some( + Decimal256::from_big_int(&BigInt::from(21234560_i128), 20, 4) + .unwrap() + ), + Some( + Decimal256::from_big_int(&BigInt::from(31234560_i128), 20, 4) + .unwrap() + ), + None + ] + ); + } + + #[test] + fn test_cast_decimal256_to_decimal128() { + let input_type = DataType::Decimal256(20, 3); + let output_type = DataType::Decimal128(20, 4); + assert!(can_cast_types(&input_type, &output_type)); + let array = vec![ + Some(BigInt::from(1123456)), + Some(BigInt::from(2123456)), + Some(BigInt::from(3123456)), + None, + ]; + let input_decimal_array = create_decimal256_array(array, 20, 3).unwrap(); + let array = Arc::new(input_decimal_array) as ArrayRef; + generate_cast_test_case!( + &array, + Decimal128Array, + &output_type, + vec![ + Some(Decimal128::new_from_i128(20, 4, 11234560_i128)), + Some(Decimal128::new_from_i128(20, 4, 21234560_i128)), + Some(Decimal128::new_from_i128(20, 4, 31234560_i128)), + None + ] + ); + } + + #[test] + fn test_cast_decimal256_to_decimal256() { + let input_type = DataType::Decimal256(20, 3); + let output_type = DataType::Decimal256(20, 4); + assert!(can_cast_types(&input_type, &output_type)); + let array = vec![ + Some(BigInt::from(1123456)), + Some(BigInt::from(2123456)), + Some(BigInt::from(3123456)), + None, + ]; + let input_decimal_array = create_decimal256_array(array, 20, 3).unwrap(); + let array = Arc::new(input_decimal_array) as ArrayRef; + generate_cast_test_case!( + &array, + Decimal256Array, + &output_type, + vec![ + Some( + Decimal256::from_big_int(&BigInt::from(11234560_i128), 20, 4) + .unwrap() + ), + Some( + 
Decimal256::from_big_int(&BigInt::from(21234560_i128), 20, 4) + .unwrap() + ), + Some( + Decimal256::from_big_int(&BigInt::from(31234560_i128), 20, 4) + .unwrap() + ), + None + ] + ); + } + #[test] fn test_cast_decimal_to_numeric() { - let decimal_type = DataType::Decimal(38, 2); + let decimal_type = DataType::Decimal128(38, 2); // negative test assert!(!can_cast_types(&decimal_type, &DataType::UInt8)); let value_array: Vec> = @@ -2355,7 +2816,7 @@ mod tests { #[test] fn test_cast_numeric_to_decimal() { // test negative cast type - let decimal_type = DataType::Decimal(38, 6); + let decimal_type = DataType::Decimal128(38, 6); assert!(!can_cast_types(&DataType::UInt64, &decimal_type)); // i8, i16, i32, i64 @@ -2408,9 +2869,9 @@ mod tests { // the 100 will be converted to 1000_i128, but it is out of range for max value in the precision 3. let array = Int8Array::from(vec![1, 2, 3, 4, 100]); let array = Arc::new(array) as ArrayRef; - let casted_array = cast(&array, &DataType::Decimal(3, 1)); + let casted_array = cast(&array, &DataType::Decimal128(3, 1)); assert!(casted_array.is_err()); - assert_eq!("Invalid argument error: 1000 is too large to store in a Decimal of precision 3. Max is 999", casted_array.unwrap_err().to_string()); + assert_eq!("Invalid argument error: 1000 is too large to store in a Decimal128 of precision 3. Max is 999", casted_array.unwrap_err().to_string()); // test f32 to decimal type let array = Float32Array::from(vec![ @@ -2623,9 +3084,13 @@ mod tests { match result { Ok(_) => panic!("expected error"), Err(e) => { - assert!(e.to_string().contains( - "Cast error: Cannot cast string 'seven' to value of arrow::datatypes::types::Int32Type type" - )) + assert!( + e.to_string().contains( + "Cast error: Cannot cast string 'seven' to value of Int32 type", + ), + "Error: {}", + e + ) } } } @@ -2819,8 +3284,8 @@ mod tests { None, ])) as ArrayRef; for array in &[a1, a2] { - let b = - cast(array, &DataType::Timestamp(TimeUnit::Nanosecond, None)).unwrap(); + let to_type = DataType::Timestamp(TimeUnit::Nanosecond, None); + let b = cast(array, &to_type).unwrap(); let c = b .as_any() .downcast_ref::() @@ -2828,6 +3293,13 @@ mod tests { assert_eq!(1599566400000000000, c.value(0)); assert!(c.is_null(1)); assert!(c.is_null(2)); + + let options = CastOptions { safe: false }; + let err = cast_with_options(array, &to_type, &options).unwrap_err(); + assert_eq!( + err.to_string(), + "Cast error: Error parsing 'Not a valid date' as timestamp" + ); } } @@ -2844,11 +3316,132 @@ mod tests { None, ])) as ArrayRef; for array in &[a1, a2] { - let b = cast(array, &DataType::Date32).unwrap(); + let to_type = DataType::Date32; + let b = cast(array, &to_type).unwrap(); let c = b.as_any().downcast_ref::().unwrap(); assert_eq!(17890, c.value(0)); assert!(c.is_null(1)); assert!(c.is_null(2)); + + let options = CastOptions { safe: false }; + let err = cast_with_options(array, &to_type, &options).unwrap_err(); + assert_eq!(err.to_string(), "Cast error: Cannot cast string 'Not a valid date' to value of Date32 type"); + } + } + + #[test] + fn test_cast_string_to_time32second() { + let a1 = Arc::new(StringArray::from(vec![ + Some("08:08:35.091323414"), + Some("08:08:60.091323414"), // leap second + Some("08:08:61.091323414"), // not valid + Some("Not a valid time"), + None, + ])) as ArrayRef; + let a2 = Arc::new(LargeStringArray::from(vec![ + Some("08:08:35.091323414"), + Some("08:08:60.091323414"), // leap second + Some("08:08:61.091323414"), // not valid + Some("Not a valid time"), + None, + ])) as ArrayRef; + 
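+ // "08:08:35" is 8 * 3600 + 8 * 60 + 35 = 29_315 seconds past midnight; the
+ // leap second "08:08:60" is represented by chrono as 08:08:59 plus an extra
+ // second of nanoseconds, so it maps to 29_340.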
for array in &[a1, a2] { + let to_type = DataType::Time32(TimeUnit::Second); + let b = cast(array, &to_type).unwrap(); + let c = b.as_any().downcast_ref::().unwrap(); + assert_eq!(29315, c.value(0)); + assert_eq!(29340, c.value(1)); + assert!(c.is_null(2)); + assert!(c.is_null(3)); + assert!(c.is_null(4)); + + let options = CastOptions { safe: false }; + let err = cast_with_options(array, &to_type, &options).unwrap_err(); + assert_eq!(err.to_string(), "Cast error: Cannot cast string '08:08:61.091323414' to value of Time32(Second) type"); + } + } + + #[test] + fn test_cast_string_to_time32millisecond() { + let a1 = Arc::new(StringArray::from(vec![ + Some("08:08:35.091323414"), + Some("08:08:60.091323414"), // leap second + Some("08:08:61.091323414"), // not valid + Some("Not a valid time"), + None, + ])) as ArrayRef; + let a2 = Arc::new(LargeStringArray::from(vec![ + Some("08:08:35.091323414"), + Some("08:08:60.091323414"), // leap second + Some("08:08:61.091323414"), // not valid + Some("Not a valid time"), + None, + ])) as ArrayRef; + for array in &[a1, a2] { + let to_type = DataType::Time32(TimeUnit::Millisecond); + let b = cast(array, &to_type).unwrap(); + let c = b.as_any().downcast_ref::().unwrap(); + assert_eq!(29315091, c.value(0)); + assert_eq!(29340091, c.value(1)); + assert!(c.is_null(2)); + assert!(c.is_null(3)); + assert!(c.is_null(4)); + + let options = CastOptions { safe: false }; + let err = cast_with_options(array, &to_type, &options).unwrap_err(); + assert_eq!(err.to_string(), "Cast error: Cannot cast string '08:08:61.091323414' to value of Time32(Millisecond) type"); + } + } + + #[test] + fn test_cast_string_to_time64microsecond() { + let a1 = Arc::new(StringArray::from(vec![ + Some("08:08:35.091323414"), + Some("Not a valid time"), + None, + ])) as ArrayRef; + let a2 = Arc::new(LargeStringArray::from(vec![ + Some("08:08:35.091323414"), + Some("Not a valid time"), + None, + ])) as ArrayRef; + for array in &[a1, a2] { + let to_type = DataType::Time64(TimeUnit::Microsecond); + let b = cast(array, &to_type).unwrap(); + let c = b.as_any().downcast_ref::().unwrap(); + assert_eq!(29315091323, c.value(0)); + assert!(c.is_null(1)); + assert!(c.is_null(2)); + + let options = CastOptions { safe: false }; + let err = cast_with_options(array, &to_type, &options).unwrap_err(); + assert_eq!(err.to_string(), "Cast error: Cannot cast string 'Not a valid time' to value of Time64(Microsecond) type"); + } + } + + #[test] + fn test_cast_string_to_time64nanosecond() { + let a1 = Arc::new(StringArray::from(vec![ + Some("08:08:35.091323414"), + Some("Not a valid time"), + None, + ])) as ArrayRef; + let a2 = Arc::new(LargeStringArray::from(vec![ + Some("08:08:35.091323414"), + Some("Not a valid time"), + None, + ])) as ArrayRef; + for array in &[a1, a2] { + let to_type = DataType::Time64(TimeUnit::Nanosecond); + let b = cast(array, &to_type).unwrap(); + let c = b.as_any().downcast_ref::().unwrap(); + assert_eq!(29315091323414, c.value(0)); + assert!(c.is_null(1)); + assert!(c.is_null(2)); + + let options = CastOptions { safe: false }; + let err = cast_with_options(array, &to_type, &options).unwrap_err(); + assert_eq!(err.to_string(), "Cast error: Cannot cast string 'Not a valid time' to value of Time64(Nanosecond) type"); } } @@ -2865,11 +3458,16 @@ mod tests { None, ])) as ArrayRef; for array in &[a1, a2] { - let b = cast(array, &DataType::Date64).unwrap(); + let to_type = DataType::Date64; + let b = cast(array, &to_type).unwrap(); let c = b.as_any().downcast_ref::().unwrap(); 
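// 1_599_566_400_000 ms since the Unix epoch is 2020-09-08T12:00:00Z.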
assert_eq!(1599566400000, c.value(0)); assert!(c.is_null(1)); assert!(c.is_null(2)); + + let options = CastOptions { safe: false }; + let err = cast_with_options(array, &to_type, &options).unwrap_err(); + assert_eq!(err.to_string(), "Cast error: Cannot cast string 'Not a valid date' to value of Date64 type"); } } @@ -4282,7 +4880,7 @@ mod tests { #[test] fn test_cast_null_array_to_from_decimal_array() { - let data_type = DataType::Decimal(12, 4); + let data_type = DataType::Decimal128(12, 4); let array = new_null_array(&DataType::Null, 4); assert_eq!(array.data_type(), &DataType::Null); let cast_array = cast(&array, &data_type).expect("cast failed"); @@ -4804,7 +5402,7 @@ mod tests { Dictionary(Box::new(DataType::Int8), Box::new(DataType::Int32)), Dictionary(Box::new(DataType::Int16), Box::new(DataType::Utf8)), Dictionary(Box::new(DataType::UInt32), Box::new(DataType::Utf8)), - Decimal(38, 0), + Decimal128(38, 0), ] } diff --git a/arrow/src/compute/kernels/comparison.rs b/arrow/src/compute/kernels/comparison.rs index 7733ce67a76e..1d0bc938ece9 100644 --- a/arrow/src/compute/kernels/comparison.rs +++ b/arrow/src/compute/kernels/comparison.rs @@ -35,7 +35,7 @@ use crate::datatypes::{ }; use crate::error::{ArrowError, Result}; use crate::util::bit_util; -use regex::{escape, Regex}; +use regex::Regex; use std::collections::HashMap; /// Helper function to perform boolean lambda function on values from two array accessors, this @@ -169,7 +169,7 @@ where let re = if let Some(ref regex) = map.get(pat) { regex } else { - let re_pattern = escape(pat).replace('%', ".*").replace('_', "."); + let re_pattern = replace_like_wildcards(pat)?; let re = op(&re_pattern)?; map.insert(pat, re); map.get(pat).unwrap() @@ -248,7 +248,9 @@ pub fn like_utf8_scalar( bit_util::set_bit(bool_slice, i); } } - } else if right.ends_with('%') && !right[..right.len() - 1].contains(is_like_pattern) + } else if right.ends_with('%') + && !right.ends_with("\\%") + && !right[..right.len() - 1].contains(is_like_pattern) { // fast path, can use starts_with let starts_with = &right[..right.len() - 1]; @@ -266,7 +268,7 @@ pub fn like_utf8_scalar( } } } else { - let re_pattern = escape(right).replace('%', ".*").replace('_', "."); + let re_pattern = replace_like_wildcards(right)?; let re = Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { ArrowError::ComputeError(format!( "Unable to build regex from LIKE pattern: {}", @@ -296,6 +298,43 @@ pub fn like_utf8_scalar( Ok(BooleanArray::from(data)) } +/// Transforms a LIKE `pattern` into a regex-compatible pattern. To achieve that, it: +/// +/// 1. Replaces LIKE wildcards with their regex equivalents, since the pattern is evaluated as a regex match: `%` => `.*` and `_` => `.` +/// 2. Escapes regex meta characters so they are matched literally rather than interpreted as regex syntax. For example: `.` => `\\.` +/// 3. Replaces escaped LIKE wildcards, removing the escape character, so they are matched literally.
For example: `\\%` => `%` +fn replace_like_wildcards(pattern: &str) -> Result { + let mut result = String::new(); + let pattern = String::from(pattern); + let mut chars_iter = pattern.chars().peekable(); + while let Some(c) = chars_iter.next() { + if c == '\\' { + let next = chars_iter.peek(); + match next { + Some(next) if is_like_pattern(*next) => { + result.push(*next); + // Skipping the next char as it is already appended + chars_iter.next(); + } + _ => { + result.push('\\'); + result.push('\\'); + } + } + } else if regex_syntax::is_meta_character(c) { + result.push('\\'); + result.push(c); + } else if c == '%' { + result.push_str(".*"); + } else if c == '_' { + result.push('.'); + } else { + result.push(c); + } + } + Ok(result) +} + /// Perform SQL `left NOT LIKE right` operation on [`StringArray`] / /// [`LargeStringArray`]. /// @@ -330,7 +369,9 @@ pub fn nlike_utf8_scalar( for i in 0..left.len() { result.append(left.value(i) != right); } - } else if right.ends_with('%') && !right[..right.len() - 1].contains(is_like_pattern) + } else if right.ends_with('%') + && !right.ends_with("\\%") + && !right[..right.len() - 1].contains(is_like_pattern) { // fast path, can use ends_with for i in 0..left.len() { @@ -342,7 +383,7 @@ pub fn nlike_utf8_scalar( result.append(!left.value(i).ends_with(&right[1..])); } } else { - let re_pattern = escape(right).replace('%', ".*").replace('_', "."); + let re_pattern = replace_like_wildcards(right)?; let re = Regex::new(&format!("^{}$", re_pattern)).map_err(|e| { ArrowError::ComputeError(format!( "Unable to build regex from LIKE pattern: {}", @@ -403,7 +444,9 @@ pub fn ilike_utf8_scalar( for i in 0..left.len() { result.append(left.value(i) == right); } - } else if right.ends_with('%') && !right[..right.len() - 1].contains(is_like_pattern) + } else if right.ends_with('%') + && !right.ends_with("\\%") + && !right[..right.len() - 1].contains(is_like_pattern) { // fast path, can use ends_with for i in 0..left.len() { @@ -423,7 +466,7 @@ pub fn ilike_utf8_scalar( ); } } else { - let re_pattern = escape(right).replace('%', ".*").replace('_', "."); + let re_pattern = replace_like_wildcards(right)?; let re = Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { ArrowError::ComputeError(format!( "Unable to build regex from ILIKE pattern: {}", @@ -484,7 +527,9 @@ pub fn nilike_utf8_scalar( for i in 0..left.len() { result.append(left.value(i) != right); } - } else if right.ends_with('%') && !right[..right.len() - 1].contains(is_like_pattern) + } else if right.ends_with('%') + && !right.ends_with("\\%") + && !right[..right.len() - 1].contains(is_like_pattern) { // fast path, can use ends_with for i in 0..left.len() { @@ -506,7 +551,7 @@ pub fn nilike_utf8_scalar( ); } } else { - let re_pattern = escape(right).replace('%', ".*").replace('_', "."); + let re_pattern = replace_like_wildcards(right)?; let re = Regex::new(&format!("(?i)^{}$", re_pattern)).map_err(|e| { ArrowError::ComputeError(format!( "Unable to build regex from ILIKE pattern: {}", @@ -942,18 +987,19 @@ pub fn gt_eq_utf8_scalar( compare_op_scalar(left, |a| a >= right) } +// Avoids creating a closure for each combination of `$RIGHT` and `$TY` +fn try_to_type_result(value: Option, right: &str, ty: &str) -> Result { + value.ok_or_else(|| { + ArrowError::ComputeError(format!("Could not convert {} with {}", right, ty,)) + }) +} + /// Calls $RIGHT.$TY() (e.g. `right.to_i128()`) with a nice error message. /// Type of expression is `Result<.., ArrowError>` macro_rules! 
try_to_type { - ($RIGHT: expr, $TY: ident) => {{ - $RIGHT.$TY().ok_or_else(|| { - ArrowError::ComputeError(format!( - "Could not convert {} with {}", - stringify!($RIGHT), - stringify!($TY) - )) - }) - }}; + ($RIGHT: expr, $TY: ident) => { + try_to_type_result($RIGHT.$TY(), stringify!($RIGHT), stringify!($TY)) + }; } macro_rules! dyn_compare_scalar { @@ -1023,59 +1069,35 @@ macro_rules! dyn_compare_scalar { match $KT.as_ref() { DataType::UInt8 => { let left = as_dictionary_array::($LEFT); - unpack_dict_comparison( - left, - dyn_compare_scalar!(left.values(), $RIGHT, $OP)?, - ) + unpack_dict_comparison(left, $OP(left.values(), $RIGHT)?) } DataType::UInt16 => { let left = as_dictionary_array::($LEFT); - unpack_dict_comparison( - left, - dyn_compare_scalar!(left.values(), $RIGHT, $OP)?, - ) + unpack_dict_comparison(left, $OP(left.values(), $RIGHT)?) } DataType::UInt32 => { let left = as_dictionary_array::($LEFT); - unpack_dict_comparison( - left, - dyn_compare_scalar!(left.values(), $RIGHT, $OP)?, - ) + unpack_dict_comparison(left, $OP(left.values(), $RIGHT)?) } DataType::UInt64 => { let left = as_dictionary_array::($LEFT); - unpack_dict_comparison( - left, - dyn_compare_scalar!(left.values(), $RIGHT, $OP)?, - ) + unpack_dict_comparison(left, $OP(left.values(), $RIGHT)?) } DataType::Int8 => { let left = as_dictionary_array::($LEFT); - unpack_dict_comparison( - left, - dyn_compare_scalar!(left.values(), $RIGHT, $OP)?, - ) + unpack_dict_comparison(left, $OP(left.values(), $RIGHT)?) } DataType::Int16 => { let left = as_dictionary_array::($LEFT); - unpack_dict_comparison( - left, - dyn_compare_scalar!(left.values(), $RIGHT, $OP)?, - ) + unpack_dict_comparison(left, $OP(left.values(), $RIGHT)?) } DataType::Int32 => { let left = as_dictionary_array::($LEFT); - unpack_dict_comparison( - left, - dyn_compare_scalar!(left.values(), $RIGHT, $OP)?, - ) + unpack_dict_comparison(left, $OP(left.values(), $RIGHT)?) } DataType::Int64 => { let left = as_dictionary_array::($LEFT); - unpack_dict_comparison( - left, - dyn_compare_scalar!(left.values(), $RIGHT, $OP)?, - ) + unpack_dict_comparison(left, $OP(left.values(), $RIGHT)?)
} _ => Err(ArrowError::ComputeError(format!( "Unsupported dictionary key type {:?}", @@ -1141,7 +1163,7 @@ where { match left.data_type() { DataType::Dictionary(key_type, _value_type) => { - dyn_compare_scalar!(left, right, key_type, eq_scalar) + dyn_compare_scalar!(left, right, key_type, eq_dyn_scalar) } _ => dyn_compare_scalar!(left, right, eq_scalar), } @@ -1155,7 +1177,7 @@ where { match left.data_type() { DataType::Dictionary(key_type, _value_type) => { - dyn_compare_scalar!(left, right, key_type, lt_scalar) + dyn_compare_scalar!(left, right, key_type, lt_dyn_scalar) } _ => dyn_compare_scalar!(left, right, lt_scalar), } @@ -1169,7 +1191,7 @@ where { match left.data_type() { DataType::Dictionary(key_type, _value_type) => { - dyn_compare_scalar!(left, right, key_type, lt_eq_scalar) + dyn_compare_scalar!(left, right, key_type, lt_eq_dyn_scalar) } _ => dyn_compare_scalar!(left, right, lt_eq_scalar), } @@ -1183,7 +1205,7 @@ where { match left.data_type() { DataType::Dictionary(key_type, _value_type) => { - dyn_compare_scalar!(left, right, key_type, gt_scalar) + dyn_compare_scalar!(left, right, key_type, gt_dyn_scalar) } _ => dyn_compare_scalar!(left, right, gt_scalar), } @@ -1197,7 +1219,7 @@ where { match left.data_type() { DataType::Dictionary(key_type, _value_type) => { - dyn_compare_scalar!(left, right, key_type, gt_eq_scalar) + dyn_compare_scalar!(left, right, key_type, gt_eq_dyn_scalar) } _ => dyn_compare_scalar!(left, right, gt_eq_scalar), } @@ -1211,7 +1233,7 @@ where { match left.data_type() { DataType::Dictionary(key_type, _value_type) => { - dyn_compare_scalar!(left, right, key_type, neq_scalar) + dyn_compare_scalar!(left, right, key_type, neq_dyn_scalar) } _ => dyn_compare_scalar!(left, right, neq_scalar), } @@ -3740,6 +3762,50 @@ mod tests { vec![false, true, false, false] ); + test_utf8_scalar!( + test_utf8_scalar_like_escape, + vec!["a%", "a\\x"], + "a\\%", + like_utf8_scalar, + vec![true, false] + ); + + test_utf8!( + test_utf8_scalar_ilike_regex, + vec!["%%%"], + vec![r#"\%_\%"#], + ilike_utf8, + vec![true] + ); + + #[test] + fn test_replace_like_wildcards() { + let a_eq = "_%"; + let expected = "..*"; + assert_eq!(replace_like_wildcards(a_eq).unwrap(), expected); + } + + #[test] + fn test_replace_like_wildcards_leave_like_meta_chars() { + let a_eq = "\\%\\_"; + let expected = "%_"; + assert_eq!(replace_like_wildcards(a_eq).unwrap(), expected); + } + + #[test] + fn test_replace_like_wildcards_with_multiple_escape_chars() { + let a_eq = "\\\\%"; + let expected = "\\\\%"; + assert_eq!(replace_like_wildcards(a_eq).unwrap(), expected); + } + + #[test] + fn test_replace_like_wildcards_escape_regex_meta_char() { + let a_eq = "."; + let expected = "\\."; + assert_eq!(replace_like_wildcards(a_eq).unwrap(), expected); + } + test_utf8!( test_utf8_array_eq, vec!["arrow", "arrow", "arrow", "arrow"], diff --git a/arrow/src/compute/kernels/concat_elements.rs b/arrow/src/compute/kernels/concat_elements.rs index 7d460b21cb0d..ac365a0968ec 100644 --- a/arrow/src/compute/kernels/concat_elements.rs +++ b/arrow/src/compute/kernels/concat_elements.rs @@ -75,7 +75,7 @@ pub fn concat_elements_utf8( output_offsets.append(Offset::from_usize(output_values.len()).unwrap()); } - let builder = ArrayDataBuilder::new(GenericStringArray::::get_data_type()) + let builder = ArrayDataBuilder::new(GenericStringArray::::DATA_TYPE) .len(left.len()) .add_buffer(output_offsets.finish()) .add_buffer(output_values.finish()) @@ -155,7 +155,7 @@ pub fn concat_elements_utf8_many( 
output_offsets.append(Offset::from_usize(output_values.len()).unwrap()); } - let builder = ArrayDataBuilder::new(GenericStringArray::::get_data_type()) + let builder = ArrayDataBuilder::new(GenericStringArray::::DATA_TYPE) .len(size) .add_buffer(output_offsets.finish()) .add_buffer(output_values.finish()) diff --git a/arrow/src/compute/kernels/sort.rs b/arrow/src/compute/kernels/sort.rs index 912733cf1f26..dca09a66a8cf 100644 --- a/arrow/src/compute/kernels/sort.rs +++ b/arrow/src/compute/kernels/sort.rs @@ -17,7 +17,6 @@ //! Defines sort kernel for `ArrayRef` -use crate::array::BasicDecimalArray; use crate::array::*; use crate::buffer::MutableBuffer; use crate::compute::take; @@ -145,7 +144,7 @@ pub fn sort_to_indices( let (v, n) = partition_validity(values); Ok(match values.data_type() { - DataType::Decimal(_, _) => sort_decimal(values, v, n, cmp, &options, limit), + DataType::Decimal128(_, _) => sort_decimal(values, v, n, cmp, &options, limit), DataType::Boolean => sort_boolean(values, v, n, &options, limit), DataType::Int8 => { sort_primitive::(values, v, n, cmp, &options, limit) diff --git a/arrow/src/compute/kernels/substring.rs b/arrow/src/compute/kernels/substring.rs index 024f5633fef4..5190d0bf0b67 100644 --- a/arrow/src/compute/kernels/substring.rs +++ b/arrow/src/compute/kernels/substring.rs @@ -205,7 +205,7 @@ pub fn substring_by_char( }); let data = unsafe { ArrayData::new_unchecked( - GenericStringArray::::get_data_type(), + GenericStringArray::::DATA_TYPE, array.len(), None, array @@ -294,7 +294,7 @@ fn binary_substring( let data = unsafe { ArrayData::new_unchecked( - GenericBinaryArray::::get_data_type(), + GenericBinaryArray::::DATA_TYPE, array.len(), None, array @@ -425,7 +425,7 @@ fn utf8_substring( let data = unsafe { ArrayData::new_unchecked( - GenericStringArray::::get_data_type(), + GenericStringArray::::DATA_TYPE, array.len(), None, array @@ -587,7 +587,7 @@ mod tests { // set the first and third element to be valid let bitmap = [0b101_u8]; - let data = ArrayData::builder(GenericBinaryArray::::get_data_type()) + let data = ArrayData::builder(GenericBinaryArray::::DATA_TYPE) .len(2) .add_buffer(Buffer::from_slice_ref(offsets)) .add_buffer(Buffer::from_iter(values)) @@ -814,7 +814,7 @@ mod tests { // set the first and third element to be valid let bitmap = [0b101_u8]; - let data = ArrayData::builder(GenericStringArray::::get_data_type()) + let data = ArrayData::builder(GenericStringArray::::DATA_TYPE) .len(2) .add_buffer(Buffer::from_slice_ref(offsets)) .add_buffer(Buffer::from(values)) @@ -939,7 +939,7 @@ mod tests { // set the first and third element to be valid let bitmap = [0b101_u8]; - let data = ArrayData::builder(GenericStringArray::::get_data_type()) + let data = ArrayData::builder(GenericStringArray::::DATA_TYPE) .len(2) .add_buffer(Buffer::from_slice_ref(offsets)) .add_buffer(Buffer::from(values)) diff --git a/arrow/src/compute/kernels/take.rs b/arrow/src/compute/kernels/take.rs index 6c217a3d8a29..fb8f75651882 100644 --- a/arrow/src/compute/kernels/take.rs +++ b/arrow/src/compute/kernels/take.rs @@ -19,8 +19,6 @@ use std::{ops::AddAssign, sync::Arc}; -use crate::array::BasicDecimalArray; - use crate::buffer::{Buffer, MutableBuffer}; use crate::compute::util::{ take_value_indices_from_fixed_size_list, take_value_indices_from_list, @@ -148,7 +146,7 @@ where let values = values.as_any().downcast_ref::().unwrap(); Ok(Arc::new(take_boolean(values, indices)?)) } - DataType::Decimal(_, _) => { + DataType::Decimal128(_, _) => { let decimal_values = 
values.as_any().downcast_ref::().unwrap(); Ok(Arc::new(take_decimal128(decimal_values, indices)?)) @@ -614,23 +612,41 @@ where let mut output_buffer = MutableBuffer::new_null(len); let output_slice = output_buffer.as_slice_mut(); - indices - .iter() - .enumerate() - .try_for_each::<_, Result<()>>(|(i, index)| { - if let Some(index) = index { - let index = ToPrimitive::to_usize(&index).ok_or_else(|| { + let indices_has_nulls = indices.null_count() > 0; + + if indices_has_nulls { + indices + .iter() + .enumerate() + .try_for_each::<_, Result<()>>(|(i, index)| { + if let Some(index) = index { + let index = ToPrimitive::to_usize(&index).ok_or_else(|| { + ArrowError::ComputeError("Cast to usize failed".to_string()) + })?; + + if bit_util::get_bit(values_slice, values_offset + index) { + bit_util::set_bit(output_slice, i); + } + } + + Ok(()) + })?; + } else { + indices + .values() + .iter() + .enumerate() + .try_for_each::<_, Result<()>>(|(i, index)| { + let index = ToPrimitive::to_usize(index).ok_or_else(|| { ArrowError::ComputeError("Cast to usize failed".to_string()) })?; if bit_util::get_bit(values_slice, values_offset + index) { bit_util::set_bit(output_slice, i); } - } - - Ok(()) - })?; - + Ok(()) + })?; + } Ok(output_buffer.into()) } @@ -771,12 +787,11 @@ where }; } - let array_data = - ArrayData::builder(GenericStringArray::::get_data_type()) - .len(data_len) - .add_buffer(offsets_buffer.into()) - .add_buffer(values.into()) - .null_bit_buffer(nulls); + let array_data = ArrayData::builder(GenericStringArray::::DATA_TYPE) + .len(data_len) + .add_buffer(offsets_buffer.into()) + .add_buffer(values.into()) + .null_bit_buffer(nulls); let array_data = unsafe { array_data.build_unchecked() }; diff --git a/arrow/src/compute/kernels/zip.rs b/arrow/src/compute/kernels/zip.rs index 0ee8e47bede0..c28529cf6762 100644 --- a/arrow/src/compute/kernels/zip.rs +++ b/arrow/src/compute/kernels/zip.rs @@ -44,7 +44,7 @@ pub fn zip( let falsy = falsy.data(); let truthy = truthy.data(); - let mut mutable = MutableArrayData::new(vec![&*truthy, &*falsy], false, truthy.len()); + let mut mutable = MutableArrayData::new(vec![truthy, falsy], false, truthy.len()); // the SlicesIterator slices only the true values. 
So the gaps left by this iterator we need to // fill with falsy values diff --git a/arrow/src/compute/util.rs b/arrow/src/compute/util.rs index 29a90b65c237..974af9593e36 100644 --- a/arrow/src/compute/util.rs +++ b/arrow/src/compute/util.rs @@ -351,9 +351,7 @@ pub(super) mod tests { T: ArrowPrimitiveType, PrimitiveArray: From>>, { - use std::any::TypeId; - - let mut offset = vec![0]; + let mut offset = vec![S::zero()]; let mut values = vec![]; let list_len = data.len(); @@ -367,34 +365,18 @@ pub(super) mod tests { list_null_count += 1; bit_util::unset_bit(list_bitmap.as_slice_mut(), idx); } - offset.push(values.len() as i64); + offset.push(S::from_usize(values.len()).unwrap()); } let value_data = PrimitiveArray::::from(values).into_data(); - let (list_data_type, value_offsets) = if TypeId::of::() == TypeId::of::() - { - ( - DataType::List(Box::new(Field::new( - "item", - T::DATA_TYPE, - list_null_count == 0, - ))), - Buffer::from_slice_ref( - &offset.into_iter().map(|x| x as i32).collect::>(), - ), - ) - } else if TypeId::of::() == TypeId::of::() { - ( - DataType::LargeList(Box::new(Field::new( - "item", - T::DATA_TYPE, - list_null_count == 0, - ))), - Buffer::from_slice_ref(&offset), - ) - } else { - unreachable!() - }; + let (list_data_type, value_offsets) = ( + GenericListArray::::DATA_TYPE_CONSTRUCTOR(Box::new(Field::new( + "item", + T::DATA_TYPE, + list_null_count == 0, + ))), + Buffer::from_slice_ref(&offset), + ); let list_data = ArrayData::builder(list_data_type) .len(list_len) diff --git a/arrow/src/csv/reader.rs b/arrow/src/csv/reader.rs index d00bd729c096..ac26b377ffe1 100644 --- a/arrow/src/csv/reader.rs +++ b/arrow/src/csv/reader.rs @@ -544,7 +544,7 @@ fn parse( let field = &fields[i]; match field.data_type() { DataType::Boolean => build_boolean_array(line_number, rows, i), - DataType::Decimal(precision, scale) => { + DataType::Decimal128(precision, scale) => { build_decimal_array(line_number, rows, i, *precision, *scale) } DataType::Int8 => { @@ -776,8 +776,14 @@ fn parse_decimal_with_parameter(s: &str, precision: usize, scale: usize) -> Resu if negative { result = result.neg(); } - validate_decimal_precision(result, precision) - .map_err(|e| ArrowError::ParseError(format!("parse decimal overflow: {}", e))) + + match validate_decimal_precision(result, precision) { + Ok(_) => Ok(result), + Err(e) => Err(ArrowError::ParseError(format!( + "parse decimal overflow: {}", + e + ))), + } } else { Err(ArrowError::ParseError(format!( "can't parse the string value {} to decimal", @@ -1116,7 +1122,6 @@ mod tests { use std::io::{Cursor, Write}; use tempfile::NamedTempFile; - use crate::array::BasicDecimalArray; use crate::array::*; use crate::compute::cast; use crate::datatypes::Field; @@ -1206,8 +1211,8 @@ mod tests { fn test_csv_reader_with_decimal() { let schema = Schema::new(vec![ Field::new("city", DataType::Utf8, false), - Field::new("lat", DataType::Decimal(38, 6), false), - Field::new("lng", DataType::Decimal(38, 6), false), + Field::new("lat", DataType::Decimal128(38, 6), false), + Field::new("lng", DataType::Decimal128(38, 6), false), ]); let file = File::open("test/data/decimal_test.csv").unwrap(); diff --git a/arrow/src/csv/writer.rs b/arrow/src/csv/writer.rs index 6735d9668560..7097706ba5f3 100644 --- a/arrow/src/csv/writer.rs +++ b/arrow/src/csv/writer.rs @@ -27,8 +27,6 @@ //! use arrow::csv; //! use arrow::datatypes::*; //! use arrow::record_batch::RecordBatch; -//! use arrow::util::test_util::get_temp_file; -//! use std::fs::File; //! use std::sync::Arc; //! //! 
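For the `parse_decimal_with_parameter` change above: the parsed `i128` is now validated against the bounds of the requested precision, and any violation is surfaced as a `ParseError`. A rough standalone sketch of that flow (simplified; the real reader also normalizes the scale before validating):

```rust
/// Parse a decimal string into its unscaled i128 and reject values that
/// need more than `precision` digits (precision is assumed <= 38 here so
/// 10^precision still fits in an i128).
fn parse_decimal(s: &str, precision: u32) -> Result<i128, String> {
    // "123.45" at scale 2 is stored as the unscaled integer 12345
    let unscaled: i128 = s.replace('.', "").parse().map_err(|e| format!("{}", e))?;
    let max = 10_i128.pow(precision) - 1;
    if unscaled > max || unscaled < -max {
        Err(format!(
            "parse decimal overflow: {} exceeds precision {}",
            unscaled, precision
        ))
    } else {
        Ok(unscaled)
    }
}

fn main() {
    assert_eq!(parse_decimal("123.45", 5), Ok(12345)); // precision 5, scale 2
    assert!(parse_decimal("123.45", 4).is_err());
}
```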
let schema = Schema::new(vec![ @@ -56,9 +54,9 @@ //! ) //! .unwrap(); //! -//! let file = get_temp_file("out.csv", &[]); +//! let mut output = Vec::with_capacity(1024); //! -//! let mut writer = csv::Writer::new(file); +//! let mut writer = csv::Writer::new(&mut output); //! let batches = vec![&batch, &batch]; //! for batch in batches { //! writer.write(batch).unwrap(); @@ -223,7 +221,7 @@ impl Writer { DataType::Timestamp(time_unit, time_zone) => { self.handle_timestamp(time_unit, time_zone.as_ref(), row_index, col)? } - DataType::Decimal(..) => make_string_from_decimal(col, row_index)?, + DataType::Decimal128(..) => make_string_from_decimal(col, row_index)?, t => { // List and Struct arrays not supported by the writer, any // other type needs to be implemented diff --git a/arrow/src/datatypes/datatype.rs b/arrow/src/datatypes/datatype.rs index 429a94f24b9c..97ddc0c4a612 100644 --- a/arrow/src/datatypes/datatype.rs +++ b/arrow/src/datatypes/datatype.rs @@ -15,13 +15,15 @@ // specific language governing permissions and limitations // under the License. -use num::{BigInt, Num, ToPrimitive}; +use num::BigInt; +use std::cmp::Ordering; use std::fmt; use serde_derive::{Deserialize, Serialize}; use serde_json::{json, Value, Value::String as VString}; use crate::error::{ArrowError, Result}; +use crate::util::decimal::singed_cmp_le_bytes; use super::Field; @@ -189,14 +191,19 @@ pub enum DataType { /// This type mostly used to represent low cardinality string /// arrays or a limited set of primitive types as integers. Dictionary(Box, Box), - /// Exact decimal value with precision and scale + /// Exact 128-bit width decimal value with precision and scale + /// + /// * precision is the total number of digits + /// * scale is the number of digits past the decimal + /// + /// For example the number 123.45 has precision 5 and scale 2. + Decimal128(usize, usize), + /// Exact 256-bit width decimal value with precision and scale /// /// * precision is the total number of digits /// * scale is the number of digits past the decimal /// /// For example the number 123.45 has precision 5 and scale 2. - Decimal(usize, usize), - /// Exact decimal value with 256 bits width Decimal256(usize, usize), /// A Map is a logical nested type that is represented as /// @@ -258,8 +265,628 @@ impl fmt::Display for DataType { } } +// MAX decimal256 value of little-endian format for each precision. +// Each element is the max value of signed 256-bit integer for the specified precision which +// is encoded to the 32-byte width format of little-endian. 
+pub(crate) const MAX_DECIMAL_BYTES_FOR_LARGER_EACH_PRECISION: [[u8; 32]; 76] = [ + [ + 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, + ], + [ + 99, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, + ], + [ + 231, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + ], + [ + 15, 39, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + ], + [ + 159, 134, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + ], + [ + 63, 66, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + ], + [ + 127, 150, 152, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 224, 245, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 201, 154, 59, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 227, 11, 84, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 231, 118, 72, 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 15, 165, 212, 232, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 159, 114, 78, 24, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 63, 122, 16, 243, 90, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 127, 198, 164, 126, 141, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 192, 111, 242, 134, 35, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 137, 93, 120, 69, 99, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 99, 167, 179, 182, 224, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 231, 137, 4, 35, 199, 138, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 15, 99, 45, 94, 199, 107, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 159, 222, 197, 173, 201, 53, 54, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 63, 178, 186, 201, 224, 25, 30, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 127, 246, 74, 225, 199, 2, 45, 21, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 160, 237, 204, 206, 27, 194, 211, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 73, 72, 1, 20, 22, 149, 69, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 227, 210, 12, 200, 220, 210, 183, 82, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 231, 60, 128, 208, 159, 60, 46, 59, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 15, 97, 2, 37, 62, 94, 206, 79, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 159, 202, 23, 114, 109, 174, 15, 30, 67, 1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 
255, 255, 63, 234, 237, 116, 70, 208, 156, 44, 159, 12, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 127, 38, 75, 145, 192, 34, 32, 190, 55, 126, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 128, 239, 172, 133, 91, 65, 109, 45, 238, 4, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 9, 91, 193, 56, 147, 141, 68, 198, 77, 49, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 99, 142, 141, 55, 192, 135, 173, 190, 9, 237, 1, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 231, 143, 135, 43, 130, 77, 199, 114, 97, 66, 19, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 15, 159, 75, 179, 21, 7, 201, 123, 206, 151, 192, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 159, 54, 244, 0, 217, 70, 218, 213, 16, 238, 133, 7, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 63, 34, 138, 9, 122, 196, 134, 90, 168, 76, 59, 75, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 127, 86, 101, 95, 196, 172, 67, 137, 147, 254, 80, 240, 2, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 96, 245, 185, 171, 191, 164, 92, 195, 241, 41, 99, 29, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 201, 149, 67, 181, 124, 111, 158, 161, 113, 163, 223, + 37, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 227, 217, 163, 20, 223, 90, 48, 80, 112, 98, 188, 122, + 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 231, 130, 102, 206, 182, 140, 227, 33, 99, 216, 91, 203, + 114, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 15, 29, 1, 16, 36, 127, 227, 82, 223, 115, 150, 241, + 123, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 159, 34, 11, 160, 104, 247, 226, 60, 185, 134, 224, 111, + 215, 44, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 63, 90, 111, 64, 22, 170, 221, 96, 60, 67, 197, 94, 106, + 192, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 127, 134, 89, 132, 222, 164, 168, 200, 91, 160, 180, + 179, 39, 132, 17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 64, 127, 43, 177, 112, 150, 214, 149, 67, 14, 5, + 141, 41, 175, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 137, 248, 178, 235, 102, 224, 97, 218, 163, 142, + 50, 130, 159, 215, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 99, 181, 253, 52, 5, 196, 210, 135, 102, 146, 249, + 21, 59, 108, 68, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 231, 21, 233, 17, 52, 168, 59, 78, 1, 184, 191, + 219, 78, 58, 172, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 15, 219, 26, 179, 8, 146, 84, 14, 13, 48, 125, 149, + 20, 71, 186, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 159, 142, 12, 255, 86, 180, 77, 143, 130, 224, 227, + 214, 205, 198, 70, 11, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 63, 146, 125, 246, 101, 11, 9, 153, 25, 197, 230, + 100, 10, 196, 195, 112, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 127, 182, 231, 160, 251, 113, 90, 250, 255, 178, 3, + 241, 103, 
168, 165, 103, 104, 0, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 255, 32, 13, 73, 212, 115, 136, 199, 255, 253, 36, + 106, 15, 148, 120, 12, 20, 4, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 255, 73, 131, 218, 74, 134, 84, 203, 253, 235, 113, + 37, 154, 200, 181, 124, 200, 40, 0, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 255, 227, 32, 137, 236, 62, 77, 241, 233, 55, 115, + 118, 5, 214, 25, 223, 212, 151, 1, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 255, 231, 72, 91, 61, 117, 4, 109, 35, 47, 128, + 160, 54, 92, 2, 183, 80, 238, 15, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 255, 15, 217, 144, 101, 148, 44, 66, 98, 215, 1, + 69, 34, 154, 23, 38, 39, 79, 159, 0, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 255, 159, 122, 168, 247, 203, 189, 149, 214, 105, + 18, 178, 86, 5, 236, 124, 135, 23, 57, 6, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 255, 63, 202, 148, 172, 247, 105, 217, 97, 34, 184, + 244, 98, 53, 56, 225, 74, 235, 58, 62, 0, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 255, 127, 230, 207, 189, 172, 35, 126, 210, 87, 49, + 143, 221, 21, 50, 204, 236, 48, 77, 110, 2, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 255, 255, 0, 31, 106, 191, 100, 237, 56, 110, 237, + 151, 167, 218, 244, 249, 63, 233, 3, 79, 24, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 255, 255, 9, 54, 37, 122, 239, 69, 57, 78, 70, 239, + 139, 138, 144, 195, 127, 28, 39, 22, 243, 0, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 255, 255, 99, 28, 116, 197, 90, 187, 60, 14, 191, + 88, 119, 105, 165, 163, 253, 28, 135, 221, 126, 9, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 255, 255, 231, 27, 137, 182, 139, 81, 95, 142, 118, + 119, 169, 30, 118, 100, 232, 33, 71, 167, 244, 94, 0, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 255, 255, 15, 23, 91, 33, 117, 47, 185, 143, 161, + 170, 158, 50, 157, 236, 19, 83, 199, 136, 142, 181, 3, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 255, 255, 159, 230, 142, 77, 147, 218, 59, 157, 79, + 170, 50, 250, 35, 62, 199, 62, 201, 87, 145, 23, 37, 0, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 255, 255, 63, 2, 149, 7, 193, 137, 86, 36, 28, 167, + 250, 197, 103, 109, 200, 115, 220, 109, 173, 235, 114, 1, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 255, 255, 127, 22, 210, 75, 138, 97, 97, 107, 25, + 135, 202, 187, 13, 70, 212, 133, 156, 74, 198, 52, 125, 14, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 255, 255, 255, 224, 52, 246, 102, 207, 205, 49, + 254, 70, 233, 85, 137, 188, 74, 58, 29, 234, 190, 15, 228, 144, 0, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 255, 255, 255, 201, 16, 158, 5, 26, 10, 242, 237, + 197, 28, 91, 93, 93, 235, 70, 36, 37, 117, 157, 232, 168, 5, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 255, 255, 255, 227, 167, 44, 56, 4, 101, 116, 75, + 187, 31, 143, 165, 165, 49, 197, 106, 115, 147, 38, 22, 153, 56, 0, + ], + [ + 255, 255, 255, 255, 255, 255, 255, 255, 255, 231, 142, 190, 49, 42, 242, 139, + 242, 80, 61, 151, 119, 120, 240, 179, 43, 130, 194, 129, 221, 250, 53, 2, + ], + [ + 255, 255, 255, 255, 255, 255, 255, 255, 255, 15, 149, 113, 241, 165, 117, 119, + 121, 41, 101, 232, 171, 180, 100, 7, 181, 21, 153, 17, 167, 204, 27, 22, + ], +]; + +// MIN decimal256 value of little-endian format for each precision. +// Each element is the min value of signed 256-bit integer for the specified precision which +// is encoded to the 32-byte width format of little-endian.
+pub(crate) const MIN_DECIMAL_BYTES_FOR_LARGER_EACH_PRECISION: [[u8; 32]; 76] = [ + [ + 247, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 157, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 25, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 241, 216, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 97, 121, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 193, 189, 240, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 129, 105, 103, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 31, 10, 250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 54, 101, 196, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 28, 244, 171, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 24, 137, 183, 232, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 240, 90, 43, 23, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 96, 141, 177, 231, 246, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 192, 133, 239, 12, 165, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 128, 57, 91, 129, 114, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 63, 144, 13, 121, 220, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 118, 162, 135, 186, 156, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 156, 88, 76, 73, 31, 242, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 24, 118, 251, 220, 56, 117, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 240, 156, 210, 161, 56, 148, 250, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 96, 33, 58, 82, 54, 202, 201, 255, 255, 255, 255, 255, 255, 
255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 192, 77, 69, 54, 31, 230, 225, 253, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 128, 9, 181, 30, 56, 253, 210, 234, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 95, 18, 51, 49, 228, 61, 44, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 182, 183, 254, 235, 233, 106, 186, 247, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 28, 45, 243, 55, 35, 45, 72, 173, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 24, 195, 127, 47, 96, 195, 209, 196, 252, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 240, 158, 253, 218, 193, 161, 49, 176, 223, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 96, 53, 232, 141, 146, 81, 240, 225, 188, 254, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 192, 21, 18, 139, 185, 47, 99, 211, 96, 243, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 128, 217, 180, 110, 63, 221, 223, 65, 200, 129, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 127, 16, 83, 122, 164, 190, 146, 210, 17, 251, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 246, 164, 62, 199, 108, 114, 187, 57, 178, 206, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 156, 113, 114, 200, 63, 120, 82, 65, 246, 18, 254, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 24, 112, 120, 212, 125, 178, 56, 141, 158, 189, 236, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 240, 96, 180, 76, 234, 248, 54, 132, 49, 104, 63, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 96, 201, 11, 255, 38, 185, 37, 42, 239, 17, 122, 248, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 192, 221, 117, 246, 133, 59, 121, 165, 87, 179, 196, 180, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 128, 169, 154, 160, 59, 83, 188, 118, 108, 1, 175, 15, 253, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 159, 10, 70, 84, 64, 91, 163, 60, 14, 214, 156, 226, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 54, 106, 188, 74, 131, 144, 97, 94, 142, 92, 32, 218, 254, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 28, 38, 92, 235, 32, 165, 207, 175, 143, 157, 67, 133, 244, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 24, 125, 153, 49, 73, 115, 28, 222, 
156, 39, 164, 52, 141, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 240, 226, 254, 239, 219, 128, 28, 173, 32, 140, 105, 14, 132, 251, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 96, 221, 244, 95, 151, 8, 29, 195, 70, 121, 31, 144, 40, 211, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 192, 165, 144, 191, 233, 85, 34, 159, 195, 188, 58, 161, 149, 63, + 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 128, 121, 166, 123, 33, 91, 87, 55, 164, 95, 75, 76, 216, 123, + 238, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 191, 128, 212, 78, 143, 105, 41, 106, 188, 241, 250, 114, 214, + 80, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 118, 7, 77, 20, 153, 31, 158, 37, 92, 113, 205, 125, 96, 40, + 249, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 156, 74, 2, 203, 250, 59, 45, 120, 153, 109, 6, 234, 196, 147, + 187, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 24, 234, 22, 238, 203, 87, 196, 177, 254, 71, 64, 36, 177, 197, + 83, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 240, 36, 229, 76, 247, 109, 171, 241, 242, 207, 130, 106, 235, + 184, 69, 229, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 96, 113, 243, 0, 169, 75, 178, 112, 125, 31, 28, 41, 50, 57, + 185, 244, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 192, 109, 130, 9, 154, 244, 246, 102, 230, 58, 25, 155, 245, + 59, 60, 143, 245, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 128, 73, 24, 95, 4, 142, 165, 5, 0, 77, 252, 14, 152, 87, 90, + 152, 151, 255, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 0, 223, 242, 182, 43, 140, 119, 56, 0, 2, 219, 149, 240, 107, + 135, 243, 235, 251, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 0, 182, 124, 37, 181, 121, 171, 52, 2, 20, 142, 218, 101, 55, + 74, 131, 55, 215, 255, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 0, 28, 223, 118, 19, 193, 178, 14, 22, 200, 140, 137, 250, 41, + 230, 32, 43, 104, 254, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 0, 24, 183, 164, 194, 138, 251, 146, 220, 208, 127, 95, 201, + 163, 253, 72, 175, 17, 240, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 0, 240, 38, 111, 154, 107, 211, 189, 157, 40, 254, 186, 221, + 101, 232, 217, 216, 176, 96, 255, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 0, 96, 133, 87, 8, 52, 66, 106, 41, 150, 237, 77, 169, 250, 19, + 131, 120, 232, 198, 249, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 0, 192, 53, 107, 83, 8, 150, 38, 158, 221, 71, 11, 157, 202, + 199, 30, 181, 20, 197, 193, 255, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 0, 128, 25, 48, 66, 83, 220, 129, 45, 168, 206, 112, 34, 234, + 205, 51, 19, 207, 178, 145, 253, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 0, 0, 255, 224, 149, 64, 155, 18, 199, 145, 18, 104, 88, 37, + 11, 6, 192, 22, 252, 176, 231, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 0, 0, 246, 201, 218, 133, 16, 186, 198, 177, 185, 16, 116, 117, + 111, 60, 128, 227, 216, 233, 12, 255, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 0, 0, 156, 227, 139, 58, 165, 68, 195, 241, 64, 
167, 136, 150, + 90, 92, 2, 227, 120, 34, 129, 246, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 0, 0, 24, 228, 118, 73, 116, 174, 160, 113, 137, 136, 86, 225, + 137, 155, 23, 222, 184, 88, 11, 161, 255, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 0, 0, 240, 232, 164, 222, 138, 208, 70, 112, 94, 85, 97, 205, + 98, 19, 236, 172, 56, 119, 113, 74, 252, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 0, 0, 96, 25, 113, 178, 108, 37, 196, 98, 176, 85, 205, 5, 220, + 193, 56, 193, 54, 168, 110, 232, 218, 255, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 0, 0, 192, 253, 106, 248, 62, 118, 169, 219, 227, 88, 5, 58, + 152, 146, 55, 140, 35, 146, 82, 20, 141, 254, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 0, 0, 128, 233, 45, 180, 117, 158, 158, 148, 230, 120, 53, 68, + 242, 185, 43, 122, 99, 181, 57, 203, 130, 241, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 0, 0, 0, 31, 203, 9, 153, 48, 50, 206, 1, 185, 22, 170, 118, + 67, 181, 197, 226, 21, 65, 240, 27, 111, 255, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 0, 0, 0, 54, 239, 97, 250, 229, 245, 13, 18, 58, 227, 164, 162, + 162, 20, 185, 219, 218, 138, 98, 23, 87, 250, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 0, 0, 0, 28, 88, 211, 199, 251, 154, 139, 180, 68, 224, 112, + 90, 90, 206, 58, 149, 140, 108, 217, 233, 102, 199, 255, + ], + [ + 1, 0, 0, 0, 0, 0, 0, 0, 0, 24, 113, 65, 206, 213, 13, 116, 13, 175, 194, 104, + 136, 135, 15, 76, 212, 125, 61, 126, 34, 5, 202, 253, + ], + [ + 1, 0, 0, 0, 0, 0, 0, 0, 0, 240, 106, 142, 14, 90, 138, 136, 134, 214, 154, 23, + 84, 75, 155, 248, 74, 234, 102, 238, 88, 51, 228, 233, + ], +]; + /// `MAX_DECIMAL_FOR_EACH_PRECISION[p]` holds the maximum `i128` value -/// that can be stored in [DataType::Decimal] value of precision `p` +/// that can be stored in [DataType::Decimal128] value of precision `p` pub const MAX_DECIMAL_FOR_EACH_PRECISION: [i128; 38] = [ 9, 99, @@ -301,51 +928,8 @@ pub const MAX_DECIMAL_FOR_EACH_PRECISION: [i128; 38] = [ 99999999999999999999999999999999999999, ]; -/// `MAX_DECIMAL_FOR_LARGER_PRECISION[p]` holds the maximum integer value -/// that can be stored in [DataType::Decimal256] value of precision `p` > 38 -pub const MAX_DECIMAL_FOR_LARGER_PRECISION: [&str; 38] = [ - "99999999999999999999999999999999999999", - "999999999999999999999999999999999999999", - "9999999999999999999999999999999999999999", - "99999999999999999999999999999999999999999", - "999999999999999999999999999999999999999999", - "9999999999999999999999999999999999999999999", - "99999999999999999999999999999999999999999999", - "999999999999999999999999999999999999999999999", - "9999999999999999999999999999999999999999999999", - "99999999999999999999999999999999999999999999999", - "999999999999999999999999999999999999999999999999", - "9999999999999999999999999999999999999999999999999", - "99999999999999999999999999999999999999999999999999", - "999999999999999999999999999999999999999999999999999", - "9999999999999999999999999999999999999999999999999999", - "99999999999999999999999999999999999999999999999999999", - "999999999999999999999999999999999999999999999999999999", - "9999999999999999999999999999999999999999999999999999999", - "99999999999999999999999999999999999999999999999999999999", - "999999999999999999999999999999999999999999999999999999999", - "9999999999999999999999999999999999999999999999999999999999", - "99999999999999999999999999999999999999999999999999999999999", - "999999999999999999999999999999999999999999999999999999999999", - "9999999999999999999999999999999999999999999999999999999999999", - 
"99999999999999999999999999999999999999999999999999999999999999", - "999999999999999999999999999999999999999999999999999999999999999", - "9999999999999999999999999999999999999999999999999999999999999999", - "99999999999999999999999999999999999999999999999999999999999999999", - "999999999999999999999999999999999999999999999999999999999999999999", - "9999999999999999999999999999999999999999999999999999999999999999999", - "99999999999999999999999999999999999999999999999999999999999999999999", - "999999999999999999999999999999999999999999999999999999999999999999999", - "9999999999999999999999999999999999999999999999999999999999999999999999", - "99999999999999999999999999999999999999999999999999999999999999999999999", - "999999999999999999999999999999999999999999999999999999999999999999999999", - "9999999999999999999999999999999999999999999999999999999999999999999999999", - "99999999999999999999999999999999999999999999999999999999999999999999999999", - "999999999999999999999999999999999999999999999999999999999999999999999999999", -]; - /// `MIN_DECIMAL_FOR_EACH_PRECISION[p]` holds the minimum `i128` value -/// that can be stored in a [DataType::Decimal] value of precision `p` +/// that can be stored in a [DataType::Decimal128] value of precision `p` pub const MIN_DECIMAL_FOR_EACH_PRECISION: [i128; 38] = [ -9, -99, @@ -387,53 +971,10 @@ pub const MIN_DECIMAL_FOR_EACH_PRECISION: [i128; 38] = [ -99999999999999999999999999999999999999, ]; -/// `MIN_DECIMAL_FOR_LARGER_PRECISION[p]` holds the minimum integer value -/// that can be stored in a [DataType::Decimal256] value of precision `p` > 38 -pub const MIN_DECIMAL_FOR_LARGER_PRECISION: [&str; 38] = [ - "-99999999999999999999999999999999999999", - "-999999999999999999999999999999999999999", - "-9999999999999999999999999999999999999999", - "-99999999999999999999999999999999999999999", - "-999999999999999999999999999999999999999999", - "-9999999999999999999999999999999999999999999", - "-99999999999999999999999999999999999999999999", - "-999999999999999999999999999999999999999999999", - "-9999999999999999999999999999999999999999999999", - "-99999999999999999999999999999999999999999999999", - "-999999999999999999999999999999999999999999999999", - "-9999999999999999999999999999999999999999999999999", - "-99999999999999999999999999999999999999999999999999", - "-999999999999999999999999999999999999999999999999999", - "-9999999999999999999999999999999999999999999999999999", - "-99999999999999999999999999999999999999999999999999999", - "-999999999999999999999999999999999999999999999999999999", - "-9999999999999999999999999999999999999999999999999999999", - "-99999999999999999999999999999999999999999999999999999999", - "-999999999999999999999999999999999999999999999999999999999", - "-9999999999999999999999999999999999999999999999999999999999", - "-99999999999999999999999999999999999999999999999999999999999", - "-999999999999999999999999999999999999999999999999999999999999", - "-9999999999999999999999999999999999999999999999999999999999999", - "-99999999999999999999999999999999999999999999999999999999999999", - "-999999999999999999999999999999999999999999999999999999999999999", - "-9999999999999999999999999999999999999999999999999999999999999999", - "-99999999999999999999999999999999999999999999999999999999999999999", - "-999999999999999999999999999999999999999999999999999999999999999999", - "-9999999999999999999999999999999999999999999999999999999999999999999", - "-99999999999999999999999999999999999999999999999999999999999999999999", - 
"-999999999999999999999999999999999999999999999999999999999999999999999", - "-9999999999999999999999999999999999999999999999999999999999999999999999", - "-99999999999999999999999999999999999999999999999999999999999999999999999", - "-999999999999999999999999999999999999999999999999999999999999999999999999", - "-9999999999999999999999999999999999999999999999999999999999999999999999999", - "-99999999999999999999999999999999999999999999999999999999999999999999999999", - "-999999999999999999999999999999999999999999999999999999999999999999999999999", -]; - -/// The maximum precision for [DataType::Decimal] values +/// The maximum precision for [DataType::Decimal128] values pub const DECIMAL128_MAX_PRECISION: usize = 38; -/// The maximum scale for [DataType::Decimal] values +/// The maximum scale for [DataType::Decimal128] values pub const DECIMAL128_MAX_SCALE: usize = 38; /// The maximum precision for [DataType::Decimal256] values @@ -442,16 +983,18 @@ pub const DECIMAL256_MAX_PRECISION: usize = 76; /// The maximum scale for [DataType::Decimal256] values pub const DECIMAL256_MAX_SCALE: usize = 76; -/// The default scale for [DataType::Decimal] and [DataType::Decimal256] values +/// The default scale for [DataType::Decimal128] and [DataType::Decimal256] values pub const DECIMAL_DEFAULT_SCALE: usize = 10; /// Validates that the specified `i128` value can be properly /// interpreted as a Decimal number with precision `precision` #[inline] -pub(crate) fn validate_decimal_precision(value: i128, precision: usize) -> Result { - // TODO: add validation logic for precision > 38 - if precision > 38 { - return Ok(value); +pub(crate) fn validate_decimal_precision(value: i128, precision: usize) -> Result<()> { + if precision > DECIMAL128_MAX_PRECISION { + return Err(ArrowError::InvalidArgumentError(format!( + "Max precision of a Decimal128 is {}, but got {}", + DECIMAL128_MAX_PRECISION, precision, + ))); } let max = MAX_DECIMAL_FOR_EACH_PRECISION[precision - 1]; @@ -459,65 +1002,51 @@ pub(crate) fn validate_decimal_precision(value: i128, precision: usize) -> Resul if value > max { Err(ArrowError::InvalidArgumentError(format!( - "{} is too large to store in a Decimal of precision {}. Max is {}", + "{} is too large to store in a Decimal128 of precision {}. Max is {}", value, precision, max ))) } else if value < min { Err(ArrowError::InvalidArgumentError(format!( - "{} is too small to store in a Decimal of precision {}. Min is {}", + "{} is too small to store in a Decimal128 of precision {}. Min is {}", value, precision, min ))) } else { - Ok(value) + Ok(()) } } -/// Validates that the specified string value can be properly -/// interpreted as a Decimal256 number with precision `precision` +/// Validates that the specified `byte_array` of little-endian format +/// value can be properly interpreted as a Decimal256 number with precision `precision` #[inline] -pub(crate) fn validate_decimal256_precision( - value: &str, +pub(crate) fn validate_decimal256_precision_with_lt_bytes( + lt_value: &[u8], precision: usize, -) -> Result { - if precision > 38 { - let max_str = MAX_DECIMAL_FOR_LARGER_PRECISION[precision - 38 - 1]; - let min_str = MIN_DECIMAL_FOR_LARGER_PRECISION[precision - 38 - 1]; - - let max = BigInt::from_str_radix(max_str, 10).unwrap(); - let min = BigInt::from_str_radix(min_str, 10).unwrap(); - - let value = BigInt::from_str_radix(value, 10).unwrap(); - if value > max { - Err(ArrowError::InvalidArgumentError(format!( - "{} is too large to store in a Decimal256 of precision {}. 
Max is {}", - value, precision, max - ))) - } else if value < min { - Err(ArrowError::InvalidArgumentError(format!( - "{} is too small to store in a Decimal256 of precision {}. Min is {}", - value, precision, min - ))) - } else { - Ok(value) - } +) -> Result<()> { + if precision > DECIMAL256_MAX_PRECISION { + return Err(ArrowError::InvalidArgumentError(format!( + "Max precision of a Decimal256 is {}, but got {}", + DECIMAL256_MAX_PRECISION, precision, + ))); + } + let max = MAX_DECIMAL_BYTES_FOR_LARGER_EACH_PRECISION[precision - 1]; + let min = MIN_DECIMAL_BYTES_FOR_LARGER_EACH_PRECISION[precision - 1]; + + if singed_cmp_le_bytes(lt_value, &max) == Ordering::Greater { + Err(ArrowError::InvalidArgumentError(format!( + "{:?} is too large to store in a Decimal256 of precision {}. Max is {:?}", + BigInt::from_signed_bytes_le(lt_value), + precision, + BigInt::from_signed_bytes_le(&max) + ))) + } else if singed_cmp_le_bytes(lt_value, &min) == Ordering::Less { + Err(ArrowError::InvalidArgumentError(format!( + "{:?} is too small to store in a Decimal256 of precision {}. Min is {:?}", + BigInt::from_signed_bytes_le(lt_value), + precision, + BigInt::from_signed_bytes_le(&min) + ))) } else { - let max = MAX_DECIMAL_FOR_EACH_PRECISION[precision - 1]; - let min = MIN_DECIMAL_FOR_EACH_PRECISION[precision - 1]; - let value = BigInt::from_str_radix(value, 10).unwrap(); - - if value.to_i128().unwrap() > max { - Err(ArrowError::InvalidArgumentError(format!( - "{} is too large to store in a Decimal256 of precision {}. Max is {}", - value, precision, max - ))) - } else if value.to_i128().unwrap() < min { - Err(ArrowError::InvalidArgumentError(format!( - "{} is too small to store in a Decimal256 of precision {}. Min is {}", - value, precision, min - ))) - } else { - Ok(value) - } + Ok(()) } } @@ -563,7 +1092,7 @@ impl DataType { }; if bit_width == 128 { - Ok(DataType::Decimal(precision, scale)) + Ok(DataType::Decimal128(precision, scale)) } else if bit_width == 256 { Ok(DataType::Decimal256(precision, scale)) } else { @@ -850,7 +1379,7 @@ impl DataType { TimeUnit::Nanosecond => "NANOSECOND", }}), DataType::Dictionary(_, _) => json!({ "name": "dictionary"}), - DataType::Decimal(precision, scale) => { + DataType::Decimal128(precision, scale) => { json!({"name": "decimal", "precision": precision, "scale": scale, "bitWidth": 128}) } DataType::Decimal256(precision, scale) => { @@ -934,3 +1463,32 @@ impl DataType { } } } + +#[cfg(test)] +mod test { + use crate::datatypes::datatype::{ + MAX_DECIMAL_BYTES_FOR_LARGER_EACH_PRECISION, + MIN_DECIMAL_BYTES_FOR_LARGER_EACH_PRECISION, + }; + use crate::util::decimal::Decimal256; + use num::{BigInt, Num}; + + #[test] + fn test_decimal256_min_max_for_precision() { + // The precision from 1 to 76 + let mut max_value = "9".to_string(); + let mut min_value = "-9".to_string(); + for i in 1..77 { + let max_decimal = + Decimal256::from(BigInt::from_str_radix(max_value.as_str(), 10).unwrap()); + let min_decimal = + Decimal256::from(BigInt::from_str_radix(min_value.as_str(), 10).unwrap()); + let max_bytes = MAX_DECIMAL_BYTES_FOR_LARGER_EACH_PRECISION[i - 1]; + let min_bytes = MIN_DECIMAL_BYTES_FOR_LARGER_EACH_PRECISION[i - 1]; + max_value += "9"; + min_value += "9"; + assert_eq!(max_decimal.raw_value(), &max_bytes); + assert_eq!(min_decimal.raw_value(), &min_bytes); + } + } +} diff --git a/arrow/src/datatypes/ffi.rs b/arrow/src/datatypes/ffi.rs index 7ad468b5ed9e..60d285315c0b 100644 --- a/arrow/src/datatypes/ffi.rs +++ b/arrow/src/datatypes/ffi.rs @@ -108,7 +108,7 @@ impl 
TryFrom<&FFI_ArrowSchema> for DataType { "The decimal type requires an integer scale".to_string(), ) })?; - DataType::Decimal(parsed_precision, parsed_scale) + DataType::Decimal128(parsed_precision, parsed_scale) }, [precision, scale, bits] => { if *bits != "128" { @@ -124,7 +124,7 @@ impl TryFrom<&FFI_ArrowSchema> for DataType { "The decimal type requires an integer scale".to_string(), ) })?; - DataType::Decimal(parsed_precision, parsed_scale) + DataType::Decimal128(parsed_precision, parsed_scale) } _ => { return Err(ArrowError::CDataInterface(format!( @@ -253,7 +253,9 @@ fn get_format_string(dtype: &DataType) -> Result { DataType::LargeUtf8 => Ok("U".to_string()), DataType::FixedSizeBinary(num_bytes) => Ok(format!("w:{}", num_bytes)), DataType::FixedSizeList(_, num_elems) => Ok(format!("+w:{}", num_elems)), - DataType::Decimal(precision, scale) => Ok(format!("d:{},{}", precision, scale)), + DataType::Decimal128(precision, scale) => { + Ok(format!("d:{},{}", precision, scale)) + } DataType::Date32 => Ok("tdD".to_string()), DataType::Date64 => Ok("tdm".to_string()), DataType::Time32(TimeUnit::Second) => Ok("tts".to_string()), diff --git a/arrow/src/datatypes/field.rs b/arrow/src/datatypes/field.rs index 42fb8ce1db9e..f50ebadd5e7c 100644 --- a/arrow/src/datatypes/field.rs +++ b/arrow/src/datatypes/field.rs @@ -209,23 +209,17 @@ impl Field { } fn _fields<'a>(&'a self, dt: &'a DataType) -> Vec<&Field> { - let mut collected_fields = vec![]; - match dt { DataType::Struct(fields) | DataType::Union(fields, _, _) => { - collected_fields.extend(fields.iter().flat_map(|f| f.fields())) + fields.iter().flat_map(|f| f.fields()).collect() } DataType::List(field) | DataType::LargeList(field) | DataType::FixedSizeList(field, _) - | DataType::Map(field, _) => collected_fields.extend(field.fields()), - DataType::Dictionary(_, value_field) => { - collected_fields.append(&mut self._fields(value_field.as_ref())) - } - _ => (), + | DataType::Map(field, _) => field.fields(), + DataType::Dictionary(_, value_field) => self._fields(value_field.as_ref()), + _ => vec![], } - - collected_fields } /// Returns a vector containing all (potentially nested) `Field` instances selected by the @@ -506,12 +500,10 @@ impl Field { pub fn to_json(&self) -> Value { let children: Vec = match self.data_type() { DataType::Struct(fields) => fields.iter().map(|f| f.to_json()).collect(), - DataType::List(field) => vec![field.to_json()], - DataType::LargeList(field) => vec![field.to_json()], - DataType::FixedSizeList(field, _) => vec![field.to_json()], - DataType::Map(field, _) => { - vec![field.to_json()] - } + DataType::List(field) + | DataType::LargeList(field) + | DataType::FixedSizeList(field, _) + | DataType::Map(field, _) => vec![field.to_json()], _ => vec![], }; match self.data_type() { @@ -550,6 +542,17 @@ impl Field { /// assert!(field.is_nullable()); /// ``` pub fn try_merge(&mut self, from: &Field) -> Result<()> { + if from.dict_id != self.dict_id { + return Err(ArrowError::SchemaError( + "Fail to merge schema Field due to conflicting dict_id".to_string(), + )); + } + if from.dict_is_ordered != self.dict_is_ordered { + return Err(ArrowError::SchemaError( + "Fail to merge schema Field due to conflicting dict_is_ordered" + .to_string(), + )); + } // merge metadata match (self.metadata(), from.metadata()) { (Some(self_metadata), Some(from_metadata)) => { @@ -572,31 +575,16 @@ impl Field { } _ => {} } - if from.dict_id != self.dict_id { - return Err(ArrowError::SchemaError( - "Fail to merge schema Field due to conflicting 
dict_id".to_string(), - )); - } - if from.dict_is_ordered != self.dict_is_ordered { - return Err(ArrowError::SchemaError( - "Fail to merge schema Field due to conflicting dict_is_ordered" - .to_string(), - )); - } match &mut self.data_type { DataType::Struct(nested_fields) => match &from.data_type { DataType::Struct(from_nested_fields) => { for from_field in from_nested_fields { - let mut is_new_field = true; - for self_field in nested_fields.iter_mut() { - if self_field.name != from_field.name { - continue; - } - is_new_field = false; - self_field.try_merge(from_field)?; - } - if is_new_field { - nested_fields.push(from_field.clone()); + match nested_fields + .iter_mut() + .find(|self_field| self_field.name == from_field.name) + { + Some(self_field) => self_field.try_merge(from_field)?, + None => nested_fields.push(from_field.clone()), } } } @@ -675,7 +663,7 @@ impl Field { | DataType::FixedSizeBinary(_) | DataType::Utf8 | DataType::LargeUtf8 - | DataType::Decimal(_, _) + | DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => { if self.data_type != from.data_type { return Err(ArrowError::SchemaError( @@ -685,9 +673,7 @@ impl Field { } } } - if from.nullable { - self.nullable = from.nullable; - } + self.nullable |= from.nullable; Ok(()) } @@ -698,41 +684,25 @@ impl Field { /// * self.metadata is a superset of other.metadata /// * all other fields are equal pub fn contains(&self, other: &Field) -> bool { - if self.name != other.name - || self.data_type != other.data_type - || self.dict_id != other.dict_id - || self.dict_is_ordered != other.dict_is_ordered - { - return false; - } - - if self.nullable != other.nullable && !self.nullable { - return false; - } - + self.name == other.name + && self.data_type == other.data_type + && self.dict_id == other.dict_id + && self.dict_is_ordered == other.dict_is_ordered + // self need to be nullable or both of them are not nullable + && (self.nullable || !other.nullable) // make sure self.metadata is a superset of other.metadata - match (&self.metadata, &other.metadata) { - (None, Some(_)) => { - return false; - } + && match (&self.metadata, &other.metadata) { + (_, None) => true, + (None, Some(_)) => false, (Some(self_meta), Some(other_meta)) => { - for (k, v) in other_meta.iter() { + other_meta.iter().all(|(k, v)| { match self_meta.get(k) { - Some(s) => { - if s != v { - return false; - } - } - None => { - return false; - } + Some(s) => s == v, + None => false } - } + }) } - _ => {} } - - true } } @@ -745,7 +715,7 @@ impl std::fmt::Display for Field { #[cfg(test)] mod test { - use super::{DataType, Field}; + use super::*; use std::collections::hash_map::DefaultHasher; use std::hash::{Hash, Hasher}; @@ -840,4 +810,72 @@ mod test { assert_ne!(dict1, dict2); assert_ne!(get_field_hash(&dict1), get_field_hash(&dict2)); } + + #[test] + fn test_contains_reflexivity() { + let mut field = Field::new("field1", DataType::Float16, false); + field.set_metadata(Some(BTreeMap::from([ + (String::from("k0"), String::from("v0")), + (String::from("k1"), String::from("v1")), + ]))); + assert!(field.contains(&field)) + } + + #[test] + fn test_contains_transitivity() { + let child_field = Field::new("child1", DataType::Float16, false); + + let mut field1 = Field::new("field1", DataType::Struct(vec![child_field]), false); + field1.set_metadata(Some(BTreeMap::from([( + String::from("k1"), + String::from("v1"), + )]))); + + let mut field2 = Field::new("field1", DataType::Struct(vec![]), true); + field2.set_metadata(Some(BTreeMap::from([( + String::from("k2"), + 
String::from("v2"), + )]))); + field2.try_merge(&field1).unwrap(); + + let mut field3 = Field::new("field1", DataType::Struct(vec![]), false); + field3.set_metadata(Some(BTreeMap::from([( + String::from("k3"), + String::from("v3"), + )]))); + field3.try_merge(&field2).unwrap(); + + assert!(field2.contains(&field1)); + assert!(field3.contains(&field2)); + assert!(field3.contains(&field1)); + + assert!(!field1.contains(&field2)); + assert!(!field1.contains(&field3)); + assert!(!field2.contains(&field3)); + } + + #[test] + fn test_contains_nullable() { + let field1 = Field::new("field1", DataType::Boolean, true); + let field2 = Field::new("field1", DataType::Boolean, false); + assert!(field1.contains(&field2)); + assert!(!field2.contains(&field1)); + } + + #[test] + fn test_contains_must_have_same_fields() { + let child_field1 = Field::new("child1", DataType::Float16, false); + let child_field2 = Field::new("child2", DataType::Float16, false); + + let field1 = + Field::new("field1", DataType::Struct(vec![child_field1.clone()]), true); + let field2 = Field::new( + "field1", + DataType::Struct(vec![child_field1, child_field2]), + true, + ); + + assert!(!field1.contains(&field2)); + assert!(!field2.contains(&field1)); + } } diff --git a/arrow/src/datatypes/mod.rs b/arrow/src/datatypes/mod.rs index c082bc64c660..1f98a4afa918 100644 --- a/arrow/src/datatypes/mod.rs +++ b/arrow/src/datatypes/mod.rs @@ -37,8 +37,10 @@ pub use types::*; mod datatype; pub use datatype::*; mod delta; -mod ffi; +#[cfg(feature = "ffi")] +mod ffi; +#[cfg(feature = "ffi")] pub use ffi::*; /// A reference-counted reference to a [`Schema`](crate::datatypes::Schema). @@ -1481,23 +1483,31 @@ mod tests { .is_err()); // incompatible metadata should throw error - assert!(Schema::try_merge(vec![ + let res = Schema::try_merge(vec![ Schema::new_with_metadata( vec![Field::new("first_name", DataType::Utf8, false)], - [("foo".to_string(), "bar".to_string()),] + [("foo".to_string(), "bar".to_string())] .iter() .cloned() - .collect::>() + .collect::>(), ), Schema::new_with_metadata( vec![Field::new("last_name", DataType::Utf8, false)], - [("foo".to_string(), "baz".to_string()),] + [("foo".to_string(), "baz".to_string())] .iter() .cloned() - .collect::>() - ) + .collect::>(), + ), ]) - .is_err()); + .unwrap_err(); + + let expected = "Fail to merge schema due to conflicting metadata. Key 'foo' has different values 'bar' and 'baz'"; + assert!( + res.to_string().contains(expected), + "Could not find expected string '{}' in '{}'", + expected, + res + ); Ok(()) } diff --git a/arrow/src/datatypes/schema.rs b/arrow/src/datatypes/schema.rs index 1574b165462f..3ab627c687ea 100644 --- a/arrow/src/datatypes/schema.rs +++ b/arrow/src/datatypes/schema.rs @@ -33,11 +33,11 @@ use super::Field; /// memory layout. #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] pub struct Schema { - pub(crate) fields: Vec, + pub fields: Vec, /// A map of key-value pairs containing additional meta data. #[serde(skip_serializing_if = "HashMap::is_empty")] #[serde(default)] - pub(crate) metadata: HashMap, + pub metadata: HashMap, } impl Schema { @@ -149,10 +149,11 @@ impl Schema { // merge metadata if let Some(old_val) = merged.metadata.get(&key) { if old_val != &value { - return Err(ArrowError::SchemaError( - "Fail to merge schema due to conflicting metadata." - .to_string(), - )); + return Err(ArrowError::SchemaError(format!( + "Fail to merge schema due to conflicting metadata. 
\ + Key '{}' has different values '{}' and '{}'", + key, old_val, value + ))); } } merged.metadata.insert(key, value); diff --git a/arrow/src/datatypes/types.rs b/arrow/src/datatypes/types.rs index 223f969285ec..8c0ac7b3fd6e 100644 --- a/arrow/src/datatypes/types.rs +++ b/arrow/src/datatypes/types.rs @@ -232,6 +232,18 @@ impl IntervalDayTimeType { days: i32, millis: i32, ) -> ::Native { + /* + https://github.com/apache/arrow/blob/02c8598d264c839a5b5cf3109bfd406f3b8a6ba5/cpp/src/arrow/type.h#L1433 + struct DayMilliseconds { + int32_t days = 0; + int32_t milliseconds = 0; + ... + } + 64 56 48 40 32 24 16 8 0 + +-------+-------+-------+-------+-------+-------+-------+-------+ + | days | milliseconds | + +-------+-------+-------+-------+-------+-------+-------+-------+ + */ let m = millis as u64 & u32::MAX as u64; let d = (days as u64 & u32::MAX as u64) << 32; (m | d) as ::Native @@ -264,9 +276,21 @@ impl IntervalMonthDayNanoType { days: i32, nanos: i64, ) -> ::Native { - let m = months as u128 & u32::MAX as u128; - let d = (days as u128 & u32::MAX as u128) << 32; - let n = (nanos as u128) << 64; + /* + https://github.com/apache/arrow/blob/02c8598d264c839a5b5cf3109bfd406f3b8a6ba5/cpp/src/arrow/type.h#L1475 + struct MonthDayNanos { + int32_t months; + int32_t days; + int64_t nanoseconds; + } + 128 112 96 80 64 48 32 16 0 + +-------+-------+-------+-------+-------+-------+-------+-------+ + | months | days | nanos | + +-------+-------+-------+-------+-------+-------+-------+-------+ + */ + let m = (months as u128 & u32::MAX as u128) << 96; + let d = (days as u128 & u32::MAX as u128) << 64; + let n = nanos as u128 & u64::MAX as u128; (m | d | n) as ::Native } @@ -278,9 +302,9 @@ impl IntervalMonthDayNanoType { pub fn to_parts( i: ::Native, ) -> (i32, i32, i64) { - let nanos = (i >> 64) as i64; - let days = (i >> 32) as i32; - let months = i as i32; + let months = (i >> 96) as i32; + let days = (i >> 64) as i32; + let nanos = i as i64; (months, days, nanos) } } @@ -430,3 +454,44 @@ impl Date64Type { Date64Type::from_naive_date(res) } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn month_day_nano_should_roundtrip() { + let value = IntervalMonthDayNanoType::make_value(1, 2, 3); + assert_eq!(IntervalMonthDayNanoType::to_parts(value), (1, 2, 3)); + } + + #[test] + fn month_day_nano_should_roundtrip_neg() { + let value = IntervalMonthDayNanoType::make_value(-1, -2, -3); + assert_eq!(IntervalMonthDayNanoType::to_parts(value), (-1, -2, -3)); + } + + #[test] + fn day_time_should_roundtrip() { + let value = IntervalDayTimeType::make_value(1, 2); + assert_eq!(IntervalDayTimeType::to_parts(value), (1, 2)); + } + + #[test] + fn day_time_should_roundtrip_neg() { + let value = IntervalDayTimeType::make_value(-1, -2); + assert_eq!(IntervalDayTimeType::to_parts(value), (-1, -2)); + } + + #[test] + fn year_month_should_roundtrip() { + let value = IntervalYearMonthType::make_value(1, 2); + assert_eq!(IntervalYearMonthType::to_months(value), 14); + } + + #[test] + fn year_month_should_roundtrip_neg() { + let value = IntervalYearMonthType::make_value(-1, -2); + assert_eq!(IntervalYearMonthType::to_months(value), -14); + } +} diff --git a/arrow/src/ffi.rs b/arrow/src/ffi.rs index 2d95b4ea639a..528f3adc2d84 100644 --- a/arrow/src/ffi.rs +++ b/arrow/src/ffi.rs @@ -29,14 +29,16 @@ //! # use arrow::array::{Int32Array, Array, ArrayData, export_array_into_raw, make_array, make_array_from_raw}; //! # use arrow::error::{Result, ArrowError}; //! # use arrow::compute::kernels::arithmetic; -//! 
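The layout comments added to `IntervalMonthDayNanoType` pin down the packing that the shift fix implements: months in the top 32 bits, days in the next 32, nanoseconds in the low 64. The same shifts, extracted as a standalone round-trip:

```rust
/// Pack (months, days, nanoseconds) into a single i128 and back, using the
/// layout documented above: months << 96 | days << 64 | nanos.
fn make_value(months: i32, days: i32, nanos: i64) -> i128 {
    let m = (months as u128 & u32::MAX as u128) << 96;
    let d = (days as u128 & u32::MAX as u128) << 64;
    let n = nanos as u128 & u64::MAX as u128;
    (m | d | n) as i128
}

fn to_parts(i: i128) -> (i32, i32, i64) {
    ((i >> 96) as i32, (i >> 64) as i32, i as i64)
}

fn main() {
    // round-trips for positive and negative components
    assert_eq!(to_parts(make_value(1, 2, 3)), (1, 2, 3));
    assert_eq!(to_parts(make_value(-1, -2, -3)), (-1, -2, -3));
}
```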
# use arrow::ffi::{FFI_ArrowArray, FFI_ArrowSchema}; +//! # use arrow::ffi::{ArrowArray, FFI_ArrowArray, FFI_ArrowSchema}; //! # use std::convert::TryFrom; //! # fn main() -> Result<()> { //! // create an array natively //! let array = Int32Array::from(vec![Some(1), None, Some(3)]); //! //! // export it -//! let (array_ptr, schema_ptr) = array.to_raw()?; +//! +//! let ffi_array = ArrowArray::try_new(array.data().clone())?; +//! let (array_ptr, schema_ptr) = ArrowArray::into_raw(ffi_array); //! //! // consumed and used by something else... //! @@ -322,7 +324,7 @@ fn bit_width(data_type: &DataType, i: usize) -> Result { (DataType::Int64, 1) | (DataType::Date64, 1) | (DataType::Time64(_), 1) => size_of::() * 8, (DataType::Float32, 1) => size_of::() * 8, (DataType::Float64, 1) => size_of::() * 8, - (DataType::Decimal(..), 1) => size_of::() * 8, + (DataType::Decimal128(..), 1) => size_of::() * 8, (DataType::Timestamp(..), 1) => size_of::() * 8, (DataType::Duration(..), 1) => size_of::() * 8, // primitive types have a single buffer @@ -337,7 +339,7 @@ fn bit_width(data_type: &DataType, i: usize) -> Result { (DataType::Int64, _) | (DataType::Date64, _) | (DataType::Time64(_), _) | (DataType::Float32, _) | (DataType::Float64, _) | - (DataType::Decimal(..), _) | + (DataType::Decimal128(..), _) | (DataType::Timestamp(..), _) | (DataType::Duration(..), _) => { return Err(ArrowError::CDataInterface(format!( @@ -456,7 +458,7 @@ struct ArrayPrivateData { impl FFI_ArrowArray { /// creates a new `FFI_ArrowArray` from existing data. - /// # Safety + /// # Memory Leaks /// This method releases `buffers`. Consumers of this struct *must* call `release` before /// releasing this struct, or contents in `buffers` leak. pub fn new(data: &ArrayData) -> Self { @@ -836,10 +838,11 @@ impl<'a> ArrowArrayRef for ArrowArrayChild<'a> { impl ArrowArray { /// creates a new `ArrowArray`. This is used to export to the C Data Interface. - /// # Safety - /// See safety of [ArrowArray] - #[allow(clippy::too_many_arguments)] - pub unsafe fn try_new(data: ArrayData) -> Result { + /// + /// # Memory Leaks + /// This method releases `buffers`. Consumers of this struct *must* call `release` before + /// releasing this struct, or contents in `buffers` leak. + pub fn try_new(data: ArrayData) -> Result { let array = Arc::new(FFI_ArrowArray::new(&data)); let schema = Arc::new(FFI_ArrowSchema::try_from(data.data_type())?); Ok(ArrowArray { array, schema }) @@ -953,7 +956,7 @@ mod tests { .unwrap(); // export it - let array = ArrowArray::try_from(original_array.data().clone())?; + let array = ArrowArray::try_from(Array::data(&original_array).clone())?; // (simulate consumer) import it let data = ArrayData::try_from(array)?; @@ -1030,12 +1033,9 @@ mod tests { .collect::(); // Construct a list array from the above two - let list_data_type = match std::mem::size_of::() { - 4 => DataType::List(Box::new(Field::new("item", DataType::Int32, false))), - _ => { - DataType::LargeList(Box::new(Field::new("item", DataType::Int32, false))) - } - }; + let list_data_type = GenericListArray::::DATA_TYPE_CONSTRUCTOR(Box::new( + Field::new("item", DataType::Int32, false), + )); let list_data = ArrayData::builder(list_data_type) .len(3) diff --git a/arrow/src/ipc/compression/codec.rs b/arrow/src/ipc/compression/codec.rs new file mode 100644 index 000000000000..9d870fc22241 --- /dev/null +++ b/arrow/src/ipc/compression/codec.rs @@ -0,0 +1,205 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::buffer::Buffer;
+use crate::error::{ArrowError, Result};
+use crate::ipc::CompressionType;
+use std::io::{Read, Write};
+
+const LENGTH_NO_COMPRESSED_DATA: i64 = -1;
+const LENGTH_OF_PREFIX_DATA: i64 = 8;
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+/// Represents compressing an IPC stream using a particular compression algorithm
+pub enum CompressionCodec {
+    Lz4Frame,
+    Zstd,
+}
+
+impl TryFrom<CompressionType> for CompressionCodec {
+    type Error = ArrowError;
+
+    fn try_from(compression_type: CompressionType) -> Result<Self> {
+        match compression_type {
+            CompressionType::ZSTD => Ok(CompressionCodec::Zstd),
+            CompressionType::LZ4_FRAME => Ok(CompressionCodec::Lz4Frame),
+            other_type => Err(ArrowError::NotYetImplemented(format!(
+                "compression type {:?} not supported",
+                other_type
+            ))),
+        }
+    }
+}
+
+impl CompressionCodec {
+    /// Compresses the data in `input`, appending it to `output` using the
+    /// specified compression mechanism.
+    ///
+    /// Returns the number of bytes written to the stream.
+    ///
+    /// Writes this format to output:
+    /// ```text
+    /// [8 bytes]: uncompressed length
+    /// [remaining bytes]: compressed data stream
+    /// ```
+    pub(crate) fn compress_to_vec(
+        &self,
+        input: &[u8],
+        output: &mut Vec<u8>,
+    ) -> Result<usize> {
+        let uncompressed_data_len = input.len();
+        let original_output_len = output.len();
+
+        if input.is_empty() {
+            // empty input, nothing to do
+        } else {
+            // write compressed data directly into the output buffer
+            output.extend_from_slice(&uncompressed_data_len.to_le_bytes());
+            self.compress(input, output)?;
+
+            let compression_len = output.len();
+            if compression_len > uncompressed_data_len {
+                // length of compressed data was larger than
+                // uncompressed data, use the uncompressed data with
+                // length -1 to indicate that we don't compress the
+                // data
+                output.truncate(original_output_len);
+                output.extend_from_slice(&LENGTH_NO_COMPRESSED_DATA.to_le_bytes());
+                output.extend_from_slice(input);
+            }
+        }
+        Ok(output.len() - original_output_len)
+    }
+
+    /// Decompresses the input into a [`Buffer`]
+    ///
+    /// The input should look like:
+    /// ```text
+    /// [8 bytes]: uncompressed length
+    /// [remaining bytes]: compressed data stream
+    /// ```
+    pub(crate) fn decompress_to_buffer(&self, input: &[u8]) -> Result<Buffer> {
+        // read the first 8 bytes to determine if the data is
+        // compressed
+        let decompressed_length = read_uncompressed_size(input);
+        let buffer = if decompressed_length == 0 {
+            // empty
+            let empty = Vec::<u8>::new();
+            Buffer::from(empty)
+        } else if decompressed_length == LENGTH_NO_COMPRESSED_DATA {
+            // no compression
+            let data = &input[(LENGTH_OF_PREFIX_DATA as usize)..];
+            Buffer::from(data)
+        } else {
+            // decompress data using the codec
+            let mut uncompressed_buffer =
+                Vec::with_capacity(decompressed_length as usize);
+            let input_data =
&input[(LENGTH_OF_PREFIX_DATA as usize)..]; + self.decompress(input_data, &mut uncompressed_buffer)?; + Buffer::from(uncompressed_buffer) + }; + Ok(buffer) + } + + /// Compress the data in input buffer and write to output buffer + /// using the specified compression + fn compress(&self, input: &[u8], output: &mut Vec) -> Result<()> { + match self { + CompressionCodec::Lz4Frame => { + let mut encoder = lz4::EncoderBuilder::new().build(output)?; + encoder.write_all(input)?; + match encoder.finish().1 { + Ok(_) => Ok(()), + Err(e) => Err(e.into()), + } + } + CompressionCodec::Zstd => { + let mut encoder = zstd::Encoder::new(output, 0)?; + encoder.write_all(input)?; + match encoder.finish() { + Ok(_) => Ok(()), + Err(e) => Err(e.into()), + } + } + } + } + + /// Decompress the data in input buffer and write to output buffer + /// using the specified compression + fn decompress(&self, input: &[u8], output: &mut Vec) -> Result { + let result: Result = match self { + CompressionCodec::Lz4Frame => { + let mut decoder = lz4::Decoder::new(input)?; + match decoder.read_to_end(output) { + Ok(size) => Ok(size), + Err(e) => Err(e.into()), + } + } + CompressionCodec::Zstd => { + let mut decoder = zstd::Decoder::new(input)?; + match decoder.read_to_end(output) { + Ok(size) => Ok(size), + Err(e) => Err(e.into()), + } + } + }; + result + } +} + +/// Get the uncompressed length +/// Notes: +/// LENGTH_NO_COMPRESSED_DATA: indicate that the data that follows is not compressed +/// 0: indicate that there is no data +/// positive number: indicate the uncompressed length for the following data +#[inline] +fn read_uncompressed_size(buffer: &[u8]) -> i64 { + let len_buffer = &buffer[0..8]; + // 64-bit little-endian signed integer + i64::from_le_bytes(len_buffer.try_into().unwrap()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_lz4_compression() { + let input_bytes = "hello lz4".as_bytes(); + let codec: CompressionCodec = CompressionCodec::Lz4Frame; + let mut output_bytes: Vec = Vec::new(); + codec.compress(input_bytes, &mut output_bytes).unwrap(); + let mut result_output_bytes: Vec = Vec::new(); + codec + .decompress(output_bytes.as_slice(), &mut result_output_bytes) + .unwrap(); + assert_eq!(input_bytes, result_output_bytes.as_slice()); + } + + #[test] + fn test_zstd_compression() { + let input_bytes = "hello zstd".as_bytes(); + let codec: CompressionCodec = CompressionCodec::Zstd; + let mut output_bytes: Vec = Vec::new(); + codec.compress(input_bytes, &mut output_bytes).unwrap(); + let mut result_output_bytes: Vec = Vec::new(); + codec + .decompress(output_bytes.as_slice(), &mut result_output_bytes) + .unwrap(); + assert_eq!(input_bytes, result_output_bytes.as_slice()); + } +} diff --git a/arrow/src/ipc/compression/ipc_compression.rs b/arrow/src/ipc/compression/ipc_compression.rs deleted file mode 100644 index 3b7305c74b8e..000000000000 --- a/arrow/src/ipc/compression/ipc_compression.rs +++ /dev/null @@ -1,126 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use crate::ipc::CompressionType; - -#[derive(Debug, Clone, Copy, PartialEq)] -pub enum CompressionCodecType { - Lz4Frame, - Zstd, -} - -impl From for CompressionCodecType { - fn from(compression_type: CompressionType) -> Self { - match compression_type { - CompressionType::ZSTD => CompressionCodecType::Zstd, - CompressionType::LZ4_FRAME => CompressionCodecType::Lz4Frame, - other_type => { - unimplemented!("Not support compression type: {:?}", other_type) - } - } - } -} - -impl From for CompressionType { - fn from(codec: CompressionCodecType) -> Self { - match codec { - CompressionCodecType::Lz4Frame => CompressionType::LZ4_FRAME, - CompressionCodecType::Zstd => CompressionType::ZSTD, - } - } -} - -#[cfg(any(feature = "ipc_compression", test))] -mod compression_function { - use crate::error::Result; - use crate::ipc::compression::ipc_compression::CompressionCodecType; - use std::io::{Read, Write}; - - impl CompressionCodecType { - pub fn compress(&self, input: &[u8], output: &mut Vec) -> Result<()> { - match self { - CompressionCodecType::Lz4Frame => { - let mut encoder = lz4::EncoderBuilder::new().build(output)?; - encoder.write_all(input)?; - match encoder.finish().1 { - Ok(_) => Ok(()), - Err(e) => Err(e.into()), - } - } - CompressionCodecType::Zstd => { - let mut encoder = zstd::Encoder::new(output, 0)?; - encoder.write_all(input)?; - match encoder.finish() { - Ok(_) => Ok(()), - Err(e) => Err(e.into()), - } - } - } - } - - pub fn decompress(&self, input: &[u8], output: &mut Vec) -> Result { - let result: Result = match self { - CompressionCodecType::Lz4Frame => { - let mut decoder = lz4::Decoder::new(input)?; - match decoder.read_to_end(output) { - Ok(size) => Ok(size), - Err(e) => Err(e.into()), - } - } - CompressionCodecType::Zstd => { - let mut decoder = zstd::Decoder::new(input)?; - match decoder.read_to_end(output) { - Ok(size) => Ok(size), - Err(e) => Err(e.into()), - } - } - }; - result - } - } -} - -#[cfg(test)] -mod tests { - use crate::ipc::compression::ipc_compression::CompressionCodecType; - - #[test] - fn test_lz4_compression() { - let input_bytes = "hello lz4".as_bytes(); - let codec: CompressionCodecType = CompressionCodecType::Lz4Frame; - let mut output_bytes: Vec = Vec::new(); - codec.compress(input_bytes, &mut output_bytes).unwrap(); - let mut result_output_bytes: Vec = Vec::new(); - codec - .decompress(output_bytes.as_slice(), &mut result_output_bytes) - .unwrap(); - assert_eq!(input_bytes, result_output_bytes.as_slice()); - } - - #[test] - fn test_zstd_compression() { - let input_bytes = "hello zstd".as_bytes(); - let codec: CompressionCodecType = CompressionCodecType::Zstd; - let mut output_bytes: Vec = Vec::new(); - codec.compress(input_bytes, &mut output_bytes).unwrap(); - let mut result_output_bytes: Vec = Vec::new(); - codec - .decompress(output_bytes.as_slice(), &mut result_output_bytes) - .unwrap(); - assert_eq!(input_bytes, result_output_bytes.as_slice()); - } -} diff --git a/arrow/src/ipc/compression/mod.rs b/arrow/src/ipc/compression/mod.rs index 1cdac812c800..666fa6d86a27 100644 --- a/arrow/src/ipc/compression/mod.rs +++ 
b/arrow/src/ipc/compression/mod.rs @@ -15,7 +15,12 @@ // specific language governing permissions and limitations // under the License. -pub(crate) mod ipc_compression; -pub(crate) const LENGTH_EMPTY_COMPRESSED_DATA: i64 = 0; -pub(crate) const LENGTH_NO_COMPRESSED_DATA: i64 = -1; -pub(crate) const LENGTH_OF_PREFIX_DATA: i64 = 8; +#[cfg(feature = "ipc_compression")] +mod codec; +#[cfg(feature = "ipc_compression")] +pub(crate) use codec::CompressionCodec; + +#[cfg(not(feature = "ipc_compression"))] +mod stub; +#[cfg(not(feature = "ipc_compression"))] +pub(crate) use stub::CompressionCodec; diff --git a/arrow/src/ipc/compression/stub.rs b/arrow/src/ipc/compression/stub.rs new file mode 100644 index 000000000000..6240f084be3f --- /dev/null +++ b/arrow/src/ipc/compression/stub.rs @@ -0,0 +1,63 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Stubs that implement the same interface as the ipc_compression +//! codec module, but always errors. + +use crate::buffer::Buffer; +use crate::error::{ArrowError, Result}; +use crate::ipc::CompressionType; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum CompressionCodec {} + +impl TryFrom for CompressionType { + type Error = ArrowError; + fn try_from(codec: CompressionCodec) -> Result { + Err(ArrowError::InvalidArgumentError( + format!("codec type {:?} not supported because arrow was not compiled with the ipc_compression feature", codec))) + } +} + +impl TryFrom for CompressionCodec { + type Error = ArrowError; + + fn try_from(compression_type: CompressionType) -> Result { + Err(ArrowError::InvalidArgumentError( + format!("compression type {:?} not supported because arrow was not compiled with the ipc_compression feature", compression_type)) + ) + } +} + +impl CompressionCodec { + #[allow(clippy::ptr_arg)] + pub(crate) fn compress_to_vec( + &self, + _input: &[u8], + _output: &mut Vec, + ) -> Result { + Err(ArrowError::InvalidArgumentError( + "compression not supported because arrow was not compiled with the ipc_compression feature".to_string() + )) + } + + pub(crate) fn decompress_to_buffer(&self, _input: &[u8]) -> Result { + Err(ArrowError::InvalidArgumentError( + "decompression not supported because arrow was not compiled with the ipc_compression feature".to_string() + )) + } +} diff --git a/arrow/src/ipc/convert.rs b/arrow/src/ipc/convert.rs index dbbb6b961a10..705bd5cb3012 100644 --- a/arrow/src/ipc/convert.rs +++ b/arrow/src/ipc/convert.rs @@ -322,7 +322,7 @@ pub(crate) fn get_data_type(field: ipc::Field, may_be_dictionary: bool) -> DataT let fsb = field.type_as_decimal().unwrap(); let bit_width = fsb.bitWidth(); if bit_width == 128 { - DataType::Decimal(fsb.precision() as usize, fsb.scale() as usize) + DataType::Decimal128(fsb.precision() as usize, fsb.scale() as usize) } else if 
bit_width == 256 { DataType::Decimal256(fsb.precision() as usize, fsb.scale() as usize) } else { @@ -667,7 +667,7 @@ pub(crate) fn get_fb_field_type<'a>( // type in the DictionaryEncoding metadata in the parent field get_fb_field_type(value_type, is_nullable, fbb) } - Decimal(precision, scale) => { + Decimal128(precision, scale) => { let mut builder = ipc::DecimalBuilder::new(fbb); builder.add_precision(*precision as i32); builder.add_scale(*scale as i32); @@ -965,7 +965,7 @@ mod tests { 123, true, ), - Field::new("decimal", DataType::Decimal(10, 6), false), + Field::new("decimal", DataType::Decimal128(10, 6), false), ], md, ); diff --git a/arrow/src/ipc/reader.rs b/arrow/src/ipc/reader.rs index a586dacd785f..393128371b19 100644 --- a/arrow/src/ipc/reader.rs +++ b/arrow/src/ipc/reader.rs @@ -21,6 +21,7 @@ //! however the `FileReader` expects a reader that supports `Seek`ing use std::collections::HashMap; +use std::fmt; use std::io::{BufReader, Read, Seek, SeekFrom}; use std::sync::Arc; @@ -32,11 +33,7 @@ use crate::error::{ArrowError, Result}; use crate::ipc; use crate::record_batch::{RecordBatch, RecordBatchOptions, RecordBatchReader}; -use crate::ipc::compression::ipc_compression::CompressionCodecType; -use crate::ipc::compression::{ - LENGTH_EMPTY_COMPRESSED_DATA, LENGTH_NO_COMPRESSED_DATA, LENGTH_OF_PREFIX_DATA, -}; -use crate::ipc::CompressionType; +use crate::ipc::compression::CompressionCodec; use ipc::CONTINUATION_MARKER; use DataType::*; @@ -52,60 +49,21 @@ use DataType::*; fn read_buffer( buf: &ipc::Buffer, a_data: &[u8], - compression_codec: &Option, -) -> Buffer { + compression_codec: &Option, +) -> Result { let start_offset = buf.offset() as usize; let end_offset = start_offset + buf.length() as usize; let buf_data = &a_data[start_offset..end_offset]; // corner case: empty buffer if buf_data.is_empty() { - return Buffer::from(buf_data); + return Ok(Buffer::from(buf_data)); } match compression_codec { - Some(_decompressor) if cfg!(feature = "ipc_compression") || cfg!(test) => { - // 8byte + data - // read the first 8 bytes - // if the data is compressed, decompress the data, otherwise return as is - let decompressed_length = read_uncompressed_size(buf_data); - if decompressed_length == LENGTH_EMPTY_COMPRESSED_DATA { - // emtpy - let empty = Vec::::new(); - Buffer::from(empty) - } else if decompressed_length == LENGTH_NO_COMPRESSED_DATA { - // not compress - let data = &buf_data[(LENGTH_OF_PREFIX_DATA as usize)..]; - Buffer::from(data) - } else { - // decompress data using the codec - let mut _uncompressed_buffer = - Vec::with_capacity(decompressed_length as usize); - let _input_data = &buf_data[(LENGTH_OF_PREFIX_DATA as usize)..]; - #[cfg(any(feature = "ipc_compression", test))] - _decompressor - .decompress(_input_data, &mut _uncompressed_buffer) - .unwrap(); - Buffer::from(_uncompressed_buffer) - } - } - None => Buffer::from(buf_data), - _ => { - panic!("IPC compression not supported. 
Compile with feature 'ipc_compression' to enable"); - } + Some(decompressor) => decompressor.decompress_to_buffer(buf_data), + None => Ok(Buffer::from(buf_data)), } } -/// Get the uncompressed length -/// Notes: -/// -1: indicate that the data that follows is not compressed -/// 0: indicate that there is no data -/// positive number: indicate the uncompressed length for the following data -#[inline] -fn read_uncompressed_size(buffer: &[u8]) -> i64 { - let len_buffer = &buffer[0..8]; - // 64-bit little-endian signed integer - i64::from_le_bytes(len_buffer.try_into().unwrap()) -} - /// Coordinates reading arrays based on data types. /// /// Notes: @@ -124,7 +82,7 @@ fn create_array( dictionaries_by_id: &HashMap, mut node_index: usize, mut buffer_index: usize, - compression_codec: &Option, + compression_codec: &Option, metadata: &ipc::MetadataVersion, ) -> Result<(ArrayRef, usize, usize)> { use DataType::*; @@ -137,7 +95,7 @@ fn create_array( buffers[buffer_index..buffer_index + 3] .iter() .map(|buf| read_buffer(buf, data, compression_codec)) - .collect(), + .collect::>()?, ); node_index += 1; buffer_index += 3; @@ -150,7 +108,7 @@ fn create_array( buffers[buffer_index..buffer_index + 2] .iter() .map(|buf| read_buffer(buf, data, compression_codec)) - .collect(), + .collect::>()?, ); node_index += 1; buffer_index += 2; @@ -161,7 +119,7 @@ fn create_array( let list_buffers: Vec = buffers[buffer_index..buffer_index + 2] .iter() .map(|buf| read_buffer(buf, data, compression_codec)) - .collect(); + .collect::>()?; node_index += 1; buffer_index += 2; let triple = create_array( @@ -185,7 +143,7 @@ fn create_array( let list_buffers: Vec = buffers[buffer_index..=buffer_index] .iter() .map(|buf| read_buffer(buf, data, compression_codec)) - .collect(); + .collect::>()?; node_index += 1; buffer_index += 1; let triple = create_array( @@ -207,7 +165,7 @@ fn create_array( Struct(struct_fields) => { let struct_node = &nodes[node_index]; let null_buffer: Buffer = - read_buffer(&buffers[buffer_index], data, compression_codec); + read_buffer(&buffers[buffer_index], data, compression_codec)?; node_index += 1; buffer_index += 1; @@ -246,7 +204,7 @@ fn create_array( let index_buffers: Vec = buffers[buffer_index..buffer_index + 2] .iter() .map(|buf| read_buffer(buf, data, compression_codec)) - .collect(); + .collect::>()?; let dict_id = field.dict_id().ok_or_else(|| { ArrowError::IoError(format!("Field {} does not have dict id", field)) @@ -277,12 +235,12 @@ fn create_array( // In V4, union types has validity bitmap // In V5 and later, union types have no validity bitmap if metadata < &ipc::MetadataVersion::V5 { - read_buffer(&buffers[buffer_index], data, compression_codec); + read_buffer(&buffers[buffer_index], data, compression_codec)?; buffer_index += 1; } let type_ids: Buffer = - read_buffer(&buffers[buffer_index], data, compression_codec)[..len] + read_buffer(&buffers[buffer_index], data, compression_codec)?[..len] .into(); buffer_index += 1; @@ -290,7 +248,7 @@ fn create_array( let value_offsets = match mode { UnionMode::Dense => { let buffer = - read_buffer(&buffers[buffer_index], data, compression_codec); + read_buffer(&buffers[buffer_index], data, compression_codec)?; buffer_index += 1; Some(buffer[..len * 4].into()) } @@ -349,7 +307,7 @@ fn create_array( buffers[buffer_index..buffer_index + 2] .iter() .map(|buf| read_buffer(buf, data, compression_codec)) - .collect(), + .collect::>()?, ); node_index += 1; buffer_index += 2; @@ -577,7 +535,7 @@ fn create_primitive_array( unsafe { builder.build_unchecked() 
} } - Decimal(_, _) | Decimal256(_, _) => { + Decimal128(_, _) | Decimal256(_, _) => { // read 3 buffers let builder = ArrayData::builder(data_type.clone()) .len(length) @@ -674,18 +632,10 @@ pub fn read_record_batch( let field_nodes = batch.nodes().ok_or_else(|| { ArrowError::IoError("Unable to get field nodes from IPC RecordBatch".to_string()) })?; - let option_compression = batch.compression(); - let compression_codec = match option_compression { - None => Ok(None), - Some(compression) => match compression.codec() { - CompressionType::ZSTD => Ok(Some(CompressionCodecType::Zstd)), - CompressionType::LZ4_FRAME => Ok(Some(CompressionCodecType::Lz4Frame)), - other_type => Err(ArrowError::InvalidArgumentError(format!( - "Not support compression type: {:?}", - other_type - ))), - }, - }?; + let batch_compression = batch.compression(); + let compression_codec: Option = batch_compression + .map(|batch_compression| batch_compression.codec().try_into()) + .transpose()?; // keep track of buffer and node index, the functions that create arrays mutate these let mut buffer_index = 0; @@ -847,6 +797,21 @@ pub struct FileReader { projection: Option<(Vec, Schema)>, } +impl fmt::Debug for FileReader { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::result::Result<(), fmt::Error> { + f.debug_struct("FileReader") + .field("reader", &"BufReader<..>") + .field("schema", &self.schema) + .field("blocks", &self.blocks) + .field("current_block", &self.current_block) + .field("total_blocks", &self.total_blocks) + .field("dictionaries_by_id", &self.dictionaries_by_id) + .field("metadata_version", &self.metadata_version) + .field("projection", &self.projection) + .finish() + } +} + impl FileReader { /// Try to create a new file reader /// @@ -1098,6 +1063,18 @@ pub struct StreamReader { projection: Option<(Vec, Schema)>, } +impl fmt::Debug for StreamReader { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::result::Result<(), fmt::Error> { + f.debug_struct("StreamReader") + .field("reader", &"BufReader<..>") + .field("schema", &self.schema) + .field("dictionaries_by_id", &self.dictionaries_by_id) + .field("finished", &self.finished) + .field("projection", &self.projection) + .finish() + } +} + impl StreamReader { /// Try to create a new stream reader /// @@ -1300,7 +1277,7 @@ mod tests { // read expected JSON output let arrow_json = read_gzip_json(version, path); - assert!(arrow_json.equals_reader(&mut reader)); + assert!(arrow_json.equals_reader(&mut reader).unwrap()); }); } @@ -1421,7 +1398,7 @@ mod tests { // read expected JSON output let arrow_json = read_gzip_json(version, path); - assert!(arrow_json.equals_reader(&mut reader)); + assert!(arrow_json.equals_reader(&mut reader).unwrap()); // the next batch must be empty assert!(reader.next().is_none()); // the stream must indicate that it's finished @@ -1458,7 +1435,7 @@ mod tests { // read expected JSON output let arrow_json = read_gzip_json(version, path); - assert!(arrow_json.equals_reader(&mut reader)); + assert!(arrow_json.equals_reader(&mut reader).unwrap()); }); } @@ -1491,7 +1468,7 @@ mod tests { // read expected JSON output let arrow_json = read_gzip_json(version, path); - assert!(arrow_json.equals_reader(&mut reader)); + assert!(arrow_json.equals_reader(&mut reader).unwrap()); // the next batch must be empty assert!(reader.next().is_none()); // the stream must indicate that it's finished @@ -1500,6 +1477,7 @@ mod tests { } #[test] + #[cfg(feature = "ipc_compression")] fn read_generated_streams_200() { let testdata = 
crate::util::test_util::arrow_test_data(); let version = "2.0.0-compression"; @@ -1517,7 +1495,7 @@ mod tests { // read expected JSON output let arrow_json = read_gzip_json(version, path); - assert!(arrow_json.equals_reader(&mut reader)); + assert!(arrow_json.equals_reader(&mut reader).unwrap()); // the next batch must be empty assert!(reader.next().is_none()); // the stream must indicate that it's finished @@ -1526,6 +1504,32 @@ mod tests { } #[test] + #[cfg(not(feature = "ipc_compression"))] + fn read_generated_streams_200_negative() { + let testdata = crate::util::test_util::arrow_test_data(); + let version = "2.0.0-compression"; + + // the test is repetitive, thus we can read all supported files at once + let cases = vec![("generated_lz4", "LZ4_FRAME"), ("generated_zstd", "ZSTD")]; + cases.iter().for_each(|(path, compression_name)| { + let file = File::open(format!( + "{}/arrow-ipc-stream/integration/{}/{}.stream", + testdata, version, path + )) + .unwrap(); + + let mut reader = StreamReader::try_new(file, None).unwrap(); + let err = reader.next().unwrap().unwrap_err(); + let expected_error = format!( + "Invalid argument error: compression type {} not supported because arrow was not compiled with the ipc_compression feature", + compression_name + ); + assert_eq!(err.to_string(), expected_error); + }); + } + + #[test] + #[cfg(feature = "ipc_compression")] fn read_generated_files_200() { let testdata = crate::util::test_util::arrow_test_data(); let version = "2.0.0-compression"; @@ -1542,7 +1546,32 @@ mod tests { // read expected JSON output let arrow_json = read_gzip_json(version, path); - assert!(arrow_json.equals_reader(&mut reader)); + assert!(arrow_json.equals_reader(&mut reader).unwrap()); + }); + } + + #[test] + #[cfg(not(feature = "ipc_compression"))] + fn read_generated_files_200_negative() { + let testdata = crate::util::test_util::arrow_test_data(); + let version = "2.0.0-compression"; + // the test is repetitive, thus we can read all supported files at once + let cases = vec![("generated_lz4", "LZ4_FRAME"), ("generated_zstd", "ZSTD")]; + cases.iter().for_each(|(path, compression_name)| { + let file = File::open(format!( + "{}/arrow-ipc-stream/integration/{}/{}.arrow_file", + testdata, version, path + )) + .unwrap(); + + let mut reader = FileReader::try_new(file, None).unwrap(); + + let err = reader.next().unwrap().unwrap_err(); + let expected_error = format!( + "Invalid argument error: compression type {} not supported because arrow was not compiled with the ipc_compression feature", + compression_name + ); + assert_eq!(err.to_string(), expected_error); }); } diff --git a/arrow/src/ipc/writer.rs b/arrow/src/ipc/writer.rs index 1dc3c02ec2a8..c817cb77c13a 100644 --- a/arrow/src/ipc/writer.rs +++ b/arrow/src/ipc/writer.rs @@ -39,11 +39,7 @@ use crate::ipc; use crate::record_batch::RecordBatch; use crate::util::bit_util; -use crate::ipc::compression::ipc_compression::CompressionCodecType; -use crate::ipc::compression::{ - LENGTH_EMPTY_COMPRESSED_DATA, LENGTH_NO_COMPRESSED_DATA, LENGTH_OF_PREFIX_DATA, -}; -use crate::ipc::CompressionType; +use crate::ipc::compression::CompressionCodec; use ipc::CONTINUATION_MARKER; /// IPC write options used to control the behaviour of the writer @@ -63,50 +59,29 @@ pub struct IpcWriteOptions { /// version 2.0.0: V4, with legacy format enabled /// version 4.0.0: V5 metadata_version: ipc::MetadataVersion, - batch_compression_type: Option, + /// Compression, if desired. 
Only supported when `ipc_compression` + /// feature is enabled + batch_compression_type: Option, } impl IpcWriteOptions { - #[cfg(any(feature = "ipc_compression", test))] - pub fn try_new_with_compression( - alignment: usize, - write_legacy_ipc_format: bool, - metadata_version: ipc::MetadataVersion, - batch_compression_type: Option, + /// Configures compression when writing IPC files. Requires the + /// `ipc_compression` feature of the crate to be activated. + #[cfg(feature = "ipc_compression")] + pub fn try_with_compression( + mut self, + batch_compression_type: Option, ) -> Result { - if alignment == 0 || alignment % 8 != 0 { + self.batch_compression_type = batch_compression_type; + + if self.batch_compression_type.is_some() + && self.metadata_version < ipc::MetadataVersion::V5 + { return Err(ArrowError::InvalidArgumentError( - "Alignment should be greater than 0 and be a multiple of 8".to_string(), + "Compression only supported in metadata v5 and above".to_string(), )); } - match batch_compression_type { - None => {} - _ => { - if metadata_version < ipc::MetadataVersion::V5 { - return Err(ArrowError::InvalidArgumentError( - "Compression only supported in metadata v5 and above".to_string(), - )); - } - } - }; - match metadata_version { - ipc::MetadataVersion::V5 => { - if write_legacy_ipc_format { - Err(ArrowError::InvalidArgumentError( - "Legacy IPC format only supported on metadata version 4" - .to_string(), - )) - } else { - Ok(Self { - alignment, - write_legacy_ipc_format, - metadata_version, - batch_compression_type, - }) - } - } - z => panic!("Unsupported ipc::MetadataVersion {:?}", z), - } + Ok(self) } /// Try create IpcWriteOptions, checking for incompatible settings pub fn try_new( @@ -146,7 +121,10 @@ impl IpcWriteOptions { }) } } - z => panic!("Unsupported ipc::MetadataVersion {:?}", z), + z => Err(ArrowError::InvalidArgumentError(format!( + "Unsupported ipc::MetadataVersion {:?}", + z + ))), } } } @@ -328,7 +306,7 @@ impl IpcDataGenerator { dict_id, dict_values, write_options, - )); + )?); } } _ => self._encode_dictionaries( @@ -362,7 +340,7 @@ impl IpcDataGenerator { )?; } - let encoded_message = self.record_batch_to_bytes(batch, write_options); + let encoded_message = self.record_batch_to_bytes(batch, write_options)?; Ok((encoded_dictionaries, encoded_message)) } @@ -372,7 +350,7 @@ impl IpcDataGenerator { &self, batch: &RecordBatch, write_options: &IpcWriteOptions, - ) -> EncodedData { + ) -> Result { let mut fbb = FlatBufferBuilder::new(); let mut nodes: Vec = vec![]; @@ -381,19 +359,18 @@ impl IpcDataGenerator { let mut offset = 0; // get the type of compression - let compression_codec = write_options.batch_compression_type; - let compression_type: Option = - compression_codec.map(|v| v.into()); - let compression = { - if let Some(codec) = compression_type { - let mut c = ipc::BodyCompressionBuilder::new(&mut fbb); - c.add_method(ipc::BodyCompressionMethod::BUFFER); - c.add_codec(codec); - Some(c.finish()) - } else { - None - } - }; + let batch_compression_type = write_options.batch_compression_type; + + let compression = batch_compression_type.map(|batch_compression_type| { + let mut c = ipc::BodyCompressionBuilder::new(&mut fbb); + c.add_method(ipc::BodyCompressionMethod::BUFFER); + c.add_codec(batch_compression_type); + c.finish() + }); + + let compression_codec: Option = + batch_compression_type.map(TryInto::try_into).transpose()?; + for array in batch.columns() { let array_data = array.data(); offset = write_array_data( @@ -406,7 +383,7 @@ impl IpcDataGenerator { 
array.null_count(), &compression_codec, write_options, - ); + )?; } // pad the tail of body data let len = arrow_data.len(); @@ -437,10 +414,10 @@ impl IpcDataGenerator { fbb.finish(root, None); let finished_data = fbb.finished_data(); - EncodedData { + Ok(EncodedData { ipc_message: finished_data.to_vec(), arrow_data, - } + }) } /// Write dictionary values into two sets of bytes, one for the header (ipc::Message) and the @@ -450,7 +427,7 @@ impl IpcDataGenerator { dict_id: i64, array_data: &ArrayData, write_options: &IpcWriteOptions, - ) -> EncodedData { + ) -> Result { let mut fbb = FlatBufferBuilder::new(); let mut nodes: Vec = vec![]; @@ -458,19 +435,19 @@ impl IpcDataGenerator { let mut arrow_data: Vec = vec![]; // get the type of compression - let compression_codec = write_options.batch_compression_type; - let compression_type: Option = - compression_codec.map(|v| v.into()); - let compression = { - if let Some(codec) = compression_type { - let mut c = ipc::BodyCompressionBuilder::new(&mut fbb); - c.add_method(ipc::BodyCompressionMethod::BUFFER); - c.add_codec(codec); - Some(c.finish()) - } else { - None - } - }; + let batch_compression_type = write_options.batch_compression_type; + + let compression = batch_compression_type.map(|batch_compression_type| { + let mut c = ipc::BodyCompressionBuilder::new(&mut fbb); + c.add_method(ipc::BodyCompressionMethod::BUFFER); + c.add_codec(batch_compression_type); + c.finish() + }); + + let compression_codec: Option = batch_compression_type + .map(|batch_compression_type| batch_compression_type.try_into()) + .transpose()?; + write_array_data( array_data, &mut buffers, @@ -481,7 +458,7 @@ impl IpcDataGenerator { array_data.null_count(), &compression_codec, write_options, - ); + )?; // pad the tail of body data let len = arrow_data.len(); @@ -522,10 +499,10 @@ impl IpcDataGenerator { fbb.finish(root, None); let finished_data = fbb.finished_data(); - EncodedData { + Ok(EncodedData { ipc_message: finished_data.to_vec(), arrow_data, - } + }) } } @@ -614,13 +591,11 @@ impl FileWriter { ) -> Result { let data_gen = IpcDataGenerator::default(); let mut writer = BufWriter::new(writer); - // write magic to header - let mut header_size: usize = 0; + // write magic to header aligned on 8 byte boundary + let header_size = super::ARROW_MAGIC.len() + 2; + assert_eq!(header_size, 8); writer.write_all(&super::ARROW_MAGIC[..])?; - header_size += super::ARROW_MAGIC.len(); - // create an 8-byte boundary after the header writer.write_all(&[0, 0])?; - header_size += 2; // write the schema, set the written bytes to the schema + header let encoded_message = data_gen.schema_to_bytes(schema, &write_options); let (meta, data) = write_message(&mut writer, encoded_message, &write_options)?; @@ -982,6 +957,16 @@ fn get_buffer_element_width(spec: &BufferSpec) -> usize { } } +/// Returns byte width for binary value_offset buffer spec. +#[inline] +fn get_value_offset_byte_width(data_type: &DataType) -> usize { + match data_type { + DataType::Binary | DataType::Utf8 => 4, + DataType::LargeBinary | DataType::LargeUtf8 => 8, + _ => unreachable!(), + } +} + /// Returns the number of total bytes in base binary arrays. 
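The old `try_new_with_compression` constructor is gone; compression is now layered onto ordinary options with a builder-style call. A short sketch of the new pattern, matching the updated tests later in this diff (it needs the `ipc_compression` feature, and `try_with_compression` rejects metadata versions below V5):

```rust
use arrow::error::Result;
use arrow::ipc::{self, writer::IpcWriteOptions};

fn lz4_options() -> Result<IpcWriteOptions> {
    // Start from the ordinary constructor (alignment, legacy format, version)...
    IpcWriteOptions::try_new(8, false, ipc::MetadataVersion::V5)?
        // ...then opt into compression; errors for metadata versions below V5.
        .try_with_compression(Some(ipc::CompressionType::LZ4_FRAME))
}
```

Keeping the alignment and version checks in `try_new` and only the compression check in `try_with_compression` avoids duplicating the constructor's validation across two entry points.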
fn get_binary_buffer_len(array_data: &ArrayData) -> usize { if array_data.is_empty() { @@ -1072,9 +1057,9 @@ fn write_array_data( offset: i64, num_rows: usize, null_count: usize, - compression_codec: &Option, + compression_codec: &Option, write_options: &IpcWriteOptions, -) -> i64 { +) -> Result { let mut offset = offset; if !matches!(array_data.data_type(), DataType::Null) { nodes.push(ipc::FieldNode::new(num_rows as i64, null_count as i64)); @@ -1102,7 +1087,7 @@ fn write_array_data( arrow_data, offset, compression_codec, - ); + )?; } let data_type = array_data.data_type(); @@ -1110,13 +1095,16 @@ fn write_array_data( data_type, DataType::Binary | DataType::LargeBinary | DataType::Utf8 | DataType::LargeUtf8 ) { - let total_bytes = get_binary_buffer_len(array_data); - let value_buffer = &array_data.buffers()[1]; + let offset_buffer = &array_data.buffers()[0]; + let value_offset_byte_width = get_value_offset_byte_width(data_type); + let min_length = (array_data.len() + 1) * value_offset_byte_width; if buffer_need_truncate( array_data.offset(), - value_buffer, - &BufferSpec::VariableWidth, - total_bytes, + offset_buffer, + &BufferSpec::FixedWidth { + byte_width: value_offset_byte_width, + }, + min_length, ) { // Rebase offsets and truncate values let (new_offsets, byte_offset) = @@ -1138,8 +1126,10 @@ fn write_array_data( arrow_data, offset, compression_codec, - ); + )?; + let total_bytes = get_binary_buffer_len(array_data); + let value_buffer = &array_data.buffers()[1]; let buffer_length = min(total_bytes, value_buffer.len() - byte_offset); let buffer_slice = &value_buffer.as_slice()[byte_offset..(byte_offset + buffer_length)]; @@ -1149,17 +1139,17 @@ fn write_array_data( arrow_data, offset, compression_codec, - ); + )?; } else { - array_data.buffers().iter().for_each(|buffer| { + for buffer in array_data.buffers() { offset = write_buffer( buffer.as_slice(), buffers, arrow_data, offset, compression_codec, - ); - }); + )?; + } } } else if DataType::is_numeric(data_type) || DataType::is_temporal(data_type) @@ -1188,7 +1178,7 @@ fn write_array_data( arrow_data, offset, compression_codec, - ); + )?; } else { offset = write_buffer( buffer.as_slice(), @@ -1196,17 +1186,18 @@ fn write_array_data( arrow_data, offset, compression_codec, - ); + )?; } } else { - array_data.buffers().iter().for_each(|buffer| { - offset = write_buffer(buffer, buffers, arrow_data, offset, compression_codec); - }); + for buffer in array_data.buffers() { + offset = + write_buffer(buffer, buffers, arrow_data, offset, compression_codec)?; + } } if !matches!(array_data.data_type(), DataType::Dictionary(_, _)) { // recursively write out nested structures - array_data.child_data().iter().for_each(|data_ref| { + for data_ref in array_data.child_data() { // write the nested data (e.g list data) offset = write_array_data( data_ref, @@ -1218,14 +1209,17 @@ fn write_array_data( data_ref.null_count(), compression_codec, write_options, - ); - }); + )?; + } } - offset + Ok(offset) } -/// Write a buffer to a vector of bytes, and add its ipc::Buffer to a vector +/// Write a buffer into `arrow_data`, a vector of bytes, and adds its +/// [`ipc::Buffer`] to `buffers`. Returns the new offset in `arrow_data` +/// +/// /// From /// Each constituent buffer is first compressed with the indicated /// compressor, and then written with the uncompressed length in the first 8 @@ -1235,61 +1229,34 @@ fn write_array_data( /// follows is not compressed, which can be useful for cases where /// compression does not yield appreciable savings. 
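To make that prefix convention concrete before `write_buffer` itself, here is a small self-contained sketch; `split_frame` is a hypothetical helper, not part of the crate, that just peels off the 8-byte length prefix the codec writes:

```rust
use std::convert::TryInto;

/// Hypothetical helper mirroring the IPC buffer framing:
/// an 8-byte little-endian i64 prefix followed by the payload.
/// * prefix == -1 -> payload is stored uncompressed
/// * prefix == 0  -> empty buffer, no payload
/// * prefix > 0   -> payload is compressed; prefix is the uncompressed length
fn split_frame(frame: &[u8]) -> (i64, &[u8]) {
    let prefix = i64::from_le_bytes(frame[0..8].try_into().unwrap());
    (prefix, &frame[8..])
}

fn main() {
    // A frame holding the bytes b"abc" uncompressed (-1 sentinel).
    let mut frame = (-1i64).to_le_bytes().to_vec();
    frame.extend_from_slice(b"abc");

    let (prefix, payload) = split_frame(&frame);
    assert_eq!(prefix, -1);
    assert_eq!(payload, b"abc");
}
```

Note also how this hunk converts `Option<CompressionType>` into `Option<CompressionCodec>` with `.map(TryInto::try_into).transpose()?`, so an unsupported codec now surfaces as an error instead of a panic.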
fn write_buffer( - buffer: &[u8], - buffers: &mut Vec, - arrow_data: &mut Vec, - offset: i64, - compression_codec: &Option, -) -> i64 { - let origin_buffer_len = buffer.len(); - let mut _compression_buffer = Vec::::new(); - let (data, uncompression_buffer_len) = match compression_codec { + buffer: &[u8], // input + buffers: &mut Vec, // output buffer descriptors + arrow_data: &mut Vec, // output stream + offset: i64, // current output stream offset + compression_codec: &Option, +) -> Result { + let len: i64 = match compression_codec { + Some(compressor) => compressor.compress_to_vec(buffer, arrow_data)?, None => { - // this buffer_len will not used in the following logic - // If we don't use the compression, just write the data in the array - (buffer, origin_buffer_len as i64) - } - Some(_compressor) => { - if cfg!(feature = "ipc_compression") || cfg!(test) { - if (origin_buffer_len as i64) == LENGTH_EMPTY_COMPRESSED_DATA { - (buffer, LENGTH_EMPTY_COMPRESSED_DATA) - } else { - #[cfg(any(feature = "ipc_compression", test))] - _compressor - .compress(buffer, &mut _compression_buffer) - .unwrap(); - let compression_len = _compression_buffer.len(); - if compression_len > origin_buffer_len { - // the length of compressed data is larger than uncompressed data - // use the uncompressed data with -1 - // -1 indicate that we don't compress the data - (buffer, LENGTH_NO_COMPRESSED_DATA) - } else { - // use the compressed data with uncompressed length - (_compression_buffer.as_slice(), origin_buffer_len as i64) - } - } - } else { - panic!("IPC compression not supported. Compile with feature 'ipc_compression' to enable"); - } + arrow_data.extend_from_slice(buffer); + buffer.len() } - }; - let len = data.len() as i64; - let total_len = if compression_codec.is_none() { - buffers.push(ipc::Buffer::new(offset, len)); - len - } else { - buffers.push(ipc::Buffer::new(offset, LENGTH_OF_PREFIX_DATA + len)); - // write the prefix of the uncompressed length - let uncompression_len_buf: [u8; 8] = uncompression_buffer_len.to_le_bytes(); - arrow_data.extend_from_slice(&uncompression_len_buf); - LENGTH_OF_PREFIX_DATA + len - }; - arrow_data.extend_from_slice(data); + } + .try_into() + .map_err(|e| { + ArrowError::InvalidArgumentError(format!( + "Could not convert compressed size to i64: {}", + e + )) + })?; + + // make new index entry + buffers.push(ipc::Buffer::new(offset, len)); // padding and make offset 8 bytes aligned let pad_len = pad_to_8(len as u32) as i64; arrow_data.extend_from_slice(&vec![0u8; pad_len as usize][..]); - offset + total_len + pad_len + + Ok(offset + len + pad_len) } /// Calculate an 8-byte boundary and return the number of bytes needed to pad to 8 bytes @@ -1315,7 +1282,8 @@ mod tests { use crate::util::integration_util::*; #[test] - fn test_write_with_empty_record_batch() { + #[cfg(feature = "ipc_compression")] + fn test_write_empty_record_batch_lz4_compression() { let file_name = "arrow_lz4_empty"; let schema = Schema::new(vec![Field::new("field1", DataType::Int32, true)]); let values: Vec> = vec![]; @@ -1327,13 +1295,12 @@ mod tests { let file = File::create(format!("target/debug/testdata/{}.arrow_file", file_name)) .unwrap(); - let write_option = IpcWriteOptions::try_new_with_compression( - 8, - false, - ipc::MetadataVersion::V5, - Some(CompressionCodecType::Lz4Frame), - ) - .unwrap(); + let write_option = + IpcWriteOptions::try_new(8, false, ipc::MetadataVersion::V5) + .unwrap() + .try_with_compression(Some(ipc::CompressionType::LZ4_FRAME)) + .unwrap(); + let mut writer = 
FileWriter::try_new_with_options(file, &schema, write_option).unwrap(); writer.write(&record_batch).unwrap(); @@ -1368,7 +1335,9 @@ mod tests { } } } + #[test] + #[cfg(feature = "ipc_compression")] fn test_write_file_with_lz4_compression() { let schema = Schema::new(vec![Field::new("field1", DataType::Int32, true)]); let values: Vec> = vec![Some(12), Some(1)]; @@ -1379,13 +1348,12 @@ mod tests { { let file = File::create("target/debug/testdata/arrow_lz4.arrow_file").unwrap(); - let write_option = IpcWriteOptions::try_new_with_compression( - 8, - false, - ipc::MetadataVersion::V5, - Some(CompressionCodecType::Lz4Frame), - ) - .unwrap(); + let write_option = + IpcWriteOptions::try_new(8, false, ipc::MetadataVersion::V5) + .unwrap() + .try_with_compression(Some(ipc::CompressionType::LZ4_FRAME)) + .unwrap(); + let mut writer = FileWriter::try_new_with_options(file, &schema, write_option).unwrap(); writer.write(&record_batch).unwrap(); @@ -1422,6 +1390,7 @@ mod tests { } #[test] + #[cfg(feature = "ipc_compression")] fn test_write_file_with_zstd_compression() { let schema = Schema::new(vec![Field::new("field1", DataType::Int32, true)]); let values: Vec> = vec![Some(12), Some(1)]; @@ -1432,13 +1401,12 @@ mod tests { { let file = File::create("target/debug/testdata/arrow_zstd.arrow_file").unwrap(); - let write_option = IpcWriteOptions::try_new_with_compression( - 8, - false, - ipc::MetadataVersion::V5, - Some(CompressionCodecType::Zstd), - ) - .unwrap(); + let write_option = + IpcWriteOptions::try_new(8, false, ipc::MetadataVersion::V5) + .unwrap() + .try_with_compression(Some(ipc::CompressionType::ZSTD)) + .unwrap(); + let mut writer = FileWriter::try_new_with_options(file, &schema, write_option).unwrap(); writer.write(&record_batch).unwrap(); @@ -1650,7 +1618,7 @@ mod tests { // read expected JSON output let arrow_json = read_gzip_json(version, path); - assert!(arrow_json.equals_reader(&mut reader)); + assert!(arrow_json.equals_reader(&mut reader).unwrap()); }); } @@ -1701,7 +1669,7 @@ mod tests { // read expected JSON output let arrow_json = read_gzip_json(version, path); - assert!(arrow_json.equals_reader(&mut reader)); + assert!(arrow_json.equals_reader(&mut reader).unwrap()); }); } @@ -1765,7 +1733,7 @@ mod tests { // read expected JSON output let arrow_json = read_gzip_json(version, path); - assert!(arrow_json.equals_reader(&mut reader)); + assert!(arrow_json.equals_reader(&mut reader).unwrap()); }); } @@ -1826,11 +1794,12 @@ mod tests { // read expected JSON output let arrow_json = read_gzip_json(version, path); - assert!(arrow_json.equals_reader(&mut reader)); + assert!(arrow_json.equals_reader(&mut reader).unwrap()); }); } #[test] + #[cfg(feature = "ipc_compression")] fn read_and_rewrite_compression_files_200() { let testdata = crate::util::test_util::arrow_test_data(); let version = "2.0.0-compression"; @@ -1853,13 +1822,12 @@ mod tests { )) .unwrap(); // write IPC version 5 - let options = IpcWriteOptions::try_new_with_compression( - 8, - false, - ipc::MetadataVersion::V5, - Some(CompressionCodecType::Lz4Frame), - ) - .unwrap(); + let options = + IpcWriteOptions::try_new(8, false, ipc::MetadataVersion::V5) + .unwrap() + .try_with_compression(Some(ipc::CompressionType::LZ4_FRAME)) + .unwrap(); + let mut writer = FileWriter::try_new_with_options(file, &reader.schema(), options) .unwrap(); @@ -1878,11 +1846,12 @@ mod tests { // read expected JSON output let arrow_json = read_gzip_json(version, path); - assert!(arrow_json.equals_reader(&mut reader)); + assert!(arrow_json.equals_reader(&mut 
reader).unwrap()); }); } #[test] + #[cfg(feature = "ipc_compression")] fn read_and_rewrite_compression_stream_200() { let testdata = crate::util::test_util::arrow_test_data(); let version = "2.0.0-compression"; @@ -1904,13 +1873,12 @@ mod tests { version, path )) .unwrap(); - let options = IpcWriteOptions::try_new_with_compression( - 8, - false, - ipc::MetadataVersion::V5, - Some(CompressionCodecType::Zstd), - ) - .unwrap(); + let options = + IpcWriteOptions::try_new(8, false, ipc::MetadataVersion::V5) + .unwrap() + .try_with_compression(Some(ipc::CompressionType::ZSTD)) + .unwrap(); + let mut writer = StreamWriter::try_new_with_options(file, &reader.schema(), options) .unwrap(); @@ -1927,7 +1895,7 @@ mod tests { // read expected JSON output let arrow_json = read_gzip_json(version, path); - assert!(arrow_json.equals_reader(&mut reader)); + assert!(arrow_json.equals_reader(&mut reader).unwrap()); }); } @@ -2279,4 +2247,21 @@ mod tests { assert!(structs.column(1).is_null(1)); assert_eq!(record_batch_slice, deserialized_batch); } + + #[test] + fn truncate_ipc_string_array_with_all_empty_string() { + fn create_batch() -> RecordBatch { + let schema = Schema::new(vec![Field::new("a", DataType::Utf8, true)]); + let a = + StringArray::from(vec![Some(""), Some(""), Some(""), Some(""), Some("")]); + RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a)]).unwrap() + } + + let record_batch = create_batch(); + let record_batch_slice = record_batch.slice(0, 1); + let deserialized_batch = deserialize(serialize(&record_batch_slice)); + + assert!(serialize(&record_batch).len() > serialize(&record_batch_slice).len()); + assert_eq!(record_batch_slice, deserialized_batch); + } } diff --git a/arrow/src/json/reader.rs b/arrow/src/json/reader.rs index 9b348e629169..66fdc691887b 100644 --- a/arrow/src/json/reader.rs +++ b/arrow/src/json/reader.rs @@ -590,7 +590,7 @@ pub struct Decoder { options: DecoderOptions, } -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq, Eq)] /// Options for JSON decoding pub struct DecoderOptions { /// Batch size (number of records to load each time), defaults to 1024 records diff --git a/arrow/src/lib.rs b/arrow/src/lib.rs index 95c69ca0be6d..04f495dc0819 100644 --- a/arrow/src/lib.rs +++ b/arrow/src/lib.rs @@ -18,6 +18,9 @@ //! A complete, safe, native Rust implementation of [Apache Arrow](https://arrow.apache.org), a cross-language //! development platform for in-memory data. //! +//! Please see the [arrow crates.io](https://crates.io/crates/arrow) +//! page for feature flags and tips to improve performance. +//! //! # Columnar Format //! //! The [`array`] module provides statically typed implementations of all the array @@ -57,6 +60,23 @@ //! assert_eq!(sum(&TimestampNanosecondArray::from(vec![1, 2, 3])), 6); //! ``` //! +//! And the following is generic over all arrays with comparable values +//! +//! ```rust +//! # use arrow::array::{ArrayAccessor, ArrayIter, Int32Array, StringArray}; +//! # use arrow::datatypes::ArrowPrimitiveType; +//! # +//! fn min(array: T) -> Option +//! where +//! T::Item: Ord +//! { +//! ArrayIter::new(array).filter_map(|v| v).min() +//! } +//! +//! assert_eq!(min(&Int32Array::from(vec![4, 2, 1, 6])), Some(1)); +//! assert_eq!(min(&StringArray::from(vec!["b", "a", "c"])), Some("a")); +//! ``` +//! //! For more examples, consult the [`array`] docs. //! //! 
# Type Erasure / Trait Objects @@ -238,7 +258,9 @@ pub mod compute; pub mod csv; pub mod datatypes; pub mod error; +#[cfg(feature = "ffi")] pub mod ffi; +#[cfg(feature = "ffi")] pub mod ffi_stream; #[cfg(feature = "ipc")] pub mod ipc; diff --git a/arrow/src/temporal_conversions.rs b/arrow/src/temporal_conversions.rs index 2d6d6776f59e..12982b7dabc2 100644 --- a/arrow/src/temporal_conversions.rs +++ b/arrow/src/temporal_conversions.rs @@ -20,13 +20,18 @@ use chrono::{Duration, NaiveDateTime, NaiveTime}; /// Number of seconds in a day -const SECONDS_IN_DAY: i64 = 86_400; +pub(crate) const SECONDS_IN_DAY: i64 = 86_400; /// Number of milliseconds in a second -const MILLISECONDS: i64 = 1_000; +pub(crate) const MILLISECONDS: i64 = 1_000; /// Number of microseconds in a second -const MICROSECONDS: i64 = 1_000_000; +pub(crate) const MICROSECONDS: i64 = 1_000_000; /// Number of nanoseconds in a second -const NANOSECONDS: i64 = 1_000_000_000; +pub(crate) const NANOSECONDS: i64 = 1_000_000_000; + +/// Number of milliseconds in a day +pub(crate) const MILLISECONDS_IN_DAY: i64 = SECONDS_IN_DAY * MILLISECONDS; +/// Number of days between 0001-01-01 and 1970-01-01 +pub(crate) const EPOCH_DAYS_FROM_CE: i32 = 719_163; /// converts a `i32` representing a `date32` to [`NaiveDateTime`] #[inline] @@ -37,11 +42,13 @@ pub fn date32_to_datetime(v: i32) -> NaiveDateTime { /// converts a `i64` representing a `date64` to [`NaiveDateTime`] #[inline] pub fn date64_to_datetime(v: i64) -> NaiveDateTime { + let (sec, milli_sec) = split_second(v, MILLISECONDS); + NaiveDateTime::from_timestamp( // extract seconds from milliseconds - v / MILLISECONDS, + sec, // discard extracted seconds and convert milliseconds to nanoseconds - (v % MILLISECONDS * MICROSECONDS) as u32, + milli_sec * MICROSECONDS as u32, ) } @@ -96,36 +103,59 @@ pub fn timestamp_s_to_datetime(v: i64) -> NaiveDateTime { /// converts a `i64` representing a `timestamp(ms)` to [`NaiveDateTime`] #[inline] pub fn timestamp_ms_to_datetime(v: i64) -> NaiveDateTime { + let (sec, milli_sec) = split_second(v, MILLISECONDS); + NaiveDateTime::from_timestamp( // extract seconds from milliseconds - v / MILLISECONDS, + sec, // discard extracted seconds and convert milliseconds to nanoseconds - (v % MILLISECONDS * MICROSECONDS) as u32, + milli_sec * MICROSECONDS as u32, ) } /// converts a `i64` representing a `timestamp(us)` to [`NaiveDateTime`] #[inline] pub fn timestamp_us_to_datetime(v: i64) -> NaiveDateTime { + let (sec, micro_sec) = split_second(v, MICROSECONDS); + NaiveDateTime::from_timestamp( // extract seconds from microseconds - v / MICROSECONDS, + sec, // discard extracted seconds and convert microseconds to nanoseconds - (v % MICROSECONDS * MILLISECONDS) as u32, + micro_sec * MILLISECONDS as u32, ) } /// converts a `i64` representing a `timestamp(ns)` to [`NaiveDateTime`] #[inline] pub fn timestamp_ns_to_datetime(v: i64) -> NaiveDateTime { + let (sec, nano_sec) = split_second(v, NANOSECONDS); + NaiveDateTime::from_timestamp( // extract seconds from nanoseconds - v / NANOSECONDS, - // discard extracted seconds - (v % NANOSECONDS) as u32, + sec, // discard extracted seconds + nano_sec, ) } +/// +#[inline] +pub(crate) fn split_second(v: i64, base: i64) -> (i64, u32) { + if v < 0 { + let v = -v; + let mut seconds = v / base; + let mut part = v % base; + + if part > 0 { + seconds += 1; + part = base - part; + } + (-seconds, part as u32) + } else { + (v / base, (v % base) as u32) + } +} + /// converts a `i64` representing a `duration(s)` to [`Duration`] #[inline] pub 
fn duration_s_to_duration(v: i64) -> Duration { @@ -149,3 +179,83 @@ pub fn duration_us_to_duration(v: i64) -> Duration { pub fn duration_ns_to_duration(v: i64) -> Duration { Duration::nanoseconds(v) } + +#[cfg(test)] +mod tests { + use crate::temporal_conversions::{ + date64_to_datetime, split_second, timestamp_ms_to_datetime, + timestamp_ns_to_datetime, timestamp_us_to_datetime, NANOSECONDS, + }; + use chrono::NaiveDateTime; + + #[test] + fn negative_input_timestamp_ns_to_datetime() { + assert_eq!( + timestamp_ns_to_datetime(-1), + NaiveDateTime::from_timestamp(-1, 999_999_999) + ); + + assert_eq!( + timestamp_ns_to_datetime(-1_000_000_001), + NaiveDateTime::from_timestamp(-2, 999_999_999) + ); + } + + #[test] + fn negative_input_timestamp_us_to_datetime() { + assert_eq!( + timestamp_us_to_datetime(-1), + NaiveDateTime::from_timestamp(-1, 999_999_000) + ); + + assert_eq!( + timestamp_us_to_datetime(-1_000_001), + NaiveDateTime::from_timestamp(-2, 999_999_000) + ); + } + + #[test] + fn negative_input_timestamp_ms_to_datetime() { + assert_eq!( + timestamp_ms_to_datetime(-1), + NaiveDateTime::from_timestamp(-1, 999_000_000) + ); + + assert_eq!( + timestamp_ms_to_datetime(-1_001), + NaiveDateTime::from_timestamp(-2, 999_000_000) + ); + } + + #[test] + fn negative_input_date64_to_datetime() { + assert_eq!( + date64_to_datetime(-1), + NaiveDateTime::from_timestamp(-1, 999_000_000) + ); + + assert_eq!( + date64_to_datetime(-1_001), + NaiveDateTime::from_timestamp(-2, 999_000_000) + ); + } + + #[test] + fn test_split_seconds() { + let (sec, nano_sec) = split_second(100, NANOSECONDS); + assert_eq!(sec, 0); + assert_eq!(nano_sec, 100); + + let (sec, nano_sec) = split_second(123_000_000_456, NANOSECONDS); + assert_eq!(sec, 123); + assert_eq!(nano_sec, 456); + + let (sec, nano_sec) = split_second(-1, NANOSECONDS); + assert_eq!(sec, -1); + assert_eq!(nano_sec, 999_999_999); + + let (sec, nano_sec) = split_second(-123_000_000_001, NANOSECONDS); + assert_eq!(sec, -124); + assert_eq!(nano_sec, 999_999_999); + } +} diff --git a/arrow/src/util/data_gen.rs b/arrow/src/util/data_gen.rs index 21b8ee8c9fd1..4d974409a0ee 100644 --- a/arrow/src/util/data_gen.rs +++ b/arrow/src/util/data_gen.rs @@ -143,6 +143,17 @@ pub fn create_random_array( }) .collect::>>()?, )?), + d @ Dictionary(_, value_type) + if crate::compute::can_cast_types(value_type, d) => + { + let f = Field::new( + field.name(), + value_type.as_ref().clone(), + field.is_nullable(), + ); + let v = create_random_array(&f, size, null_density, true_density)?; + crate::compute::cast(&v, d)? + } other => { return Err(ArrowError::NotYetImplemented(format!( "Generating random arrays not yet implemented for {:?}", diff --git a/arrow/src/util/decimal.rs b/arrow/src/util/decimal.rs index 8f9d394efd9a..1b9cf80fd11e 100644 --- a/arrow/src/util/decimal.rs +++ b/arrow/src/util/decimal.rs @@ -18,30 +18,62 @@ //! Decimal related utils use crate::datatypes::{ - DECIMAL128_MAX_PRECISION, DECIMAL128_MAX_SCALE, DECIMAL256_MAX_PRECISION, - DECIMAL256_MAX_SCALE, + DataType, DECIMAL128_MAX_PRECISION, DECIMAL128_MAX_SCALE, DECIMAL256_MAX_PRECISION, + DECIMAL256_MAX_SCALE, DECIMAL_DEFAULT_SCALE, }; use crate::error::{ArrowError, Result}; use num::bigint::BigInt; use num::Signed; use std::cmp::{min, Ordering}; -pub trait BasicDecimal: PartialOrd + Ord + PartialEq + Eq { - /// The bit-width of the internal representation. - const BIT_WIDTH: usize; - /// The maximum precision. - const MAX_PRECISION: usize; - /// The maximum scale. 
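The negative-timestamp fixes above all reduce to one invariant: `split_second` must round seconds toward negative infinity so the sub-second part is never negative. A minimal restatement of that rule using Rust's Euclidean division, which is equivalent to the manual "borrow one second" adjustment written out in the added code:

```rust
/// Split `v` (a count of 1/base-second ticks) into whole seconds plus a
/// non-negative remainder in ticks, rounding seconds toward negative
/// infinity so the remainder can be used directly as a sub-second field.
fn split_second(v: i64, base: i64) -> (i64, u32) {
    (v.div_euclid(base), v.rem_euclid(base) as u32)
}

fn main() {
    // -1 ms is one full second before the epoch plus 999 ms.
    assert_eq!(split_second(-1, 1_000), (-1, 999));
    // Matches the new nanosecond test values.
    assert_eq!(
        split_second(-123_000_000_001, 1_000_000_000),
        (-124, 999_999_999)
    );
}
```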
- const MAX_SCALE: usize; +#[derive(Debug)] +pub struct BasicDecimal { + precision: usize, + scale: usize, + value: [u8; BYTE_WIDTH], +} + +impl BasicDecimal { + #[allow(clippy::type_complexity)] + const MAX_PRECISION_SCALE_CONSTRUCTOR_DEFAULT_TYPE: ( + usize, + usize, + fn(usize, usize) -> DataType, + DataType, + ) = match BYTE_WIDTH { + 16 => ( + DECIMAL128_MAX_PRECISION, + DECIMAL128_MAX_SCALE, + DataType::Decimal128, + DataType::Decimal128(DECIMAL128_MAX_PRECISION, DECIMAL_DEFAULT_SCALE), + ), + 32 => ( + DECIMAL256_MAX_PRECISION, + DECIMAL256_MAX_SCALE, + DataType::Decimal256, + DataType::Decimal256(DECIMAL256_MAX_PRECISION, DECIMAL_DEFAULT_SCALE), + ), + _ => panic!("invalid byte width"), + }; + + pub const MAX_PRECISION: usize = Self::MAX_PRECISION_SCALE_CONSTRUCTOR_DEFAULT_TYPE.0; + pub const MAX_SCALE: usize = Self::MAX_PRECISION_SCALE_CONSTRUCTOR_DEFAULT_TYPE.1; + pub const TYPE_CONSTRUCTOR: fn(usize, usize) -> DataType = + Self::MAX_PRECISION_SCALE_CONSTRUCTOR_DEFAULT_TYPE.2; + pub const DEFAULT_TYPE: DataType = + Self::MAX_PRECISION_SCALE_CONSTRUCTOR_DEFAULT_TYPE.3; /// Tries to create a decimal value from precision, scale and bytes. - /// If the length of bytes isn't same as the bit width of this decimal, - /// returning an error. The bytes should be stored in little-endian order. + /// The bytes should be stored in little-endian order. /// /// Safety: /// This method doesn't validate if the decimal value represented by the bytes /// can be fitted into the specified precision. - fn try_new_from_bytes(precision: usize, scale: usize, bytes: &[u8]) -> Result + pub fn try_new_from_bytes( + precision: usize, + scale: usize, + bytes: &[u8; BYTE_WIDTH], + ) -> Result where Self: Sized, { @@ -67,13 +99,13 @@ pub trait BasicDecimal: PartialOrd + Ord + PartialEq + Eq { ))); } - if bytes.len() == Self::BIT_WIDTH / 8 { + if bytes.len() == BYTE_WIDTH { Ok(Self::new(precision, scale, bytes)) } else { Err(ArrowError::InvalidArgumentError(format!( "Input to Decimal{} must be {} bytes", - Self::BIT_WIDTH, - Self::BIT_WIDTH / 8 + BYTE_WIDTH * 8, + BYTE_WIDTH ))) } } @@ -81,23 +113,35 @@ pub trait BasicDecimal: PartialOrd + Ord + PartialEq + Eq { /// Creates a decimal value from precision, scale, and bytes. /// /// Safety: - /// This method doesn't check if the length of bytes is compatible with this decimal. + /// This method doesn't check if the precision and scale are valid. /// Use `try_new_from_bytes` for safe constructor. - fn new(precision: usize, scale: usize, bytes: &[u8]) -> Self; - + pub fn new(precision: usize, scale: usize, bytes: &[u8; BYTE_WIDTH]) -> Self { + Self { + precision, + scale, + value: *bytes, + } + } /// Returns the raw bytes of the integer representation of the decimal. - fn raw_value(&self) -> &[u8]; + pub fn raw_value(&self) -> &[u8; BYTE_WIDTH] { + &self.value + } /// Returns the precision of the decimal. - fn precision(&self) -> usize; + pub fn precision(&self) -> usize { + self.precision + } /// Returns the scale of the decimal. - fn scale(&self) -> usize; + pub fn scale(&self) -> usize { + self.scale + } /// Returns the string representation of the decimal. /// If the string representation cannot be fitted with the precision of the decimal, /// the string will be truncated. 
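Since `Decimal128` and `Decimal256` are now aliases of the const-generic `BasicDecimal` (`BYTE_WIDTH` 16 and 32), the former trait items become inherent methods and associated consts selected by the byte width. A rough usage sketch against the new API; the values mirror the from-bytes test later in this diff:

```rust
use arrow::error::Result;
use arrow::util::decimal::Decimal128;

fn example() -> Result<()> {
    // 1.00 at precision 5, scale 2, stored as a little-endian i128.
    let bytes: [u8; 16] = 100i128.to_le_bytes();
    let value = Decimal128::try_new_from_bytes(5, 2, &bytes)?;

    assert_eq!(value.to_string(), "1.00");
    // MAX_PRECISION is now an associated const chosen by the byte width.
    assert_eq!(Decimal128::MAX_PRECISION, 38);
    Ok(())
}
```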
-    fn to_string(&self) -> String {
+    #[allow(clippy::inherent_to_string)]
+    pub fn to_string(&self) -> String {
         let raw_bytes = self.raw_value();
         let integer = BigInt::from_signed_bytes_le(raw_bytes);
         let value_str = integer.to_string();
@@ -119,15 +163,44 @@
     }
 }

+impl<const BYTE_WIDTH: usize> PartialOrd for BasicDecimal<BYTE_WIDTH> {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        assert_eq!(
+            self.scale, other.scale,
+            "Cannot compare two Decimals with different scale: {}, {}",
+            self.scale, other.scale
+        );
+        Some(signed_cmp_le_bytes(&self.value, &other.value))
+    }
+}
+
+impl<const BYTE_WIDTH: usize> Ord for BasicDecimal<BYTE_WIDTH> {
+    fn cmp(&self, other: &Self) -> Ordering {
+        assert_eq!(
+            self.scale, other.scale,
+            "Cannot compare two Decimals with different scale: {}, {}",
+            self.scale, other.scale
+        );
+        signed_cmp_le_bytes(&self.value, &other.value)
+    }
+}
+
+impl<const BYTE_WIDTH: usize> PartialEq for BasicDecimal<BYTE_WIDTH> {
+    fn eq(&self, other: &Self) -> bool {
+        assert_eq!(
+            self.scale, other.scale,
+            "Cannot compare two Decimals with different scale: {}, {}",
+            self.scale, other.scale
+        );
+        self.value.eq(&other.value)
+    }
+}
+
+impl<const BYTE_WIDTH: usize> Eq for BasicDecimal<BYTE_WIDTH> {}
+
 /// Represents a decimal value with precision and scale.
 /// The decimal value could be represented by a signed 128-bit integer.
-#[derive(Debug)]
-pub struct Decimal128 {
-    #[allow(dead_code)]
-    precision: usize,
-    scale: usize,
-    value: [u8; 16],
-}
+pub type Decimal128 = BasicDecimal<16>;

 impl Decimal128 {
     /// Creates `Decimal128` from an `i128` value.
@@ -154,13 +227,7 @@ impl From<Decimal128> for i128 {

 /// Represents a decimal value with precision and scale.
 /// The decimal value could be represented by a signed 256-bit integer.
-#[derive(Debug)]
-pub struct Decimal256 {
-    #[allow(dead_code)]
-    precision: usize,
-    scale: usize,
-    value: [u8; 32],
-}
+pub type Decimal256 = BasicDecimal<32>;

 impl Decimal256 {
     /// Constructs a `Decimal256` value from a `BigInt`.
@@ -170,98 +237,68 @@
         scale: usize,
     ) -> Result<Decimal256> {
         let mut bytes = if num.is_negative() {
-            vec![255; 32]
+            [255_u8; 32]
         } else {
-            vec![0; 32]
+            [0; 32]
         };
         let num_bytes = &num.to_signed_bytes_le();
         bytes[0..num_bytes.len()].clone_from_slice(num_bytes);
         Decimal256::try_new_from_bytes(precision, scale, &bytes)
     }
+
+    /// Constructs a `BigInt` from this `Decimal256` value.
+    pub(crate) fn to_big_int(&self) -> BigInt {
+        BigInt::from_signed_bytes_le(&self.value)
+    }
+}
-}
-macro_rules! def_decimal {
-    ($ty:ident, $bit:expr, $max_p:expr, $max_s:expr) => {
-        impl BasicDecimal for $ty {
-            const BIT_WIDTH: usize = $bit;
-            const MAX_PRECISION: usize = $max_p;
-            const MAX_SCALE: usize = $max_s;
-
-            fn new(precision: usize, scale: usize, bytes: &[u8]) -> Self {
-                $ty {
-                    precision,
-                    scale,
-                    value: bytes.try_into().unwrap(),
-                }
-            }
-
-            fn raw_value(&self) -> &[u8] {
-                &self.value
-            }
-
-            fn precision(&self) -> usize {
-                self.precision
-            }
-
-            fn scale(&self) -> usize {
-                self.scale
-            }
-        }
-
-        impl PartialOrd for $ty {
-            fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
-                assert_eq!(
-                    self.scale, other.scale,
-                    "Cannot compare two Decimals with different scale: {}, {}",
-                    self.scale, other.scale
-                );
-                self.value.partial_cmp(&other.value)
-            }
-        }

+// Compare two signed integers encoded in little-endian order.
+// The left and right byte slices must have the same length.
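+//
+// Illustrative worked example (an editorial sketch, not part of the original
+// patch): comparing -1_i16 with 1_i16 as little-endian bytes. -1 encodes as
+// [0xFF, 0xFF] and 1 as [0x01, 0x00]; the most significant byte is the *last*
+// byte, so the left sign bit (0xFF >= 0x80) marks a negative value while the
+// right value is positive, and the function returns Ordering::Less without
+// scanning further bytes. When both signs agree, bytes are compared from most
+// to least significant, exactly as the loop below does.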
+#[inline]
+pub(crate) fn signed_cmp_le_bytes(left: &[u8], right: &[u8]) -> Ordering {
+    assert_eq!(
+        left.len(),
+        right.len(),
+        "Can't compare bytes array with different len: {}, {}",
+        left.len(),
+        right.len()
+    );
+    assert_ne!(left.len(), 0, "Can't compare bytes array of length 0");
+    let len = left.len();
+    // if the sign bit (the most significant bit of the last byte) is 1, the value is negative
+    let left_negative = left[len - 1] >= 0x80_u8;
+    let right_negative = right[len - 1] >= 0x80_u8;
+    if left_negative != right_negative {
+        return match left_negative {
+            true => {
+                // left is negative value
+                // right is positive value
+                Ordering::Less
+            }
+            false => Ordering::Greater,
+        };
+    }
+    for i in 0..len {
+        let l_byte = left[len - 1 - i];
+        let r_byte = right[len - 1 - i];
+        match l_byte.cmp(&r_byte) {
+            Ordering::Less => {
+                return Ordering::Less;
+            }
+            Ordering::Greater => {
+                return Ordering::Greater;
+            }
+            Ordering::Equal => {}
+        }
+    }
+    Ordering::Equal
+}
-        impl Ord for $ty {
-            fn cmp(&self, other: &Self) -> Ordering {
-                assert_eq!(
-                    self.scale, other.scale,
-                    "Cannot compare two Decimals with different scale: {}, {}",
-                    self.scale, other.scale
-                );
-                self.value.cmp(&other.value)
-            }
-        }
-
-        impl PartialEq for $ty {
-            fn eq(&self, other: &Self) -> bool {
-                assert_eq!(
-                    self.scale, other.scale,
-                    "Cannot compare two Decimals with different scale: {}, {}",
-                    self.scale, other.scale
-                );
-                self.value.eq(&other.value)
-            }
-        }
-
-        impl Eq for $ty {}
-    };
-}
-def_decimal!(
-    Decimal128,
-    128,
-    DECIMAL128_MAX_PRECISION,
-    DECIMAL128_MAX_SCALE
-);
-def_decimal!(
-    Decimal256,
-    256,
-    DECIMAL256_MAX_PRECISION,
-    DECIMAL256_MAX_SCALE
-);
-
 #[cfg(test)]
 mod tests {
-    use crate::util::decimal::{BasicDecimal, Decimal128, Decimal256};
+    use super::*;
     use num::{BigInt, Num};
+    use rand::random;

     #[test]
     fn decimal_128_to_string() {
@@ -312,9 +349,9 @@

     #[test]
     fn decimal_256_from_bytes() {
-        let mut bytes = vec![0; 32];
+        let mut bytes = [0_u8; 32];
         bytes[0..16].clone_from_slice(&100_i128.to_le_bytes());
-        let value = Decimal256::try_new_from_bytes(5, 2, bytes.as_slice()).unwrap();
+        let value = Decimal256::try_new_from_bytes(5, 2, &bytes).unwrap();
         assert_eq!(value.to_string(), "1.00");

         bytes[0..16].clone_from_slice(&i128::MAX.to_le_bytes());
@@ -334,7 +371,7 @@
         );

         // smaller than i128 minimum
-        bytes = vec![255; 32];
+        bytes = [255; 32];
         bytes[31] = 128;
         let value = Decimal256::try_new_from_bytes(76, 4, &bytes).unwrap();
         assert_eq!(
             value.to_string(),
             "-574437317700748313234121683441537667865831564552201235664496608164256541.5731"
         );

-        bytes = vec![255; 32];
+        bytes = [255; 32];
         let value = Decimal256::try_new_from_bytes(5, 2, &bytes).unwrap();
         assert_eq!(value.to_string(), "-0.01");
     }
@@ -368,4 +405,68 @@
         let value = Decimal256::from_big_int(&num, 76, 4).unwrap();
         assert_eq!(value.to_string(), "-574437317700748313234121683441537667865831564552201235664496608164256541.5731");
     }
+
+    #[test]
+    fn test_lt_cmp_byte() {
+        // Note: the concrete integer widths below are editorial reconstructions;
+        // the extraction dropped the original type parameters.
+        for _i in 0..100 {
+            let left = random::<i128>();
+            let right = random::<i128>();
+            let result = signed_cmp_le_bytes(
+                left.to_le_bytes().as_slice(),
+                right.to_le_bytes().as_slice(),
+            );
+            assert_eq!(left.cmp(&right), result);
+        }
+        for _i in 0..100 {
+            let left = random::<i64>();
+            let right = random::<i64>();
+            let result = signed_cmp_le_bytes(
+                left.to_le_bytes().as_slice(),
+                right.to_le_bytes().as_slice(),
+            );
+            assert_eq!(left.cmp(&right), result);
+        }
+    }
+
+    #[test]
+    fn compare_decimal128() {
+        let v1 = -100_i128;
+        let v2 = 10000_i128;
+        let right = Decimal128::new_from_i128(20, 3, v2);
+        for v
 in v1..v2 {
+            let left = Decimal128::new_from_i128(20, 3, v);
+            assert!(left < right);
+        }
+
+        for _i in 0..100 {
+            let left = random::<i128>();
+            let right = random::<i128>();
+            let left_decimal = Decimal128::new_from_i128(38, 2, left);
+            let right_decimal = Decimal128::new_from_i128(38, 2, right);
+            assert_eq!(left < right, left_decimal < right_decimal);
+            assert_eq!(left == right, left_decimal == right_decimal)
+        }
+    }
+
+    #[test]
+    fn compare_decimal256() {
+        let v1 = -100_i128;
+        let v2 = 10000_i128;
+        let right = Decimal256::from_big_int(&BigInt::from(v2), 75, 2).unwrap();
+        for v in v1..v2 {
+            let left = Decimal256::from_big_int(&BigInt::from(v), 75, 2).unwrap();
+            assert!(left < right);
+        }
+
+        for _i in 0..100 {
+            let left = random::<i128>();
+            let right = random::<i128>();
+            let left_decimal =
+                Decimal256::from_big_int(&BigInt::from(left), 75, 2).unwrap();
+            let right_decimal =
+                Decimal256::from_big_int(&BigInt::from(right), 75, 2).unwrap();
+            assert_eq!(left < right, left_decimal < right_decimal);
+            assert_eq!(left == right, left_decimal == right_decimal)
+        }
+    }
 }
diff --git a/arrow/src/util/display.rs b/arrow/src/util/display.rs
index c97e0b1aa444..aa4fd4200870 100644
--- a/arrow/src/util/display.rs
+++ b/arrow/src/util/display.rs
@@ -23,7 +23,6 @@ use std::fmt::Write;
 use std::sync::Arc;

 use crate::array::Array;
-use crate::array::BasicDecimalArray;
 use crate::datatypes::{
     ArrowNativeType, ArrowPrimitiveType, DataType, Field, Int16Type, Int32Type,
     Int64Type, Int8Type, TimeUnit, UInt16Type, UInt32Type, UInt64Type, UInt8Type,
@@ -319,7 +318,7 @@ pub fn array_value_to_string(column: &array::ArrayRef, row: usize) -> Result<String> {
         DataType::Float16 => make_string!(array::Float16Array, column, row),
         DataType::Float32 => make_string!(array::Float32Array, column, row),
         DataType::Float64 => make_string!(array::Float64Array, column, row),
-        DataType::Decimal(..) => make_string_from_decimal(column, row),
+        DataType::Decimal128(..) => make_string_from_decimal(column, row),
         DataType::Timestamp(unit, _) if *unit == TimeUnit::Second => {
             make_string_datetime!(array::TimestampSecondArray, column, row)
         }
diff --git a/arrow/src/util/integration_util.rs b/arrow/src/util/integration_util.rs
index aadf0327734d..ee5c947a2fff 100644
--- a/arrow/src/util/integration_util.rs
+++ b/arrow/src/util/integration_util.rs
@@ -19,13 +19,22 @@
 //!
 //! These utilities define structs that read the integration JSON format for integration testing purposes.
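+//!
+//! A rough illustrative sketch of the JSON these structs consume (field names
+//! abbreviated; the Arrow integration-testing JSON specification is the
+//! authoritative reference, so treat this as an assumption-laden example):
+//!
+//! ```json
+//! {
+//!   "schema": { "fields": [ { "name": "c1", "type": { "name": "int", "bitWidth": 32, "isSigned": true }, "nullable": true, "children": [] } ] },
+//!   "batches": [ { "count": 1, "columns": [ { "name": "c1", "count": 1, "VALIDITY": [1], "DATA": [7] } ] } ]
+//! }
+//! ```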
+use hex::decode;
+use num::BigInt;
+use num::Signed;
 use serde_derive::{Deserialize, Serialize};
-use serde_json::{Map as SJMap, Number as VNumber, Value};
+use serde_json::{Map as SJMap, Value};
+use std::collections::HashMap;
+use std::sync::Arc;

 use crate::array::*;
+use crate::buffer::{Buffer, MutableBuffer};
+use crate::compute;
 use crate::datatypes::*;
-use crate::error::Result;
+use crate::error::{ArrowError, Result};
 use crate::record_batch::{RecordBatch, RecordBatchReader};
+use crate::util::bit_util;
+use crate::util::decimal::Decimal256;

 /// A struct that represents an Arrow file with a schema and record batches
 #[derive(Deserialize, Serialize, Debug)]
@@ -42,6 +51,8 @@ pub struct ArrowJson {
 #[derive(Deserialize, Serialize, Debug)]
 pub struct ArrowJsonSchema {
     pub fields: Vec<ArrowJsonField>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub metadata: Option<Vec<HashMap<String, String>>>,
 }

 /// Fields are left as JSON `Value` as they vary by `DataType`
@@ -107,14 +118,14 @@ pub struct DictionaryIndexType {
 }

 /// A struct that partially reads the Arrow JSON record batch
-#[derive(Deserialize, Serialize, Debug)]
+#[derive(Deserialize, Serialize, Debug, Clone)]
 pub struct ArrowJsonBatch {
     count: usize,
     pub columns: Vec<ArrowJsonColumn>,
 }

 /// A struct that partially reads the Arrow JSON dictionary batch
-#[derive(Deserialize, Serialize, Debug)]
+#[derive(Deserialize, Serialize, Debug, Clone)]
 #[allow(non_snake_case)]
 pub struct ArrowJsonDictionaryBatch {
     pub id: i64,
@@ -139,17 +150,45 @@ pub struct ArrowJsonColumn {

 impl ArrowJson {
     /// Compare the Arrow JSON with a record batch reader
-    pub fn equals_reader(&self, reader: &mut dyn RecordBatchReader) -> bool {
+    pub fn equals_reader(&self, reader: &mut dyn RecordBatchReader) -> Result<bool> {
         if !self.schema.equals_schema(&reader.schema()) {
-            return false;
+            return Ok(false);
         }
-        self.batches.iter().all(|col| {
+
+        for json_batch in self.get_record_batches()?.into_iter() {
             let batch = reader.next();
             match batch {
-                Some(Ok(batch)) => col.equals_batch(&batch),
-                _ => false,
+                Some(Ok(batch)) => {
+                    if json_batch != batch {
+                        println!("json: {:?}", json_batch);
+                        println!("batch: {:?}", batch);
+                        return Ok(false);
+                    }
+                }
+                _ => return Ok(false),
             }
-        })
+        }
+
+        Ok(true)
+    }
+
+    pub fn get_record_batches(&self) -> Result<Vec<RecordBatch>> {
+        let schema = self.schema.to_arrow_schema()?;
+
+        let mut dictionaries = HashMap::new();
+        self.dictionaries.iter().for_each(|dict_batches| {
+            dict_batches.iter().for_each(|d| {
+                dictionaries.insert(d.id, d.clone());
+            });
+        });
+
+        let batches: Result<Vec<RecordBatch>> = self
+            .batches
+            .iter()
+            .map(|col| record_batch_from_json(&schema, col.clone(), Some(&dictionaries)))
+            .collect();
+
+        batches
     }
 }

@@ -169,6 +208,28 @@ impl ArrowJsonSchema {
         }
         true
     }
+
+    fn to_arrow_schema(&self) -> Result<Schema> {
+        let arrow_fields: Result<Vec<Field>> = self
+            .fields
+            .iter()
+            .map(|field| field.to_arrow_field())
+            .collect();
+
+        if let Some(metadatas) = &self.metadata {
+            let mut metadata: HashMap<String, String> = HashMap::new();
+
+            metadatas.iter().for_each(|pair| {
+                let key = pair.get("key").unwrap();
+                let value = pair.get("value").unwrap();
+                metadata.insert(key.clone(), value.clone());
+            });
+
+            Ok(Schema::new_with_metadata(arrow_fields?, metadata))
+        } else {
+            Ok(Schema::new(arrow_fields?))
+        }
+    }
 }

 impl ArrowJsonField {
@@ -199,251 +260,731 @@
     }
 }

-impl ArrowJsonBatch {
-    /// Compare the Arrow JSON record batch with a `RecordBatch`
-    fn equals_batch(&self, batch: &RecordBatch) -> bool {
-        if self.count != batch.num_rows() {
-            return false;
+pub fn record_batch_from_json(
+    schema:
&Schema, + json_batch: ArrowJsonBatch, + json_dictionaries: Option<&HashMap>, +) -> Result { + let mut columns = vec![]; + + for (field, json_col) in schema.fields().iter().zip(json_batch.columns) { + let col = array_from_json(field, json_col, json_dictionaries)?; + columns.push(col); + } + + RecordBatch::try_new(Arc::new(schema.clone()), columns) +} + +/// Construct an Arrow array from a partially typed JSON column +pub fn array_from_json( + field: &Field, + json_col: ArrowJsonColumn, + dictionaries: Option<&HashMap>, +) -> Result { + match field.data_type() { + DataType::Null => Ok(Arc::new(NullArray::new(json_col.count))), + DataType::Boolean => { + let mut b = BooleanBuilder::new(json_col.count); + for (is_valid, value) in json_col + .validity + .as_ref() + .unwrap() + .iter() + .zip(json_col.data.unwrap()) + { + match is_valid { + 1 => b.append_value(value.as_bool().unwrap()), + _ => b.append_null(), + }; + } + Ok(Arc::new(b.finish())) } - let num_columns = self.columns.len(); - if num_columns != batch.num_columns() { - return false; + DataType::Int8 => { + let mut b = Int8Builder::new(json_col.count); + for (is_valid, value) in json_col + .validity + .as_ref() + .unwrap() + .iter() + .zip(json_col.data.unwrap()) + { + match is_valid { + 1 => b.append_value(value.as_i64().ok_or_else(|| { + ArrowError::JsonError(format!( + "Unable to get {:?} as int64", + value + )) + })? as i8), + _ => b.append_null(), + }; + } + Ok(Arc::new(b.finish())) } - let schema = batch.schema(); - self.columns - .iter() - .zip(batch.columns()) - .zip(schema.fields()) - .all(|((col, arr), field)| { - // compare each column based on its type - if &col.name != field.name() { - return false; - } - let json_array: Vec = json_from_col(col, field.data_type()); - match field.data_type() { - DataType::Null => { - let arr: &NullArray = - arr.as_any().downcast_ref::().unwrap(); - // NullArrays should have the same length, json_array is empty - arr.len() == col.count - } - DataType::Boolean => { - let arr = arr.as_any().downcast_ref::().unwrap(); - arr.equals_json(&json_array.iter().collect::>()[..]) - } - DataType::Int8 => { - let arr = arr.as_any().downcast_ref::().unwrap(); - arr.equals_json(&json_array.iter().collect::>()[..]) - } - DataType::Int16 => { - let arr = arr.as_any().downcast_ref::().unwrap(); - arr.equals_json(&json_array.iter().collect::>()[..]) - } - DataType::Int32 | DataType::Date32 | DataType::Time32(_) => { - let arr = Int32Array::from(arr.data().clone()); - arr.equals_json(&json_array.iter().collect::>()[..]) - } - DataType::Int64 - | DataType::Date64 - | DataType::Time64(_) - | DataType::Timestamp(_, _) - | DataType::Duration(_) => { - let arr = Int64Array::from(arr.data().clone()); - arr.equals_json(&json_array.iter().collect::>()[..]) - } - DataType::Interval(IntervalUnit::YearMonth) => { - let arr = IntervalYearMonthArray::from(arr.data().clone()); - arr.equals_json(&json_array.iter().collect::>()[..]) - } - DataType::Interval(IntervalUnit::DayTime) => { - let arr = IntervalDayTimeArray::from(arr.data().clone()); - let x = json_array - .iter() - .map(|v| { - match v { - Value::Null => Value::Null, - Value::Object(v) => { - // interval has days and milliseconds - let days: i32 = - v.get("days").unwrap().as_i64().unwrap() - as i32; - let milliseconds: i32 = v - .get("milliseconds") - .unwrap() - .as_i64() - .unwrap() - as i32; - let value: i64 = unsafe { - std::mem::transmute::<[i32; 2], i64>([ - days, - milliseconds, - ]) - }; - Value::Number(VNumber::from(value)) + DataType::Int16 => { + let mut 
b = Int16Builder::new(json_col.count); + for (is_valid, value) in json_col + .validity + .as_ref() + .unwrap() + .iter() + .zip(json_col.data.unwrap()) + { + match is_valid { + 1 => b.append_value(value.as_i64().unwrap() as i16), + _ => b.append_null(), + }; + } + Ok(Arc::new(b.finish())) + } + DataType::Int32 + | DataType::Date32 + | DataType::Time32(_) + | DataType::Interval(IntervalUnit::YearMonth) => { + let mut b = Int32Builder::new(json_col.count); + for (is_valid, value) in json_col + .validity + .as_ref() + .unwrap() + .iter() + .zip(json_col.data.unwrap()) + { + match is_valid { + 1 => b.append_value(value.as_i64().unwrap() as i32), + _ => b.append_null(), + }; + } + let array = Arc::new(b.finish()) as ArrayRef; + compute::cast(&array, field.data_type()) + } + DataType::Int64 + | DataType::Date64 + | DataType::Time64(_) + | DataType::Timestamp(_, _) + | DataType::Duration(_) + | DataType::Interval(IntervalUnit::DayTime) => { + let mut b = Int64Builder::new(json_col.count); + for (is_valid, value) in json_col + .validity + .as_ref() + .unwrap() + .iter() + .zip(json_col.data.unwrap()) + { + match is_valid { + 1 => b.append_value(match value { + Value::Number(n) => n.as_i64().unwrap(), + Value::String(s) => { + s.parse().expect("Unable to parse string as i64") + } + Value::Object(ref map) + if map.contains_key("days") + && map.contains_key("milliseconds") => + { + match field.data_type() { + DataType::Interval(IntervalUnit::DayTime) => { + let days = map.get("days").unwrap(); + let milliseconds = map.get("milliseconds").unwrap(); + + match (days, milliseconds) { + (Value::Number(d), Value::Number(m)) => { + let mut bytes = [0_u8; 8]; + let m = (m.as_i64().unwrap() as i32) + .to_le_bytes(); + let d = (d.as_i64().unwrap() as i32) + .to_le_bytes(); + + let c = [d, m].concat(); + bytes.copy_from_slice(c.as_slice()); + i64::from_le_bytes(bytes) + } + _ => panic!( + "Unable to parse {:?} as interval daytime", + value + ), } - // return null if Value is not an object - _ => Value::Null, } - }) - .collect::>(); - arr.equals_json(&x.iter().collect::>()[..]) - } - DataType::Interval(IntervalUnit::MonthDayNano) => { - let arr = IntervalMonthDayNanoArray::from(arr.data().clone()); - arr.equals_json(&json_array.iter().collect::>()[..]) - } - DataType::UInt8 => { - let arr = arr.as_any().downcast_ref::().unwrap(); - arr.equals_json(&json_array.iter().collect::>()[..]) - } - DataType::UInt16 => { - let arr = arr.as_any().downcast_ref::().unwrap(); - arr.equals_json(&json_array.iter().collect::>()[..]) - } - DataType::UInt32 => { - let arr = arr.as_any().downcast_ref::().unwrap(); - arr.equals_json(&json_array.iter().collect::>()[..]) - } - DataType::UInt64 => { - let arr = arr.as_any().downcast_ref::().unwrap(); - arr.equals_json(&json_array.iter().collect::>()[..]) - } - DataType::Float32 => { - let arr = arr.as_any().downcast_ref::().unwrap(); - arr.equals_json(&json_array.iter().collect::>()[..]) - } - DataType::Float64 => { - let arr = arr.as_any().downcast_ref::().unwrap(); - arr.equals_json(&json_array.iter().collect::>()[..]) - } - DataType::Binary => { - let arr = arr.as_any().downcast_ref::().unwrap(); - arr.equals_json(&json_array.iter().collect::>()[..]) - } - DataType::LargeBinary => { - let arr = - arr.as_any().downcast_ref::().unwrap(); - arr.equals_json(&json_array.iter().collect::>()[..]) - } - DataType::FixedSizeBinary(_) => { - let arr = - arr.as_any().downcast_ref::().unwrap(); - arr.equals_json(&json_array.iter().collect::>()[..]) - } - DataType::Utf8 => { - let arr = 
arr.as_any().downcast_ref::().unwrap(); - arr.equals_json(&json_array.iter().collect::>()[..]) - } - DataType::LargeUtf8 => { - let arr = - arr.as_any().downcast_ref::().unwrap(); - arr.equals_json(&json_array.iter().collect::>()[..]) - } - DataType::List(_) => { - let arr = arr.as_any().downcast_ref::().unwrap(); - arr.equals_json(&json_array.iter().collect::>()[..]) - } - DataType::LargeList(_) => { - let arr = arr.as_any().downcast_ref::().unwrap(); - arr.equals_json(&json_array.iter().collect::>()[..]) - } - DataType::FixedSizeList(_, _) => { - let arr = - arr.as_any().downcast_ref::().unwrap(); - arr.equals_json(&json_array.iter().collect::>()[..]) - } - DataType::Struct(_) => { - let arr = arr.as_any().downcast_ref::().unwrap(); - arr.equals_json(&json_array.iter().collect::>()[..]) - } - DataType::Map(_, _) => { - let arr = arr.as_any().downcast_ref::().unwrap(); - arr.equals_json(&json_array.iter().collect::>()[..]) - } - DataType::Decimal(_, _) => { - let arr = arr.as_any().downcast_ref::().unwrap(); - arr.equals_json(&json_array.iter().collect::>()[..]) - } - DataType::Dictionary(ref key_type, _) => match key_type.as_ref() { - DataType::Int8 => { - let arr = arr - .as_any() - .downcast_ref::() - .unwrap(); - arr.equals_json( - &json_array.iter().collect::>()[..], - ) - } - DataType::Int16 => { - let arr = arr - .as_any() - .downcast_ref::() - .unwrap(); - arr.equals_json( - &json_array.iter().collect::>()[..], - ) - } - DataType::Int32 => { - let arr = arr - .as_any() - .downcast_ref::() - .unwrap(); - arr.equals_json( - &json_array.iter().collect::>()[..], - ) - } - DataType::Int64 => { - let arr = arr - .as_any() - .downcast_ref::() - .unwrap(); - arr.equals_json( - &json_array.iter().collect::>()[..], - ) + _ => panic!( + "Unable to parse {:?} as interval daytime", + value + ), + } } - DataType::UInt8 => { - let arr = arr - .as_any() - .downcast_ref::() - .unwrap(); - arr.equals_json( - &json_array.iter().collect::>()[..], + _ => panic!("Unable to parse {:?} as number", value), + }), + _ => b.append_null(), + }; + } + let array = Arc::new(b.finish()) as ArrayRef; + compute::cast(&array, field.data_type()) + } + DataType::UInt8 => { + let mut b = UInt8Builder::new(json_col.count); + for (is_valid, value) in json_col + .validity + .as_ref() + .unwrap() + .iter() + .zip(json_col.data.unwrap()) + { + match is_valid { + 1 => b.append_value(value.as_u64().unwrap() as u8), + _ => b.append_null(), + }; + } + Ok(Arc::new(b.finish())) + } + DataType::UInt16 => { + let mut b = UInt16Builder::new(json_col.count); + for (is_valid, value) in json_col + .validity + .as_ref() + .unwrap() + .iter() + .zip(json_col.data.unwrap()) + { + match is_valid { + 1 => b.append_value(value.as_u64().unwrap() as u16), + _ => b.append_null(), + }; + } + Ok(Arc::new(b.finish())) + } + DataType::UInt32 => { + let mut b = UInt32Builder::new(json_col.count); + for (is_valid, value) in json_col + .validity + .as_ref() + .unwrap() + .iter() + .zip(json_col.data.unwrap()) + { + match is_valid { + 1 => b.append_value(value.as_u64().unwrap() as u32), + _ => b.append_null(), + }; + } + Ok(Arc::new(b.finish())) + } + DataType::UInt64 => { + let mut b = UInt64Builder::new(json_col.count); + for (is_valid, value) in json_col + .validity + .as_ref() + .unwrap() + .iter() + .zip(json_col.data.unwrap()) + { + match is_valid { + 1 => { + if value.is_string() { + b.append_value( + value + .as_str() + .unwrap() + .parse() + .expect("Unable to parse string as u64"), ) - } - DataType::UInt16 => { - let arr = arr - .as_any() - 
.downcast_ref::() - .unwrap(); - arr.equals_json( - &json_array.iter().collect::>()[..], + } else if value.is_number() { + b.append_value( + value.as_u64().expect("Unable to read number as u64"), ) + } else { + panic!("Unable to parse value {:?} as u64", value) } - DataType::UInt32 => { - let arr = arr - .as_any() - .downcast_ref::() - .unwrap(); - arr.equals_json( - &json_array.iter().collect::>()[..], - ) + } + _ => b.append_null(), + }; + } + Ok(Arc::new(b.finish())) + } + DataType::Interval(IntervalUnit::MonthDayNano) => { + let mut b = IntervalMonthDayNanoBuilder::new(json_col.count); + for (is_valid, value) in json_col + .validity + .as_ref() + .unwrap() + .iter() + .zip(json_col.data.unwrap()) + { + match is_valid { + 1 => b.append_value(match value { + Value::Object(v) => { + let months = v.get("months").unwrap(); + let days = v.get("days").unwrap(); + let nanoseconds = v.get("nanoseconds").unwrap(); + match (months, days, nanoseconds) { + ( + Value::Number(months), + Value::Number(days), + Value::Number(nanoseconds), + ) => { + let months = months.as_i64().unwrap() as i32; + let days = days.as_i64().unwrap() as i32; + let nanoseconds = nanoseconds.as_i64().unwrap(); + let months_days_ns: i128 = ((nanoseconds as i128) + & 0xFFFFFFFFFFFFFFFF) + << 64 + | ((days as i128) & 0xFFFFFFFF) << 32 + | ((months as i128) & 0xFFFFFFFF); + months_days_ns + } + (_, _, _) => { + panic!("Unable to parse {:?} as MonthDayNano", v) + } + } } - DataType::UInt64 => { - let arr = arr - .as_any() - .downcast_ref::() + _ => panic!("Unable to parse {:?} as MonthDayNano", value), + }), + _ => b.append_null(), + }; + } + Ok(Arc::new(b.finish())) + } + DataType::Float32 => { + let mut b = Float32Builder::new(json_col.count); + for (is_valid, value) in json_col + .validity + .as_ref() + .unwrap() + .iter() + .zip(json_col.data.unwrap()) + { + match is_valid { + 1 => b.append_value(value.as_f64().unwrap() as f32), + _ => b.append_null(), + }; + } + Ok(Arc::new(b.finish())) + } + DataType::Float64 => { + let mut b = Float64Builder::new(json_col.count); + for (is_valid, value) in json_col + .validity + .as_ref() + .unwrap() + .iter() + .zip(json_col.data.unwrap()) + { + match is_valid { + 1 => b.append_value(value.as_f64().unwrap()), + _ => b.append_null(), + }; + } + Ok(Arc::new(b.finish())) + } + DataType::Binary => { + let mut b = BinaryBuilder::new(json_col.count); + for (is_valid, value) in json_col + .validity + .as_ref() + .unwrap() + .iter() + .zip(json_col.data.unwrap()) + { + match is_valid { + 1 => { + let v = decode(value.as_str().unwrap()).unwrap(); + b.append_value(&v) + } + _ => b.append_null(), + }; + } + Ok(Arc::new(b.finish())) + } + DataType::LargeBinary => { + let mut b = LargeBinaryBuilder::new(json_col.count); + for (is_valid, value) in json_col + .validity + .as_ref() + .unwrap() + .iter() + .zip(json_col.data.unwrap()) + { + match is_valid { + 1 => { + let v = decode(value.as_str().unwrap()).unwrap(); + b.append_value(&v) + } + _ => b.append_null(), + }; + } + Ok(Arc::new(b.finish())) + } + DataType::Utf8 => { + let mut b = StringBuilder::new(json_col.count); + for (is_valid, value) in json_col + .validity + .as_ref() + .unwrap() + .iter() + .zip(json_col.data.unwrap()) + { + match is_valid { + 1 => b.append_value(value.as_str().unwrap()), + _ => b.append_null(), + }; + } + Ok(Arc::new(b.finish())) + } + DataType::LargeUtf8 => { + let mut b = LargeStringBuilder::new(json_col.count); + for (is_valid, value) in json_col + .validity + .as_ref() + .unwrap() + .iter() + 
.zip(json_col.data.unwrap()) + { + match is_valid { + 1 => b.append_value(value.as_str().unwrap()), + _ => b.append_null(), + }; + } + Ok(Arc::new(b.finish())) + } + DataType::FixedSizeBinary(len) => { + let mut b = FixedSizeBinaryBuilder::new(json_col.count, *len); + for (is_valid, value) in json_col + .validity + .as_ref() + .unwrap() + .iter() + .zip(json_col.data.unwrap()) + { + match is_valid { + 1 => { + let v = hex::decode(value.as_str().unwrap()).unwrap(); + b.append_value(&v)? + } + _ => b.append_null(), + }; + } + Ok(Arc::new(b.finish())) + } + DataType::List(child_field) => { + let null_buf = create_null_buf(&json_col); + let children = json_col.children.clone().unwrap(); + let child_array = array_from_json( + child_field, + children.get(0).unwrap().clone(), + dictionaries, + )?; + let offsets: Vec = json_col + .offset + .unwrap() + .iter() + .map(|v| v.as_i64().unwrap() as i32) + .collect(); + let list_data = ArrayData::builder(field.data_type().clone()) + .len(json_col.count) + .offset(0) + .add_buffer(Buffer::from(&offsets.to_byte_slice())) + .add_child_data(child_array.into_data()) + .null_bit_buffer(Some(null_buf)) + .build() + .unwrap(); + Ok(Arc::new(ListArray::from(list_data))) + } + DataType::LargeList(child_field) => { + let null_buf = create_null_buf(&json_col); + let children = json_col.children.clone().unwrap(); + let child_array = array_from_json( + child_field, + children.get(0).unwrap().clone(), + dictionaries, + )?; + let offsets: Vec = json_col + .offset + .unwrap() + .iter() + .map(|v| match v { + Value::Number(n) => n.as_i64().unwrap(), + Value::String(s) => s.parse::().unwrap(), + _ => panic!("64-bit offset must be either string or number"), + }) + .collect(); + let list_data = ArrayData::builder(field.data_type().clone()) + .len(json_col.count) + .offset(0) + .add_buffer(Buffer::from(&offsets.to_byte_slice())) + .add_child_data(child_array.into_data()) + .null_bit_buffer(Some(null_buf)) + .build() + .unwrap(); + Ok(Arc::new(LargeListArray::from(list_data))) + } + DataType::FixedSizeList(child_field, _) => { + let children = json_col.children.clone().unwrap(); + let child_array = array_from_json( + child_field, + children.get(0).unwrap().clone(), + dictionaries, + )?; + let null_buf = create_null_buf(&json_col); + let list_data = ArrayData::builder(field.data_type().clone()) + .len(json_col.count) + .add_child_data(child_array.into_data()) + .null_bit_buffer(Some(null_buf)) + .build() + .unwrap(); + Ok(Arc::new(FixedSizeListArray::from(list_data))) + } + DataType::Struct(fields) => { + // construct struct with null data + let null_buf = create_null_buf(&json_col); + let mut array_data = ArrayData::builder(field.data_type().clone()) + .len(json_col.count) + .null_bit_buffer(Some(null_buf)); + + for (field, col) in fields.iter().zip(json_col.children.unwrap()) { + let array = array_from_json(field, col, dictionaries)?; + array_data = array_data.add_child_data(array.into_data()); + } + + let array = StructArray::from(array_data.build().unwrap()); + Ok(Arc::new(array)) + } + DataType::Dictionary(key_type, value_type) => { + let dict_id = field.dict_id().ok_or_else(|| { + ArrowError::JsonError(format!( + "Unable to find dict_id for field {:?}", + field + )) + })?; + // find dictionary + let dictionary = dictionaries + .ok_or_else(|| { + ArrowError::JsonError(format!( + "Unable to find any dictionaries for field {:?}", + field + )) + })? 
+ .get(&dict_id); + match dictionary { + Some(dictionary) => dictionary_array_from_json( + field, + json_col, + key_type, + value_type, + dictionary, + dictionaries, + ), + None => Err(ArrowError::JsonError(format!( + "Unable to find dictionary for field {:?}", + field + ))), + } + } + DataType::Decimal128(precision, scale) => { + let mut b = Decimal128Builder::new(json_col.count, *precision, *scale); + // C++ interop tests involve incompatible decimal values + unsafe { + b.disable_value_validation(); + } + for (is_valid, value) in json_col + .validity + .as_ref() + .unwrap() + .iter() + .zip(json_col.data.unwrap()) + { + match is_valid { + 1 => { + b.append_value(value.as_str().unwrap().parse::().unwrap())? + } + _ => b.append_null(), + }; + } + Ok(Arc::new(b.finish())) + } + DataType::Decimal256(precision, scale) => { + let mut b = Decimal256Builder::new(json_col.count, *precision, *scale); + // C++ interop tests involve incompatible decimal values + unsafe { + b.disable_value_validation(); + } + for (is_valid, value) in json_col + .validity + .as_ref() + .unwrap() + .iter() + .zip(json_col.data.unwrap()) + { + match is_valid { + 1 => { + let str = value.as_str().unwrap(); + let integer = BigInt::parse_bytes(str.as_bytes(), 10).unwrap(); + let integer_bytes = integer.to_signed_bytes_le(); + let mut bytes = if integer.is_positive() { + [0_u8; 32] + } else { + [255_u8; 32] + }; + bytes[0..integer_bytes.len()] + .copy_from_slice(integer_bytes.as_slice()); + let decimal = + Decimal256::try_new_from_bytes(*precision, *scale, &bytes) .unwrap(); - arr.equals_json( - &json_array.iter().collect::>()[..], - ) - } - t => panic!("Unsupported dictionary comparison for {:?}", t), - }, - t => panic!("Unsupported comparison for {:?}", t), + b.append_value(&decimal)?; + } + _ => b.append_null(), } - }) + } + Ok(Arc::new(b.finish())) + } + DataType::Map(child_field, _) => { + let null_buf = create_null_buf(&json_col); + let children = json_col.children.clone().unwrap(); + let child_array = array_from_json( + child_field, + children.get(0).unwrap().clone(), + dictionaries, + )?; + let offsets: Vec = json_col + .offset + .unwrap() + .iter() + .map(|v| v.as_i64().unwrap() as i32) + .collect(); + let array_data = ArrayData::builder(field.data_type().clone()) + .len(json_col.count) + .add_buffer(Buffer::from(&offsets.to_byte_slice())) + .add_child_data(child_array.into_data()) + .null_bit_buffer(Some(null_buf)) + .build() + .unwrap(); + + let array = MapArray::from(array_data); + Ok(Arc::new(array)) + } + DataType::Union(fields, field_type_ids, _) => { + let type_ids = if let Some(type_id) = json_col.type_id { + type_id + } else { + return Err(ArrowError::JsonError( + "Cannot find expected type_id in json column".to_string(), + )); + }; + + let offset: Option = json_col.offset.map(|offsets| { + let offsets: Vec = + offsets.iter().map(|v| v.as_i64().unwrap() as i32).collect(); + Buffer::from(&offsets.to_byte_slice()) + }); + + let mut children: Vec<(Field, Arc)> = vec![]; + for (field, col) in fields.iter().zip(json_col.children.unwrap()) { + let array = array_from_json(field, col, dictionaries)?; + children.push((field.clone(), array)); + } + + let array = UnionArray::try_new( + field_type_ids, + Buffer::from(&type_ids.to_byte_slice()), + offset, + children, + ) + .unwrap(); + Ok(Arc::new(array)) + } + t => Err(ArrowError::JsonError(format!( + "data type {:?} not supported", + t + ))), + } +} + +pub fn dictionary_array_from_json( + field: &Field, + json_col: ArrowJsonColumn, + dict_key: &DataType, + 
dict_value: &DataType, + dictionary: &ArrowJsonDictionaryBatch, + dictionaries: Option<&HashMap>, +) -> Result { + match dict_key { + DataType::Int8 + | DataType::Int16 + | DataType::Int32 + | DataType::Int64 + | DataType::UInt8 + | DataType::UInt16 + | DataType::UInt32 + | DataType::UInt64 => { + let null_buf = create_null_buf(&json_col); + + // build the key data into a buffer, then construct values separately + let key_field = Field::new_dict( + "key", + dict_key.clone(), + field.is_nullable(), + field + .dict_id() + .expect("Dictionary fields must have a dict_id value"), + field + .dict_is_ordered() + .expect("Dictionary fields must have a dict_is_ordered value"), + ); + let keys = array_from_json(&key_field, json_col, None)?; + // note: not enough info on nullability of dictionary + let value_field = Field::new("value", dict_value.clone(), true); + let values = array_from_json( + &value_field, + dictionary.data.columns[0].clone(), + dictionaries, + )?; + + // convert key and value to dictionary data + let dict_data = ArrayData::builder(field.data_type().clone()) + .len(keys.len()) + .add_buffer(keys.data().buffers()[0].clone()) + .null_bit_buffer(Some(null_buf)) + .add_child_data(values.into_data()) + .build() + .unwrap(); + + let array = match dict_key { + DataType::Int8 => { + Arc::new(Int8DictionaryArray::from(dict_data)) as ArrayRef + } + DataType::Int16 => Arc::new(Int16DictionaryArray::from(dict_data)), + DataType::Int32 => Arc::new(Int32DictionaryArray::from(dict_data)), + DataType::Int64 => Arc::new(Int64DictionaryArray::from(dict_data)), + DataType::UInt8 => Arc::new(UInt8DictionaryArray::from(dict_data)), + DataType::UInt16 => Arc::new(UInt16DictionaryArray::from(dict_data)), + DataType::UInt32 => Arc::new(UInt32DictionaryArray::from(dict_data)), + DataType::UInt64 => Arc::new(UInt64DictionaryArray::from(dict_data)), + _ => unreachable!(), + }; + Ok(array) + } + _ => Err(ArrowError::JsonError(format!( + "Dictionary key type {:?} not supported", + dict_key + ))), } +} +/// A helper to create a null buffer from a Vec +fn create_null_buf(json_col: &ArrowJsonColumn) -> Buffer { + let num_bytes = bit_util::ceil(json_col.count, 8); + let mut null_buf = MutableBuffer::new(num_bytes).with_bitset(num_bytes, false); + json_col + .validity + .clone() + .unwrap() + .iter() + .enumerate() + .for_each(|(i, v)| { + let null_slice = null_buf.as_slice_mut(); + if *v != 0 { + bit_util::set_bit(null_slice, i); + } + }); + null_buf.into() +} + +impl ArrowJsonBatch { pub fn from_batch(batch: &RecordBatch) -> ArrowJsonBatch { let mut json_batch = ArrowJsonBatch { count: batch.num_rows(), @@ -496,217 +1037,6 @@ impl ArrowJsonBatch { } } -/// Convert an Arrow JSON column/array into a vector of `Value` -fn json_from_col(col: &ArrowJsonColumn, data_type: &DataType) -> Vec { - match data_type { - DataType::List(field) => json_from_list_col(col, field.data_type()), - DataType::FixedSizeList(field, list_size) => { - json_from_fixed_size_list_col(col, field.data_type(), *list_size as usize) - } - DataType::Struct(fields) => json_from_struct_col(col, fields), - DataType::Map(field, keys_sorted) => json_from_map_col(col, field, *keys_sorted), - DataType::Int64 - | DataType::UInt64 - | DataType::Date64 - | DataType::Time64(_) - | DataType::Timestamp(_, _) - | DataType::Duration(_) => { - // convert int64 data from strings to numbers - let converted_col: Vec = col - .data - .clone() - .unwrap() - .iter() - .map(|v| { - Value::Number(match v { - Value::Number(number) => number.clone(), - Value::String(string) 
=> VNumber::from( - string - .parse::() - .expect("Unable to parse string as i64"), - ), - t => panic!("Cannot convert {} to number", t), - }) - }) - .collect(); - merge_json_array( - col.validity.as_ref().unwrap().as_slice(), - converted_col.as_slice(), - ) - } - DataType::Null => vec![], - _ => merge_json_array( - col.validity.as_ref().unwrap().as_slice(), - &col.data.clone().unwrap(), - ), - } -} - -/// Merge VALIDITY and DATA vectors from a primitive data type into a `Value` vector with nulls -fn merge_json_array(validity: &[u8], data: &[Value]) -> Vec { - validity - .iter() - .zip(data) - .map(|(v, d)| match v { - 0 => Value::Null, - 1 => d.clone(), - _ => panic!("Validity data should be 0 or 1"), - }) - .collect() -} - -/// Convert an Arrow JSON column/array of a `DataType::Struct` into a vector of `Value` -fn json_from_struct_col(col: &ArrowJsonColumn, fields: &[Field]) -> Vec { - let mut values = Vec::with_capacity(col.count); - - let children: Vec> = col - .children - .clone() - .unwrap() - .iter() - .zip(fields) - .map(|(child, field)| json_from_col(child, field.data_type())) - .collect(); - - // create a struct from children - for j in 0..col.count { - let mut map = serde_json::map::Map::new(); - for i in 0..children.len() { - map.insert(fields[i].name().to_string(), children[i][j].clone()); - } - values.push(Value::Object(map)); - } - - values -} - -/// Convert an Arrow JSON column/array of a `DataType::List` into a vector of `Value` -fn json_from_list_col(col: &ArrowJsonColumn, data_type: &DataType) -> Vec { - let mut values = Vec::with_capacity(col.count); - - // get the inner array - let child = &col.children.clone().expect("list type must have children")[0]; - let offsets: Vec = col - .offset - .clone() - .unwrap() - .iter() - .map(|o| match o { - Value::String(s) => s.parse::().unwrap(), - Value::Number(n) => n.as_u64().unwrap() as usize, - _ => panic!( - "Offsets should be numbers or strings that are convertible to numbers" - ), - }) - .collect(); - let inner = match data_type { - DataType::List(ref field) => json_from_col(child, field.data_type()), - DataType::Struct(fields) => json_from_struct_col(col, fields), - _ => merge_json_array( - child.validity.as_ref().unwrap().as_slice(), - &child.data.clone().unwrap(), - ), - }; - - for i in 0..col.count { - match &col.validity { - Some(validity) => match &validity[i] { - 0 => values.push(Value::Null), - 1 => { - values.push(Value::Array(inner[offsets[i]..offsets[i + 1]].to_vec())) - } - _ => panic!("Validity data should be 0 or 1"), - }, - None => { - // Null type does not have a validity vector - } - } - } - - values -} - -/// Convert an Arrow JSON column/array of a `DataType::List` into a vector of `Value` -fn json_from_fixed_size_list_col( - col: &ArrowJsonColumn, - data_type: &DataType, - list_size: usize, -) -> Vec { - let mut values = Vec::with_capacity(col.count); - - // get the inner array - let child = &col.children.clone().expect("list type must have children")[0]; - let inner = match data_type { - DataType::List(ref field) => json_from_col(child, field.data_type()), - DataType::FixedSizeList(ref field, _) => json_from_col(child, field.data_type()), - DataType::Struct(fields) => json_from_struct_col(col, fields), - _ => merge_json_array( - child.validity.as_ref().unwrap().as_slice(), - &child.data.clone().unwrap(), - ), - }; - - for i in 0..col.count { - match &col.validity { - Some(validity) => match &validity[i] { - 0 => values.push(Value::Null), - 1 => values.push(Value::Array( - inner[(list_size * 
i)..(list_size * (i + 1))].to_vec(), - )), - _ => panic!("Validity data should be 0 or 1"), - }, - None => {} - } - } - - values -} - -fn json_from_map_col( - col: &ArrowJsonColumn, - field: &Field, - _keys_sorted: bool, -) -> Vec { - let mut values = Vec::with_capacity(col.count); - - // get the inner array - let child = &col.children.clone().expect("list type must have children")[0]; - let offsets: Vec = col - .offset - .clone() - .unwrap() - .iter() - .map(|o| match o { - Value::String(s) => s.parse::().unwrap(), - Value::Number(n) => n.as_u64().unwrap() as usize, - _ => panic!( - "Offsets should be numbers or strings that are convertible to numbers" - ), - }) - .collect(); - - let inner = match field.data_type() { - DataType::Struct(fields) => json_from_struct_col(child, fields), - _ => panic!("Map child must be Struct"), - }; - - for i in 0..col.count { - match &col.validity { - Some(validity) => match &validity[i] { - 0 => values.push(Value::Null), - 1 => { - values.push(Value::Array(inner[offsets[i]..offsets[i + 1]].to_vec())) - } - _ => panic!("Validity data should be 0 or 1"), - }, - None => { - // Null type does not have a validity vector - } - } - } - - values -} #[cfg(test)] mod tests { use super::*; @@ -945,22 +1275,25 @@ mod tests { .len(3) .add_buffer(value_offsets) .add_child_data(value_data.into_data()) + .null_bit_buffer(Some(Buffer::from([0b00000011]))) .build() .unwrap(); let lists = ListArray::from(list_data); let structs_int32s = Int32Array::from(vec![None, Some(-2), None]); let structs_utf8s = StringArray::from(vec![None, None, Some("aaaaaa")]); - let structs = StructArray::from(vec![ - ( - Field::new("int32s", DataType::Int32, true), - Arc::new(structs_int32s) as ArrayRef, - ), - ( - Field::new("utf8s", DataType::Utf8, true), - Arc::new(structs_utf8s) as ArrayRef, - ), + let struct_data_type = DataType::Struct(vec![ + Field::new("int32s", DataType::Int32, true), + Field::new("utf8s", DataType::Utf8, true), ]); + let struct_data = ArrayData::builder(struct_data_type) + .len(3) + .add_child_data(structs_int32s.data().clone()) + .add_child_data(structs_utf8s.data().clone()) + .null_bit_buffer(Some(Buffer::from([0b00000011]))) + .build() + .unwrap(); + let structs = StructArray::from(struct_data); let record_batch = RecordBatch::try_new( Arc::new(schema.clone()), @@ -1005,6 +1338,6 @@ mod tests { // test schemas assert!(arrow_json.schema.equals_schema(&schema)); // test record batch - assert!(arrow_json.batches[0].equals_batch(&record_batch)); + assert_eq!(arrow_json.get_record_batches().unwrap()[0], record_batch); } } diff --git a/arrow/src/util/mod.rs b/arrow/src/util/mod.rs index 86253da8d777..1ee05d8a02c7 100644 --- a/arrow/src/util/mod.rs +++ b/arrow/src/util/mod.rs @@ -24,13 +24,13 @@ pub mod bit_util; #[cfg(feature = "test_utils")] pub mod data_gen; pub mod display; -#[cfg(feature = "test_utils")] +#[cfg(any(test, feature = "test_utils"))] pub mod integration_util; #[cfg(feature = "prettyprint")] pub mod pretty; pub(crate) mod serialization; pub mod string_writer; -#[cfg(feature = "test_utils")] +#[cfg(any(test, feature = "test_utils"))] pub mod test_util; mod trusted_len; diff --git a/arrow/src/util/pretty.rs b/arrow/src/util/pretty.rs index e92b0366ae1e..6f4d9e34a99b 100644 --- a/arrow/src/util/pretty.rs +++ b/arrow/src/util/pretty.rs @@ -19,9 +19,8 @@ //! available unless `feature = "prettyprint"` is enabled. 
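+//!
+//! A minimal usage sketch (editorial example; assumes an existing
+//! `RecordBatch` named `batch` and the `prettyprint` feature enabled):
+//!
+//! ```ignore
+//! use arrow::util::pretty::pretty_format_batches;
+//! println!("{}", pretty_format_batches(&[batch])?);
+//! ```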
 use crate::{array::ArrayRef, record_batch::RecordBatch};
-use std::fmt::Display;
-
 use comfy_table::{Cell, Table};
+use std::fmt::Display;

 use crate::error::Result;
diff --git a/arrow/tests/schema.rs b/arrow/tests/schema.rs
new file mode 100644
index 000000000000..ff544b68937b
--- /dev/null
+++ b/arrow/tests/schema.rs
@@ -0,0 +1,46 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::datatypes::{DataType, Field, Schema};
+use std::collections::HashMap;
+/// The tests in this file ensure a `Schema` can be manipulated
+/// outside of the arrow crate
+
+#[test]
+fn schema_destructure() {
+    let meta = [("foo".to_string(), "baz".to_string())]
+        .into_iter()
+        .collect::<HashMap<String, String>>();
+
+    let field = Field::new("c1", DataType::Utf8, false);
+    let schema = Schema::new(vec![field]).with_metadata(meta);
+
+    // Destructuring a Schema allows rewriting fields and metadata
+    // without copying
+    //
+    // Model this use case below:
+
+    let Schema {
+        mut fields,
+        metadata,
+    } = schema;
+    fields.push(Field::new("c2", DataType::Utf8, false));
+
+    let new_schema = Schema::new(fields).with_metadata(metadata);
+
+    assert_eq!(new_schema.fields().len(), 2);
+}
diff --git a/dev/release/README.md b/dev/release/README.md
index 592d4c39fab9..4ffa85d2abaa 100644
--- a/dev/release/README.md
+++ b/dev/release/README.md
@@ -21,10 +21,23 @@

 ## Overview

-We try to release a new version of Arrow every two weeks. This cadence balances getting new features into arrow without overwhelming downstream projects with too frequent changes.
+This file documents the release process for:
+
+1. The "Rust Arrow Crates": `arrow`, `arrow-flight`, `parquet`, and `parquet-derive`.
+2. The `object_store` crate.
+
+### The Rust Arrow Crates
+
+The Rust Arrow Crates are interconnected (e.g. `parquet` has an optional dependency on `arrow`) so we increment and release all of them together. We try to release a new version of "Rust Arrow Crates" every two weeks. This cadence balances getting new features into the community without overwhelming downstream projects with too frequent changes or overly burdening maintainers.

 If any code has been merged to master that has a breaking API change, as defined in [Rust RFC 1105](https://github.com/rust-lang/rfcs/blob/master/text/1105-api-evolution.md), the major version number is incremented (e.g. `9.0.2` to `10.0.0`). Otherwise the minor version is incremented (e.g. `9.0.2` to `9.1.0`).

+### `object_store` crate
+
+At the time of writing, we release a new version of `object_store` on demand rather than on a regular schedule.
+
+As we are still in an early phase, we use the 0.x version scheme.
If any code has been merged to master that has a breaking API change, as defined in [Rust RFC 1105](https://github.com/rust-lang/rfcs/blob/master/text/1105-api-evolution.md), the minor version number is incremented (e.g. `0.3.0` to `0.4.0`). Otherwise the patch version is incremented (e.g. `0.3.0` to `0.3.1`).
+
 # Release Mechanics

 ## Process Overview
@@ -47,13 +60,17 @@ labels associated with them.

 Now prepare a PR to update `CHANGELOG.md` and versions on `master` to reflect the planned release.

-See [#1141](https://github.com/apache/arrow-rs/pull/1141) for an example.
+For the Rust Arrow crates, do this in the root of this repository. For example, see [#2323](https://github.com/apache/arrow-rs/pull/2323).
+
+For `object_store` the same process is done in the `object_store` directory. Examples TBD

 ```bash
 git checkout master
 git pull
 git checkout -b make-release

+# Move the content of CHANGELOG.md to CHANGELOG-old.md
+
 # manually edit ./dev/release/update_change_log.sh to reflect the release version
 # create the changelog
 CHANGELOG_GITHUB_TOKEN=<TOKEN> ./dev/release/update_change_log.sh
@@ -61,7 +78,7 @@
 git commit -a -m 'Create changelog'

 # update versions
-sed -i '' -e 's/14.0.0/19.0.0/g' `find . -name 'Cargo.toml' -or -name '*.md' | grep -v CHANGELOG.md`
+sed -i '' -e 's/14.0.0/20.0.0/g' `find . -name 'Cargo.toml' -or -name '*.md' | grep -v CHANGELOG.md`
 git commit -a -m 'Update version'
 ```

@@ -82,7 +99,11 @@
 distribution servers.

 While the official release artifact is a signed tarball, we also tag the commit it was created from, for convenience and code archaeology.

-Using a string such as `4.0.1` as the `<version>`, create and push the tag thusly:
+For a Rust Arrow Crates release, use a string such as `4.0.1` as the `<version>`.
+
+For `object_store` releases, use a string such as `object_store_0.4.0` as the `<version>`.
+
+Create and push the tag thusly:

 ```shell
 git fetch apache
@@ -97,12 +118,20 @@
 Pick numbers in sequential order, with `1` for `rc1`, `2` for `rc2`, etc.

 ### Create, sign, and upload tarball

-Run `create-tarball.sh` with the `<version>` tag and `<rc>` you found in previous steps:
+Run `create-tarball.sh` with the `<version>` tag and `<rc>` you found in previous steps.
+
+Rust Arrow Crates:

 ```shell
 ./dev/release/create-tarball.sh 4.1.0 2
 ```

+`object_store`:
+
+```shell
+./object_store/dev/release/create-tarball.sh 4.1.0 2
+```
+
 The `create-tarball.sh` script

 1. creates and uploads a release candidate tarball to the [arrow
@@ -114,7 +143,7 @@ The `create-tarball.sh` script

 ### Vote on Release Candidate tarball

-Send the email output from the script to dev@arrow.apache.org. The email should look like
+Send an email, based on the output from the script, to dev@arrow.apache.org. The email should look like

 ```
 To: dev@arrow.apache.org
@@ -144,11 +173,11 @@ The vote will be open for at least 72 hours.
 [3]: https://github.com/apache/arrow-rs/blob/a5dd428f57e62db20a945e8b1895de91405958c4/CHANGELOG.md
 ```

-For the release to become "official" it needs at least three PMC members to vote +1 on it.
+For the release to become "official" it needs at least three Apache Arrow PMC members to vote +1 on it.

 ## Verifying release candidates

-The `dev/release/verify-release-candidate.sh` is a script in this repository that can assist in the verification process. Run it like:
+The `dev/release/verify-release-candidate.sh` or `object_store/dev/release/verify-release-candidate.sh` are scripts in this repository that can assist in the verification process.
Run one of them like:

```
./dev/release/verify-release-candidate.sh 4.1.0 2
```

@@ -162,10 +191,18 @@ If the release is not approved, fix whatever the problem is and try again with t
 Move tarball to the release location in SVN, e.g. https://dist.apache.org/repos/dist/release/arrow/arrow-4.1.0/, using the `release-tarball.sh` script:

+Rust Arrow Crates:
+
 ```shell
 ./dev/release/release-tarball.sh 4.1.0 2
 ```

+`object_store`:
+
+```shell
+./object_store/dev/release/release-tarball.sh 4.1.0 2
+```
+
 Congratulations! The release is now official!

 ### Publish on Crates.io
@@ -188,9 +225,17 @@
 Verify that the Cargo.toml in the tarball contains the correct version
 (e.g. `version = "0.11.0"`) and then publish the crate with the following commands

+Rust Arrow Crates:
+
 ```shell
 (cd arrow && cargo publish)
 (cd arrow-flight && cargo publish)
 (cd parquet && cargo publish)
 (cd parquet_derive && cargo publish)
 ```
+
+`object_store`:
+
+```shell
+cargo publish
+```
diff --git a/dev/release/update_change_log.sh b/dev/release/update_change_log.sh
index dc3d9e4e4a2d..b2ca561e073d 100755
--- a/dev/release/update_change_log.sh
+++ b/dev/release/update_change_log.sh
@@ -29,8 +29,8 @@

 set -e

-SINCE_TAG="18.0.0"
-FUTURE_RELEASE="19.0.0"
+SINCE_TAG="19.0.0"
+FUTURE_RELEASE="20.0.0"

 SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)"
@@ -40,6 +40,8 @@ OUTPUT_PATH="${SOURCE_TOP_DIR}/CHANGELOG.md"
 # remove license header so github-changelog-generator has a clean base to append
 sed -i.bak '1,18d' "${OUTPUT_PATH}"

+# use exclude-tags-regex to filter out tags used for object_store
+# crates and only look at tags that DO NOT begin with `object_store_`
 pushd "${SOURCE_TOP_DIR}"
 docker run -it --rm -e CHANGELOG_GITHUB_TOKEN="$CHANGELOG_GITHUB_TOKEN" -v "$(pwd)":/usr/local/src/your-app githubchangeloggenerator/github-changelog-generator \
     --user apache \
@@ -48,6 +50,7 @@ docker run -it --rm -e CHANGELOG_GITHUB_TOKEN="$CHANGELOG_GITHUB_TOKEN" -v "$(pw
     --cache-log=.githubchangeloggenerator.cache.log \
     --http-cache \
     --max-issues=300 \
+    --exclude-tags-regex "^object_store_\d+\.\d+\.\d+$" \
     --since-tag ${SINCE_TAG} \
     --future-release ${FUTURE_RELEASE}

diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh
index a5ed04c6f8b8..b60465b9732c 100755
--- a/dev/release/verify-release-candidate.sh
+++ b/dev/release/verify-release-candidate.sh
@@ -72,24 +72,6 @@ fetch_archive() {
   ${sha512_verify} ${dist_name}.tar.gz.sha512
 }

-verify_dir_artifact_signatures() {
-  # verify the signature and the checksums of each artifact
-  find $1 -name '*.asc' | while read sigfile; do
-    artifact=${sigfile/.asc/}
-    gpg --verify $sigfile $artifact || exit 1
-
-    # go into the directory because the checksum files contain only the
-    # basename of the artifact
-    pushd $(dirname $artifact)
-    base_artifact=$(basename $artifact)
-    if [ -f $base_artifact.sha256 ]; then
-      ${sha256_verify} $base_artifact.sha256 || exit 1
-    fi
-    ${sha512_verify} $base_artifact.sha512 || exit 1
-    popd
-  done
-}
-
 setup_tempdir() {
   cleanup() {
     if [ "${TEST_SUCCESS}" = "yes" ]; then
diff --git a/integration-testing/Cargo.toml b/integration-testing/Cargo.toml
index 5d98dc9eb963..12892badcd27 100644
--- a/integration-testing/Cargo.toml
+++ b/integration-testing/Cargo.toml
@@ -18,7 +18,7 @@
 [package]
 name = "arrow-integration-testing"
 description = "Binaries used in the Arrow integration tests"
-version = "19.0.0"
+version = "20.0.0"
 homepage = "https://github.com/apache/arrow-rs"
 repository
= "https://github.com/apache/arrow-rs" authors = ["Apache Arrow "] @@ -37,11 +37,11 @@ async-trait = { version = "0.1.41", default-features = false } clap = { version = "3", default-features = false, features = ["std", "derive"] } futures = { version = "0.3", default-features = false } hex = { version = "0.4", default-features = false } -prost = { version = "0.10", default-features = false } +prost = { version = "0.11", default-features = false } serde = { version = "1.0", default-features = false, features = ["rc"] } serde_derive = { version = "1.0", default-features = false } serde_json = { version = "1.0", default-features = false, features = ["std"] } tokio = { version = "1.0", default-features = false } -tonic = { version = "0.7", default-features = false } +tonic = { version = "0.8", default-features = false } tracing-subscriber = { version = "0.3.1", default-features = false, features = ["fmt"], optional = true } num = { version = "0.4", default-features = false, features = ["std"] } diff --git a/integration-testing/src/bin/arrow-json-integration-test.rs b/integration-testing/src/bin/arrow-json-integration-test.rs index 69b73b19f222..b442e8b5ed30 100644 --- a/integration-testing/src/bin/arrow-json-integration-test.rs +++ b/integration-testing/src/bin/arrow-json-integration-test.rs @@ -91,7 +91,10 @@ fn arrow_to_json(arrow_name: &str, json_name: &str, verbose: bool) -> Result<()> for f in reader.schema().fields() { fields.push(ArrowJsonField::from(f)); } - let schema = ArrowJsonSchema { fields }; + let schema = ArrowJsonSchema { + fields, + metadata: None, + }; let batches = reader .map(|batch| Ok(ArrowJsonBatch::from_batch(&batch?))) diff --git a/integration-testing/src/lib.rs b/integration-testing/src/lib.rs index 781416e67521..5d3da15d3f50 100644 --- a/integration-testing/src/lib.rs +++ b/integration-testing/src/lib.rs @@ -17,29 +17,17 @@ //! Common code used in the integration test binaries -use hex::decode; use serde_json::Value; use arrow::util::integration_util::ArrowJsonBatch; -use arrow::array::*; -use arrow::datatypes::{DataType, Field, IntervalUnit, Schema}; -use arrow::error::{ArrowError, Result}; +use arrow::datatypes::Schema; +use arrow::error::Result; use arrow::record_batch::RecordBatch; -use arrow::{ - buffer::Buffer, - buffer::MutableBuffer, - datatypes::ToByteSlice, - util::{bit_util, integration_util::*}, -}; - -use arrow::util::decimal::{BasicDecimal, Decimal256}; -use num::bigint::BigInt; -use num::Signed; +use arrow::util::integration_util::*; use std::collections::HashMap; use std::fs::File; use std::io::BufReader; -use std::sync::Arc; /// The expected username for the basic auth integration test. 
pub const AUTH_USERNAME: &str = "arrow"; @@ -88,713 +76,3 @@ pub fn read_json_file(json_name: &str) -> Result { batches, }) } - -fn record_batch_from_json( - schema: &Schema, - json_batch: ArrowJsonBatch, - json_dictionaries: Option<&HashMap>, -) -> Result { - let mut columns = vec![]; - - for (field, json_col) in schema.fields().iter().zip(json_batch.columns) { - let col = array_from_json(field, json_col, json_dictionaries)?; - columns.push(col); - } - - RecordBatch::try_new(Arc::new(schema.clone()), columns) -} - -/// Construct an Arrow array from a partially typed JSON column -fn array_from_json( - field: &Field, - json_col: ArrowJsonColumn, - dictionaries: Option<&HashMap>, -) -> Result { - match field.data_type() { - DataType::Null => Ok(Arc::new(NullArray::new(json_col.count))), - DataType::Boolean => { - let mut b = BooleanBuilder::new(json_col.count); - for (is_valid, value) in json_col - .validity - .as_ref() - .unwrap() - .iter() - .zip(json_col.data.unwrap()) - { - match is_valid { - 1 => b.append_value(value.as_bool().unwrap()), - _ => b.append_null(), - }; - } - Ok(Arc::new(b.finish())) - } - DataType::Int8 => { - let mut b = Int8Builder::new(json_col.count); - for (is_valid, value) in json_col - .validity - .as_ref() - .unwrap() - .iter() - .zip(json_col.data.unwrap()) - { - match is_valid { - 1 => b.append_value(value.as_i64().ok_or_else(|| { - ArrowError::JsonError(format!( - "Unable to get {:?} as int64", - value - )) - })? as i8), - _ => b.append_null(), - }; - } - Ok(Arc::new(b.finish())) - } - DataType::Int16 => { - let mut b = Int16Builder::new(json_col.count); - for (is_valid, value) in json_col - .validity - .as_ref() - .unwrap() - .iter() - .zip(json_col.data.unwrap()) - { - match is_valid { - 1 => b.append_value(value.as_i64().unwrap() as i16), - _ => b.append_null(), - }; - } - Ok(Arc::new(b.finish())) - } - DataType::Int32 - | DataType::Date32 - | DataType::Time32(_) - | DataType::Interval(IntervalUnit::YearMonth) => { - let mut b = Int32Builder::new(json_col.count); - for (is_valid, value) in json_col - .validity - .as_ref() - .unwrap() - .iter() - .zip(json_col.data.unwrap()) - { - match is_valid { - 1 => b.append_value(value.as_i64().unwrap() as i32), - _ => b.append_null(), - }; - } - let array = Arc::new(b.finish()) as ArrayRef; - arrow::compute::cast(&array, field.data_type()) - } - DataType::Int64 - | DataType::Date64 - | DataType::Time64(_) - | DataType::Timestamp(_, _) - | DataType::Duration(_) - | DataType::Interval(IntervalUnit::DayTime) => { - let mut b = Int64Builder::new(json_col.count); - for (is_valid, value) in json_col - .validity - .as_ref() - .unwrap() - .iter() - .zip(json_col.data.unwrap()) - { - match is_valid { - 1 => b.append_value(match value { - Value::Number(n) => n.as_i64().unwrap(), - Value::String(s) => { - s.parse().expect("Unable to parse string as i64") - } - Value::Object(ref map) - if map.contains_key("days") - && map.contains_key("milliseconds") => - { - match field.data_type() { - DataType::Interval(IntervalUnit::DayTime) => { - let days = map.get("days").unwrap(); - let milliseconds = map.get("milliseconds").unwrap(); - - match (days, milliseconds) { - (Value::Number(d), Value::Number(m)) => { - let mut bytes = [0_u8; 8]; - let m = (m.as_i64().unwrap() as i32) - .to_le_bytes(); - let d = (d.as_i64().unwrap() as i32) - .to_le_bytes(); - - let c = [d, m].concat(); - bytes.copy_from_slice(c.as_slice()); - i64::from_le_bytes(bytes) - } - _ => panic!( - "Unable to parse {:?} as interval daytime", - value - ), - } - } - _ => 
panic!( - "Unable to parse {:?} as interval daytime", - value - ), - } - } - _ => panic!("Unable to parse {:?} as number", value), - }), - _ => b.append_null(), - }; - } - let array = Arc::new(b.finish()) as ArrayRef; - arrow::compute::cast(&array, field.data_type()) - } - DataType::UInt8 => { - let mut b = UInt8Builder::new(json_col.count); - for (is_valid, value) in json_col - .validity - .as_ref() - .unwrap() - .iter() - .zip(json_col.data.unwrap()) - { - match is_valid { - 1 => b.append_value(value.as_u64().unwrap() as u8), - _ => b.append_null(), - }; - } - Ok(Arc::new(b.finish())) - } - DataType::UInt16 => { - let mut b = UInt16Builder::new(json_col.count); - for (is_valid, value) in json_col - .validity - .as_ref() - .unwrap() - .iter() - .zip(json_col.data.unwrap()) - { - match is_valid { - 1 => b.append_value(value.as_u64().unwrap() as u16), - _ => b.append_null(), - }; - } - Ok(Arc::new(b.finish())) - } - DataType::UInt32 => { - let mut b = UInt32Builder::new(json_col.count); - for (is_valid, value) in json_col - .validity - .as_ref() - .unwrap() - .iter() - .zip(json_col.data.unwrap()) - { - match is_valid { - 1 => b.append_value(value.as_u64().unwrap() as u32), - _ => b.append_null(), - }; - } - Ok(Arc::new(b.finish())) - } - DataType::UInt64 => { - let mut b = UInt64Builder::new(json_col.count); - for (is_valid, value) in json_col - .validity - .as_ref() - .unwrap() - .iter() - .zip(json_col.data.unwrap()) - { - match is_valid { - 1 => b.append_value( - value - .as_str() - .unwrap() - .parse() - .expect("Unable to parse string as u64"), - ), - _ => b.append_null(), - }; - } - Ok(Arc::new(b.finish())) - } - DataType::Interval(IntervalUnit::MonthDayNano) => { - let mut b = IntervalMonthDayNanoBuilder::new(json_col.count); - for (is_valid, value) in json_col - .validity - .as_ref() - .unwrap() - .iter() - .zip(json_col.data.unwrap()) - { - match is_valid { - 1 => b.append_value(match value { - Value::Object(v) => { - let months = v.get("months").unwrap(); - let days = v.get("days").unwrap(); - let nanoseconds = v.get("nanoseconds").unwrap(); - match (months, days, nanoseconds) { - ( - Value::Number(months), - Value::Number(days), - Value::Number(nanoseconds), - ) => { - let months = months.as_i64().unwrap() as i32; - let days = days.as_i64().unwrap() as i32; - let nanoseconds = nanoseconds.as_i64().unwrap(); - let months_days_ns: i128 = ((nanoseconds as i128) - & 0xFFFFFFFFFFFFFFFF) - << 64 - | ((days as i128) & 0xFFFFFFFF) << 32 - | ((months as i128) & 0xFFFFFFFF); - months_days_ns - } - (_, _, _) => { - panic!("Unable to parse {:?} as MonthDayNano", v) - } - } - } - _ => panic!("Unable to parse {:?} as MonthDayNano", value), - }), - _ => b.append_null(), - }; - } - Ok(Arc::new(b.finish())) - } - DataType::Float32 => { - let mut b = Float32Builder::new(json_col.count); - for (is_valid, value) in json_col - .validity - .as_ref() - .unwrap() - .iter() - .zip(json_col.data.unwrap()) - { - match is_valid { - 1 => b.append_value(value.as_f64().unwrap() as f32), - _ => b.append_null(), - }; - } - Ok(Arc::new(b.finish())) - } - DataType::Float64 => { - let mut b = Float64Builder::new(json_col.count); - for (is_valid, value) in json_col - .validity - .as_ref() - .unwrap() - .iter() - .zip(json_col.data.unwrap()) - { - match is_valid { - 1 => b.append_value(value.as_f64().unwrap()), - _ => b.append_null(), - }; - } - Ok(Arc::new(b.finish())) - } - DataType::Binary => { - let mut b = BinaryBuilder::new(json_col.count); - for (is_valid, value) in json_col - .validity - .as_ref() - 
.unwrap() - .iter() - .zip(json_col.data.unwrap()) - { - match is_valid { - 1 => { - let v = decode(value.as_str().unwrap()).unwrap(); - b.append_value(&v) - } - _ => b.append_null(), - }; - } - Ok(Arc::new(b.finish())) - } - DataType::LargeBinary => { - let mut b = LargeBinaryBuilder::new(json_col.count); - for (is_valid, value) in json_col - .validity - .as_ref() - .unwrap() - .iter() - .zip(json_col.data.unwrap()) - { - match is_valid { - 1 => { - let v = decode(value.as_str().unwrap()).unwrap(); - b.append_value(&v) - } - _ => b.append_null(), - }; - } - Ok(Arc::new(b.finish())) - } - DataType::Utf8 => { - let mut b = StringBuilder::new(json_col.count); - for (is_valid, value) in json_col - .validity - .as_ref() - .unwrap() - .iter() - .zip(json_col.data.unwrap()) - { - match is_valid { - 1 => b.append_value(value.as_str().unwrap()), - _ => b.append_null(), - }; - } - Ok(Arc::new(b.finish())) - } - DataType::LargeUtf8 => { - let mut b = LargeStringBuilder::new(json_col.count); - for (is_valid, value) in json_col - .validity - .as_ref() - .unwrap() - .iter() - .zip(json_col.data.unwrap()) - { - match is_valid { - 1 => b.append_value(value.as_str().unwrap()), - _ => b.append_null(), - }; - } - Ok(Arc::new(b.finish())) - } - DataType::FixedSizeBinary(len) => { - let mut b = FixedSizeBinaryBuilder::new(json_col.count, *len); - for (is_valid, value) in json_col - .validity - .as_ref() - .unwrap() - .iter() - .zip(json_col.data.unwrap()) - { - match is_valid { - 1 => { - let v = hex::decode(value.as_str().unwrap()).unwrap(); - b.append_value(&v)? - } - _ => b.append_null(), - }; - } - Ok(Arc::new(b.finish())) - } - DataType::List(child_field) => { - let null_buf = create_null_buf(&json_col); - let children = json_col.children.clone().unwrap(); - let child_array = array_from_json( - child_field, - children.get(0).unwrap().clone(), - dictionaries, - )?; - let offsets: Vec = json_col - .offset - .unwrap() - .iter() - .map(|v| v.as_i64().unwrap() as i32) - .collect(); - let list_data = ArrayData::builder(field.data_type().clone()) - .len(json_col.count) - .offset(0) - .add_buffer(Buffer::from(&offsets.to_byte_slice())) - .add_child_data(child_array.into_data()) - .null_bit_buffer(Some(null_buf)) - .build() - .unwrap(); - Ok(Arc::new(ListArray::from(list_data))) - } - DataType::LargeList(child_field) => { - let null_buf = create_null_buf(&json_col); - let children = json_col.children.clone().unwrap(); - let child_array = array_from_json( - child_field, - children.get(0).unwrap().clone(), - dictionaries, - )?; - let offsets: Vec = json_col - .offset - .unwrap() - .iter() - .map(|v| match v { - Value::Number(n) => n.as_i64().unwrap(), - Value::String(s) => s.parse::().unwrap(), - _ => panic!("64-bit offset must be either string or number"), - }) - .collect(); - let list_data = ArrayData::builder(field.data_type().clone()) - .len(json_col.count) - .offset(0) - .add_buffer(Buffer::from(&offsets.to_byte_slice())) - .add_child_data(child_array.into_data()) - .null_bit_buffer(Some(null_buf)) - .build() - .unwrap(); - Ok(Arc::new(LargeListArray::from(list_data))) - } - DataType::FixedSizeList(child_field, _) => { - let children = json_col.children.clone().unwrap(); - let child_array = array_from_json( - child_field, - children.get(0).unwrap().clone(), - dictionaries, - )?; - let null_buf = create_null_buf(&json_col); - let list_data = ArrayData::builder(field.data_type().clone()) - .len(json_col.count) - .add_child_data(child_array.into_data()) - .null_bit_buffer(Some(null_buf)) - .build() - .unwrap(); - 
Ok(Arc::new(FixedSizeListArray::from(list_data))) - } - DataType::Struct(fields) => { - // construct struct with null data - let null_buf = create_null_buf(&json_col); - let mut array_data = ArrayData::builder(field.data_type().clone()) - .len(json_col.count) - .null_bit_buffer(Some(null_buf)); - - for (field, col) in fields.iter().zip(json_col.children.unwrap()) { - let array = array_from_json(field, col, dictionaries)?; - array_data = array_data.add_child_data(array.into_data()); - } - - let array = StructArray::from(array_data.build().unwrap()); - Ok(Arc::new(array)) - } - DataType::Dictionary(key_type, value_type) => { - let dict_id = field.dict_id().ok_or_else(|| { - ArrowError::JsonError(format!( - "Unable to find dict_id for field {:?}", - field - )) - })?; - // find dictionary - let dictionary = dictionaries - .ok_or_else(|| { - ArrowError::JsonError(format!( - "Unable to find any dictionaries for field {:?}", - field - )) - })? - .get(&dict_id); - match dictionary { - Some(dictionary) => dictionary_array_from_json( - field, - json_col, - key_type, - value_type, - dictionary, - dictionaries, - ), - None => Err(ArrowError::JsonError(format!( - "Unable to find dictionary for field {:?}", - field - ))), - } - } - DataType::Decimal(precision, scale) => { - let mut b = Decimal128Builder::new(json_col.count, *precision, *scale); - // C++ interop tests involve incompatible decimal values - unsafe { - b.disable_value_validation(); - } - for (is_valid, value) in json_col - .validity - .as_ref() - .unwrap() - .iter() - .zip(json_col.data.unwrap()) - { - match is_valid { - 1 => { - b.append_value(value.as_str().unwrap().parse::().unwrap())? - } - _ => b.append_null(), - }; - } - Ok(Arc::new(b.finish())) - } - DataType::Decimal256(precision, scale) => { - let mut b = Decimal256Builder::new(json_col.count, *precision, *scale); - for (is_valid, value) in json_col - .validity - .as_ref() - .unwrap() - .iter() - .zip(json_col.data.unwrap()) - { - match is_valid { - 1 => { - let str = value.as_str().unwrap(); - let integer = BigInt::parse_bytes(str.as_bytes(), 10).unwrap(); - let integer_bytes = integer.to_signed_bytes_le(); - let mut bytes = if integer.is_positive() { - [0_u8; 32] - } else { - [255_u8; 32] - }; - bytes[0..integer_bytes.len()] - .copy_from_slice(integer_bytes.as_slice()); - let decimal = - Decimal256::try_new_from_bytes(*precision, *scale, &bytes) - .unwrap(); - b.append_value(&decimal)?; - } - _ => b.append_null(), - } - } - Ok(Arc::new(b.finish())) - } - DataType::Map(child_field, _) => { - let null_buf = create_null_buf(&json_col); - let children = json_col.children.clone().unwrap(); - let child_array = array_from_json( - child_field, - children.get(0).unwrap().clone(), - dictionaries, - )?; - let offsets: Vec = json_col - .offset - .unwrap() - .iter() - .map(|v| v.as_i64().unwrap() as i32) - .collect(); - let array_data = ArrayData::builder(field.data_type().clone()) - .len(json_col.count) - .add_buffer(Buffer::from(&offsets.to_byte_slice())) - .add_child_data(child_array.into_data()) - .null_bit_buffer(Some(null_buf)) - .build() - .unwrap(); - - let array = MapArray::from(array_data); - Ok(Arc::new(array)) - } - DataType::Union(fields, field_type_ids, _) => { - let type_ids = if let Some(type_id) = json_col.type_id { - type_id - } else { - return Err(ArrowError::JsonError( - "Cannot find expected type_id in json column".to_string(), - )); - }; - - let offset: Option = json_col.offset.map(|offsets| { - let offsets: Vec = - offsets.iter().map(|v| v.as_i64().unwrap() as 
i32).collect(); - Buffer::from(&offsets.to_byte_slice()) - }); - - let mut children: Vec<(Field, Arc)> = vec![]; - for (field, col) in fields.iter().zip(json_col.children.unwrap()) { - let array = array_from_json(field, col, dictionaries)?; - children.push((field.clone(), array)); - } - - let array = UnionArray::try_new( - field_type_ids, - Buffer::from(&type_ids.to_byte_slice()), - offset, - children, - ) - .unwrap(); - Ok(Arc::new(array)) - } - t => Err(ArrowError::JsonError(format!( - "data type {:?} not supported", - t - ))), - } -} - -fn dictionary_array_from_json( - field: &Field, - json_col: ArrowJsonColumn, - dict_key: &DataType, - dict_value: &DataType, - dictionary: &ArrowJsonDictionaryBatch, - dictionaries: Option<&HashMap>, -) -> Result { - match dict_key { - DataType::Int8 - | DataType::Int16 - | DataType::Int32 - | DataType::Int64 - | DataType::UInt8 - | DataType::UInt16 - | DataType::UInt32 - | DataType::UInt64 => { - let null_buf = create_null_buf(&json_col); - - // build the key data into a buffer, then construct values separately - let key_field = Field::new_dict( - "key", - dict_key.clone(), - field.is_nullable(), - field - .dict_id() - .expect("Dictionary fields must have a dict_id value"), - field - .dict_is_ordered() - .expect("Dictionary fields must have a dict_is_ordered value"), - ); - let keys = array_from_json(&key_field, json_col, None)?; - // note: not enough info on nullability of dictionary - let value_field = Field::new("value", dict_value.clone(), true); - let values = array_from_json( - &value_field, - dictionary.data.columns[0].clone(), - dictionaries, - )?; - - // convert key and value to dictionary data - let dict_data = ArrayData::builder(field.data_type().clone()) - .len(keys.len()) - .add_buffer(keys.data().buffers()[0].clone()) - .null_bit_buffer(Some(null_buf)) - .add_child_data(values.into_data()) - .build() - .unwrap(); - - let array = match dict_key { - DataType::Int8 => { - Arc::new(Int8DictionaryArray::from(dict_data)) as ArrayRef - } - DataType::Int16 => Arc::new(Int16DictionaryArray::from(dict_data)), - DataType::Int32 => Arc::new(Int32DictionaryArray::from(dict_data)), - DataType::Int64 => Arc::new(Int64DictionaryArray::from(dict_data)), - DataType::UInt8 => Arc::new(UInt8DictionaryArray::from(dict_data)), - DataType::UInt16 => Arc::new(UInt16DictionaryArray::from(dict_data)), - DataType::UInt32 => Arc::new(UInt32DictionaryArray::from(dict_data)), - DataType::UInt64 => Arc::new(UInt64DictionaryArray::from(dict_data)), - _ => unreachable!(), - }; - Ok(array) - } - _ => Err(ArrowError::JsonError(format!( - "Dictionary key type {:?} not supported", - dict_key - ))), - } -} - -/// A helper to create a null buffer from a Vec -fn create_null_buf(json_col: &ArrowJsonColumn) -> Buffer { - let num_bytes = bit_util::ceil(json_col.count, 8); - let mut null_buf = MutableBuffer::new(num_bytes).with_bitset(num_bytes, false); - json_col - .validity - .clone() - .unwrap() - .iter() - .enumerate() - .for_each(|(i, v)| { - let null_slice = null_buf.as_slice_mut(); - if *v != 0 { - bit_util::set_bit(null_slice, i); - } - }); - null_buf.into() -} diff --git a/object_store/.circleci/config.yml b/object_store/.circleci/config.yml deleted file mode 100644 index b4dff6d53acc..000000000000 --- a/object_store/.circleci/config.yml +++ /dev/null @@ -1,262 +0,0 @@ ---- -# CI Overview -# ----------- -# -# Each night: -# -# A build image is created (ci_image) from `docker/Dockerfile.ci` and is -# pushed to `quay.io/influxdb/rust:ci`. 
This build image is then used to run -# the CI tasks for the day. -# -# Every commit: -# -# The CI for every PR and merge to main runs tests, fmt, lints and compiles debug binaries -# -# On main if all these checks pass it will then additionally compile in "release" mode and -# publish a docker image to quay.io/influxdb/iox:$COMMIT_SHA -# -# Manual CI Image: -# -# It is possible to manually trigger a rebuild of the image used in CI. To do this, navigate to -# https://app.circleci.com/pipelines/github/influxdata/influxdb_iox?branch=main (overriding the -# branch name if desired). Then: -# - Click "Run Pipeline" in the top-right -# - Expand "Add Parameters" -# - Add a "boolean" parameter called "ci_image" with the value true -# - Click "Run Pipeline" -# -# If you refresh the page you should see a newly running ci_image workflow -# - -version: 2.1 - -orbs: - win: circleci/windows@4.1 - -commands: - rust_components: - description: Verify installed components - steps: - - run: - name: Verify installed components - command: | - rustup --version - rustup show - cargo fmt --version - cargo clippy --version - - cache_restore: - description: Restore Cargo Cache - steps: - - restore_cache: - name: Restoring Cargo Cache - keys: - - cargo-cache-{{ arch }}-{{ .Branch }}-{{ checksum "Cargo.lock" }} - - cargo-cache-{{ arch }}-{{ .Branch }} - - cargo-cache - cache_save: - description: Save Cargo Cache - steps: - - save_cache: - name: Save Cargo Cache - paths: - - /usr/local/cargo/registry - key: cargo-cache-{{ arch }}-{{ .Branch }}-{{ checksum "Cargo.lock" }} - -jobs: - fmt: - docker: - - image: quay.io/influxdb/rust:ci - environment: - # Disable incremental compilation to avoid overhead. We are not preserving these files anyway. - CARGO_INCREMENTAL: "0" - # Disable full debug symbol generation to speed up CI build - # "1" means line tables only, which is useful for panic tracebacks. - RUSTFLAGS: "-C debuginfo=1" - # https://github.com/rust-lang/cargo/issues/10280 - CARGO_NET_GIT_FETCH_WITH_CLI: "true" - steps: - - checkout - - rust_components - - cache_restore - - run: - name: Rust fmt - command: cargo fmt --all -- --check - - cache_save - lint: - docker: - - image: quay.io/influxdb/rust:ci - environment: - # Disable incremental compilation to avoid overhead. We are not preserving these files anyway. - CARGO_INCREMENTAL: "0" - # Disable full debug symbol generation to speed up CI build - # "1" means line tables only, which is useful for panic tracebacks. - RUSTFLAGS: "-C debuginfo=1" - # https://github.com/rust-lang/cargo/issues/10280 - CARGO_NET_GIT_FETCH_WITH_CLI: "true" - steps: - - checkout - - rust_components - - cache_restore - - run: - name: Clippy - command: cargo clippy --all-targets --all-features --workspace -- -D warnings - - cache_save - cargo_audit: - docker: - - image: quay.io/influxdb/rust:ci - environment: - # Disable incremental compilation to avoid overhead. We are not preserving these files anyway. - CARGO_INCREMENTAL: "0" - # Disable full debug symbol generation to speed up CI build - # "1" means line tables only, which is useful for panic tracebacks. 
- RUSTFLAGS: "-C debuginfo=1" - # https://github.com/rust-lang/cargo/issues/10280 - CARGO_NET_GIT_FETCH_WITH_CLI: "true" - steps: - - checkout - - rust_components - - cache_restore - - run: - name: Install cargo-deny - command: cargo install --force cargo-deny - - run: - name: cargo-deny Checks - command: cargo deny check -s - - cache_save - check: - docker: - - image: quay.io/influxdb/rust:ci - environment: - # Disable incremental compilation to avoid overhead. We are not preserving these files anyway. - CARGO_INCREMENTAL: "0" - # Disable full debug symbol generation to speed up CI build - # "1" means line tables only, which is useful for panic tracebacks. - RUSTFLAGS: "-C debuginfo=1" - # https://github.com/rust-lang/cargo/issues/10280 - CARGO_NET_GIT_FETCH_WITH_CLI: "true" - steps: - - checkout - - rust_components - - cache_restore - - run: - name: Install cargo-hack - command: cargo install cargo-hack - - run: - name: Check all features - command: cargo hack check --feature-powerset --no-dev-deps --workspace - - cache_save - doc: - docker: - - image: quay.io/influxdb/rust:ci - environment: - # Disable incremental compilation to avoid overhead. We are not preserving these files anyway. - CARGO_INCREMENTAL: "0" - # Disable full debug symbol generation to speed up CI build - # "1" means line tables only, which is useful for panic tracebacks. - RUSTFLAGS: "-C debuginfo=1" - # https://github.com/rust-lang/cargo/issues/10280 - CARGO_NET_GIT_FETCH_WITH_CLI: "true" - steps: - - checkout - - rust_components - - cache_restore - - run: - name: Cargo doc - # excluding datafusion because it's effectively a dependency masqueraded as workspace crate. - command: cargo doc --document-private-items --no-deps --workspace --exclude datafusion - - cache_save - - run: - name: Compress Docs - command: tar -cvzf rustdoc.tar.gz target/doc/ - - store_artifacts: - path: rustdoc.tar.gz - test: - # setup multiple docker images (see https://circleci.com/docs/2.0/configuration-reference/#docker) - docker: - - image: quay.io/influxdb/rust:ci - - image: localstack/localstack:0.14.4 - - image: mcr.microsoft.com/azure-storage/azurite - - image: fsouza/fake-gcs-server - command: - - "-scheme" - - "http" - resource_class: 2xlarge # use of a smaller executor tends crashes on link - environment: - # Disable incremental compilation to avoid overhead. We are not preserving these files anyway. - CARGO_INCREMENTAL: "0" - # Disable full debug symbol generation to speed up CI build - # "1" means line tables only, which is useful for panic tracebacks. 
- RUSTFLAGS: "-C debuginfo=1" - # https://github.com/rust-lang/cargo/issues/10280 - CARGO_NET_GIT_FETCH_WITH_CLI: "true" - RUST_BACKTRACE: "1" - # Run integration tests - TEST_INTEGRATION: 1 - AWS_DEFAULT_REGION: "us-east-1" - AWS_ACCESS_KEY_ID: test - AWS_SECRET_ACCESS_KEY: test - AWS_ENDPOINT: http://127.0.0.1:4566 - AZURE_USE_EMULATOR: "1" - GOOGLE_SERVICE_ACCOUNT: "/tmp/gcs.json" - OBJECT_STORE_BUCKET: test-bucket - steps: - - run: - name: Setup localstack (AWS emulation) - command: | - cd /tmp - curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" - unzip awscliv2.zip - sudo ./aws/install - aws --endpoint-url=http://localhost:4566 s3 mb s3://test-bucket - - run: - name: Setup Azurite (Azure emulation) - # the magical connection string is from https://docs.microsoft.com/en-us/azure/storage/common/storage-use-azurite?tabs=visual-studio#http-connection-strings - command: | - curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash - az storage container create -n test-bucket --connection-string 'DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://127.0.0.1:10000/devstoreaccount1;QueueEndpoint=http://127.0.0.1:10001/devstoreaccount1;' - - run: - name: Setup fake GCS server - command: | - curl -X POST --data-binary '{"name":"test-bucket"}' -H "Content-Type: application/json" "http://localhost:4443/storage/v1/b" - echo '{"gcs_base_url": "http://localhost:4443", "disable_oauth": true, "client_email": "", "private_key": ""}' > "$GOOGLE_SERVICE_ACCOUNT" - - checkout - - rust_components - - cache_restore - - run: - name: Cargo test - command: cargo test --workspace --features=aws,azure,azure_test,gcp - - cache_save - - test_windows: - executor: - name: win/default - size: medium - environment: - # https://github.com/rust-lang/cargo/issues/10280 - CARGO_NET_GIT_FETCH_WITH_CLI: "true" - steps: - - checkout - - run: - name: Download rustup - command: wget https://win.rustup.rs/x86_64 -O rustup-init.exe - - run: - name: Install rustup - command: .\rustup-init.exe -y --default-host=x86_64-pc-windows-msvc - - run: - name: Cargo test - command: cargo test --workspace - -workflows: - version: 2 - - # CI for all pull requests. - ci: - jobs: - - check - - fmt - - lint - - cargo_audit - - test - - test_windows - - doc diff --git a/object_store/.github_changelog_generator b/object_store/.github_changelog_generator new file mode 100644 index 000000000000..cbd8aa0c4b48 --- /dev/null +++ b/object_store/.github_changelog_generator @@ -0,0 +1,27 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +# Add special sections for documentation, security and performance +add-sections={"documentation":{"prefix":"**Documentation updates:**","labels":["documentation"]},"security":{"prefix":"**Security updates:**","labels":["security"]},"performance":{"prefix":"**Performance improvements:**","labels":["performance"]}} +# so that the component is shown associated with the issue +issue-line-labels=object-store +# skip non object_store issues +exclude-labels=development-process,invalid,arrow,parquet,arrow-flight +breaking_labels=api-change diff --git a/object_store/CHANGELOG.md b/object_store/CHANGELOG.md new file mode 100644 index 000000000000..93faa678ffa8 --- /dev/null +++ b/object_store/CHANGELOG.md @@ -0,0 +1,70 @@ + + +# Changelog + +## [object_store_0.4.0](https://github.com/apache/arrow-rs/tree/object_store_0.4.0) (2022-08-10) + +[Full Changelog](https://github.com/apache/arrow-rs/compare/object_store_0.3.0...object_store_0.4.0) + +**Implemented enhancements:** + +- Relax Path Validation to Allow Any Percent-Encoded Sequence [\#2355](https://github.com/apache/arrow-rs/issues/2355) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Support get\_multi\_ranges in ObjectStore [\#2293](https://github.com/apache/arrow-rs/issues/2293) +- object\_store: Create explicit test for symlinks [\#2206](https://github.com/apache/arrow-rs/issues/2206) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: Make builder style configuration for object stores [\#2203](https://github.com/apache/arrow-rs/issues/2203) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- object\_store: Add example in the main documentation readme [\#2202](https://github.com/apache/arrow-rs/issues/2202) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Fixed bugs:** + +- Azure/S3 Storage Fails to Copy Blob with URL-encoded Path [\#2353](https://github.com/apache/arrow-rs/issues/2353) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] +- Accessing a file with a percent-encoded name on the filesystem with ObjectStore LocalFileSystem [\#2349](https://github.com/apache/arrow-rs/issues/2349) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] + +**Documentation updates:** + +- Improve `object_store crate` documentation [\#2260](https://github.com/apache/arrow-rs/pull/2260) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) + +**Merged pull requests:** + +- Canonicalize filesystem paths in user-facing APIs \(\#2370\) [\#2371](https://github.com/apache/arrow-rs/pull/2371) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Fix object\_store lint [\#2367](https://github.com/apache/arrow-rs/pull/2367) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Relax path validation \(\#2355\) [\#2356](https://github.com/apache/arrow-rs/pull/2356) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Fix Copy from percent-encoded path \(\#2353\) [\#2354](https://github.com/apache/arrow-rs/pull/2354) 
[[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add ObjectStore::get\_ranges \(\#2293\) [\#2336](https://github.com/apache/arrow-rs/pull/2336) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Remove vestigal ` object_store/.circleci/` [\#2337](https://github.com/apache/arrow-rs/pull/2337) ([alamb](https://github.com/alamb)) +- Handle symlinks in LocalFileSystem \(\#2206\) [\#2269](https://github.com/apache/arrow-rs/pull/2269) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Retry GCP requests on server error [\#2243](https://github.com/apache/arrow-rs/pull/2243) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Add LimitStore \(\#2175\) [\#2242](https://github.com/apache/arrow-rs/pull/2242) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([tustvold](https://github.com/tustvold)) +- Only trigger `arrow` CI on changes to arrow [\#2227](https://github.com/apache/arrow-rs/pull/2227) ([alamb](https://github.com/alamb)) +- Update instructions on how to join the Slack channel [\#2219](https://github.com/apache/arrow-rs/pull/2219) ([HaoYang670](https://github.com/HaoYang670)) +- Add Builder style config objects for object\_store [\#2204](https://github.com/apache/arrow-rs/pull/2204) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) +- Ignore broken symlinks for LocalFileSystem object store [\#2195](https://github.com/apache/arrow-rs/pull/2195) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([jccampagne](https://github.com/jccampagne)) +- Change CI names to match crate names [\#2189](https://github.com/apache/arrow-rs/pull/2189) ([alamb](https://github.com/alamb)) +- Split most arrow specific CI checks into their own workflows \(reduce common CI time to 21 minutes\) [\#2168](https://github.com/apache/arrow-rs/pull/2168) ([alamb](https://github.com/alamb)) +- Remove another attempt to cache target directory in action.yaml [\#2167](https://github.com/apache/arrow-rs/pull/2167) ([alamb](https://github.com/alamb)) +- Run actions on push to master, pull requests [\#2166](https://github.com/apache/arrow-rs/pull/2166) ([alamb](https://github.com/alamb)) +- Break parquet\_derive and arrow\_flight tests into their own workflows [\#2165](https://github.com/apache/arrow-rs/pull/2165) ([alamb](https://github.com/alamb)) +- Only run integration tests when `arrow` changes [\#2152](https://github.com/apache/arrow-rs/pull/2152) ([alamb](https://github.com/alamb)) +- Break out docs CI job to its own github action [\#2151](https://github.com/apache/arrow-rs/pull/2151) ([alamb](https://github.com/alamb)) +- Do not pretend to cache rust build artifacts, speed up CI by ~20% [\#2150](https://github.com/apache/arrow-rs/pull/2150) ([alamb](https://github.com/alamb)) +- Port `object_store` integration tests, use github actions [\#2148](https://github.com/apache/arrow-rs/pull/2148) 
[[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) +- Port Add stream upload \(multi-part upload\) [\#2147](https://github.com/apache/arrow-rs/pull/2147) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([alamb](https://github.com/alamb)) +- Increase upper wait time to reduce flakyness of object store test [\#2142](https://github.com/apache/arrow-rs/pull/2142) [[object-store](https://github.com/apache/arrow-rs/labels/object-store)] ([viirya](https://github.com/viirya)) + +\* *This Changelog was automatically generated by [github_changelog_generator](https://github.com/github-changelog-generator/github-changelog-generator)* diff --git a/object_store/Cargo.toml b/object_store/Cargo.toml index 741539891597..ffb65aaa7ee7 100644 --- a/object_store/Cargo.toml +++ b/object_store/Cargo.toml @@ -17,11 +17,11 @@ [package] name = "object_store" -version = "0.3.0" +version = "0.4.0" edition = "2021" license = "MIT/Apache-2.0" readme = "README.md" -description = "A generic object store interface for uniformly interacting with AWS S3, Google Cloud Storage and Azure Blob Storage" +description = "A generic object store interface for uniformly interacting with AWS S3, Google Cloud Storage, Azure Blob Storage and local files." keywords = [ "object", "storage", @@ -46,8 +46,9 @@ serde = { version = "1.0", default-features = false, features = ["derive"], opti serde_json = { version = "1.0", default-features = false, optional = true } quick-xml = { version = "0.23.0", features = ["serialize"], optional = true } rustls-pemfile = { version = "1.0", default-features = false, optional = true } -ring = { version = "0.16", default-features = false, features = ["std"] } +ring = { version = "0.16", default-features = false, features = ["std"], optional = true } base64 = { version = "0.13", default-features = false, optional = true } +rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } # for rusoto hyper = { version = "0.14", optional = true, default-features = false } # for rusoto @@ -58,11 +59,11 @@ percent-encoding = "2.1" rusoto_core = { version = "0.48.0", optional = true, default-features = false, features = ["rustls"] } rusoto_credential = { version = "0.48.0", optional = true, default-features = false } rusoto_s3 = { version = "0.48.0", optional = true, default-features = false, features = ["rustls"] } -rusoto_sts = { version = "0.48.0", optional = true, default-features = false, features = ["rustls"] } +rusoto_sts = { version = "0.48.0", optional = true, default-features = false, features = ["rustls"] } snafu = "0.7" tokio = { version = "1.18", features = ["sync", "macros", "parking_lot", "rt-multi-thread", "time", "io-util"] } tracing = { version = "0.1" } -reqwest = { version = "0.11", optional = true, default-features = false, features = ["rustls-tls"] } +reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"], optional = true } parking_lot = { version = "0.12" } # Filesystem integration url = "2.2" @@ -71,10 +72,10 @@ walkdir = "2" [features] azure = ["azure_core", "azure_storage_blobs", "azure_storage", "reqwest"] azure_test = ["azure", "azure_core/azurite_workaround", "azure_storage/azurite_workaround", "azure_storage_blobs/azurite_workaround"] -gcp = ["serde", "serde_json", "quick-xml", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "rustls-pemfile", "base64"] +gcp 
= ["serde", "serde_json", "quick-xml", "reqwest", "reqwest/json", "reqwest/stream", "chrono/serde", "rustls-pemfile", "base64", "rand", "ring"] aws = ["rusoto_core", "rusoto_credential", "rusoto_s3", "rusoto_sts", "hyper", "hyper-rustls"] [dev-dependencies] # In alphabetical order dotenv = "0.15.0" tempfile = "3.1.0" -futures-test = "0.3" +futures-test = "0.3" \ No newline at end of file diff --git a/object_store/README.md b/object_store/README.md index 313588b4a73b..fd10414a9285 100644 --- a/object_store/README.md +++ b/object_store/README.md @@ -19,8 +19,21 @@ # Rust Object Store -A crate providing a generic interface to object stores, such as S3, Azure Blob Storage and Google Cloud Storage. +A focused, easy to use, idiomatic, high performance, `async` object +store library interacting with object stores. -Originally developed for [InfluxDB IOx](https://github.com/influxdata/influxdb_iox/) and later split out and donated to Apache Arrow. +Using this crate, the same binary and code can easily run in multiple +clouds and local test environments, via a simple runtime configuration +change. Supported object stores include: + +* [AWS S3](https://aws.amazon.com/s3/) +* [Azure Blob Storage](https://azure.microsoft.com/en-us/services/storage/blobs/) +* [Google Cloud Storage](https://cloud.google.com/storage) +* Local files +* Memory +* Custom implementations + + +Originally developed for [InfluxDB IOx](https://github.com/influxdata/influxdb_iox/) and later split out and donated to [Apache Arrow](https://arrow.apache.org/). See [docs.rs](https://docs.rs/object_store) for usage instructions diff --git a/object_store/dev/release/README.md b/object_store/dev/release/README.md new file mode 100644 index 000000000000..89f6e579b23d --- /dev/null +++ b/object_store/dev/release/README.md @@ -0,0 +1,20 @@ + + +See instructions in [`/dev/release/README.md`](../../../dev/release/README.md) diff --git a/object_store/dev/release/create-tarball.sh b/object_store/dev/release/create-tarball.sh new file mode 100755 index 000000000000..bbffde89b043 --- /dev/null +++ b/object_store/dev/release/create-tarball.sh @@ -0,0 +1,128 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +# This script creates a signed tarball in +# dev/dist/apache-arrow-object-store-rs--.tar.gz and uploads it to +# the "dev" area of the dist.apache.arrow repository and prepares an +# email for sending to the dev@arrow.apache.org list for a formal +# vote. +# +# Note the tags are expected to be `object_sore_` +# +# See release/README.md for full release instructions +# +# Requirements: +# +# 1. gpg setup for signing and have uploaded your public +# signature to https://pgp.mit.edu/ +# +# 2. 
Logged into the apache svn server with the appropriate +# credentials +# +# +# Based in part on 02-source.sh from apache/arrow +# + +set -e + +SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" + +if [ "$#" -ne 2 ]; then + echo "Usage: $0 " + echo "ex. $0 0.4.0 1" + exit +fi + +object_store_version=$1 +rc=$2 + +tag=object_store_${object_store_version} + +release=apache-arrow-object-store-rs-${object_store_version} +distdir=${SOURCE_TOP_DIR}/dev/dist/${release}-rc${rc} +tarname=${release}.tar.gz +tarball=${distdir}/${tarname} +url="https://dist.apache.org/repos/dist/dev/arrow/${release}-rc${rc}" +echo "Attempting to create ${tarball} from tag ${tag}" + +release_hash=$(cd "${SOURCE_TOP_DIR}" && git rev-list --max-count=1 ${tag}) + +if [ -z "$release_hash" ]; then + echo "Cannot continue: unknown git tag: $tag" +fi + +echo "Draft email for dev@arrow.apache.org mailing list" +echo "" +echo "---------------------------------------------------------" +cat < containing the files in git at $release_hash +# the files in the tarball are prefixed with {object_store_version} (e.g. 0.4.0) +mkdir -p ${distdir} +(cd "${SOURCE_TOP_DIR}" && git archive ${release_hash} --prefix ${release}/ | gzip > ${tarball}) + +echo "Running rat license checker on ${tarball}" +${SOURCE_DIR}/../../../dev/release/run-rat.sh ${tarball} + +echo "Signing tarball and creating checksums" +gpg --armor --output ${tarball}.asc --detach-sig ${tarball} +# create checksums with the relative path of the tarball +# so that they can be verified with a command such as +# shasum --check apache-arrow-rs-4.1.0-rc2.tar.gz.sha512 +(cd ${distdir} && shasum -a 256 ${tarname}) > ${tarball}.sha256 +(cd ${distdir} && shasum -a 512 ${tarname}) > ${tarball}.sha512 + +echo "Uploading to apache dist/dev to ${url}" +svn co --depth=empty https://dist.apache.org/repos/dist/dev/arrow ${SOURCE_TOP_DIR}/dev/dist +svn add ${distdir} +svn ci -m "Apache Arrow Rust ${object_store_version} ${rc}" ${distdir} diff --git a/object_store/dev/release/release-tarball.sh b/object_store/dev/release/release-tarball.sh new file mode 100755 index 000000000000..75ff886c6b1e --- /dev/null +++ b/object_store/dev/release/release-tarball.sh @@ -0,0 +1,76 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +# This script copies a tarball from the "dev" area of the +# dist.apache.arrow repository to the "release" area +# +# This script should only be run after the release has been approved +# by the arrow PMC committee.
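In effect, the script whose header begins here promotes the voted-on candidate from the `dev` tree to the `release` tree of the dist repository. A rough manual equivalent of what it automates (a sketch only, for a hypothetical `0.4.0` RC1; not part of this PR):

```shell
svn co https://dist.apache.org/repos/dist/dev/arrow/apache-arrow-object-store-rs-0.4.0-rc1 dev
svn co https://dist.apache.org/repos/dist/release/arrow release
mkdir -p release/arrow-object-store-rs-0.4.0
cp -r dev/* release/arrow-object-store-rs-0.4.0/
svn add release/arrow-object-store-rs-0.4.0
svn ci -m "Apache Arrow Rust Object Store 0.4.0" release
```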
+# +# See release/README.md for full release instructions +# +# Based in part on post-01-upload.sh from apache/arrow + + +set -e +set -u + +SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" + +if [ "$#" -ne 2 ]; then + echo "Usage: $0 " + echo "ex. $0 0.4.0 1" + exit +fi + +version=$1 +rc=$2 + +tmp_dir=tmp-apache-arrow-dist + +echo "Recreate temporary directory: ${tmp_dir}" +rm -rf ${tmp_dir} +mkdir -p ${tmp_dir} + +echo "Clone dev dist repository" +svn \ + co \ + https://dist.apache.org/repos/dist/dev/arrow/apache-arrow-object-store-rs-${version}-rc${rc} \ + ${tmp_dir}/dev + +echo "Clone release dist repository" +svn co https://dist.apache.org/repos/dist/release/arrow ${tmp_dir}/release + +echo "Copy ${version}-rc${rc} to release working copy" +release_version=arrow-object-store-rs-${version} +mkdir -p ${tmp_dir}/release/${release_version} +cp -r ${tmp_dir}/dev/* ${tmp_dir}/release/${release_version}/ +svn add ${tmp_dir}/release/${release_version} + +echo "Commit release" +svn ci -m "Apache Arrow Rust Object Store ${version}" ${tmp_dir}/release + +echo "Clean up" +rm -rf ${tmp_dir} + +echo "Success!" +echo "The release is available here:" +echo " https://dist.apache.org/repos/dist/release/arrow/${release_version}" diff --git a/object_store/dev/release/update_change_log.sh b/object_store/dev/release/update_change_log.sh new file mode 100755 index 000000000000..ebd50df7ffc0 --- /dev/null +++ b/object_store/dev/release/update_change_log.sh @@ -0,0 +1,79 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
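Before a candidate is promoted this way, the signatures and checksums emitted by `create-tarball.sh` above can also be re-checked by hand. A sketch, assuming the hypothetical `0.4.0` RC1 file names from the usage string:

```shell
cd dev/dist/apache-arrow-object-store-rs-0.4.0-rc1
gpg --verify apache-arrow-object-store-rs-0.4.0.tar.gz.asc \
    apache-arrow-object-store-rs-0.4.0.tar.gz
shasum -a 256 --check apache-arrow-object-store-rs-0.4.0.tar.gz.sha256
shasum -a 512 --check apache-arrow-object-store-rs-0.4.0.tar.gz.sha512
```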
+# + +# invokes the changelog generator from +# https://github.com/github-changelog-generator/github-changelog-generator +# +# With the config located in +# arrow-rs/object_store/.github_changelog_generator +# +# Usage: +# CHANGELOG_GITHUB_TOKEN= ./update_change_log.sh + +set -e + +SINCE_TAG="object_store_0.3.0" +FUTURE_RELEASE="object_store_0.4.0" + +SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SOURCE_TOP_DIR="$(cd "${SOURCE_DIR}/../../" && pwd)" + +OUTPUT_PATH="${SOURCE_TOP_DIR}/CHANGELOG.md" + +# remove license header so github-changelog-generator has a clean base to append +sed -i.bak '1,18d' "${OUTPUT_PATH}" + +# use exclude-tags-regex to filter out tags used for arrow +# crates and only look at tags that begin with `object_store_` +pushd "${SOURCE_TOP_DIR}" +docker run -it --rm -e CHANGELOG_GITHUB_TOKEN="$CHANGELOG_GITHUB_TOKEN" -v "$(pwd)":/usr/local/src/your-app githubchangeloggenerator/github-changelog-generator \ + --user apache \ + --project arrow-rs \ + --cache-file=.githubchangeloggenerator.cache \ + --cache-log=.githubchangeloggenerator.cache.log \ + --http-cache \ + --max-issues=300 \ + --exclude-tags-regex "^\d+\.\d+\.\d+$" \ + --since-tag ${SINCE_TAG} \ + --future-release ${FUTURE_RELEASE} + +sed -i.bak "s/\\\n/\n\n/" "${OUTPUT_PATH}" + +# Put license header back on +echo ' +' | cat - "${OUTPUT_PATH}" > "${OUTPUT_PATH}".tmp +mv "${OUTPUT_PATH}".tmp "${OUTPUT_PATH}" diff --git a/object_store/dev/release/verify-release-candidate.sh b/object_store/dev/release/verify-release-candidate.sh new file mode 100755 index 000000000000..06a5d8bcb838 --- /dev/null +++ b/object_store/dev/release/verify-release-candidate.sh @@ -0,0 +1,128 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
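The two `update_change_log.sh` scripts now use complementary `--exclude-tags-regex` filters: the top-level script drops `object_store_X.Y.Z` tags, while the object_store script above drops the bare `X.Y.Z` arrow tags. An illustrative check of the latter filter (hypothetical tag list, using `grep -E`, where the generator's `\d` becomes `[0-9]`):

```shell
printf 'object_store_0.4.0\n20.0.0\n' | grep -Ev '^[0-9]+\.[0-9]+\.[0-9]+$'
# prints only "object_store_0.4.0": the arrow tag is excluded,
# mirroring --exclude-tags-regex "^\d+\.\d+\.\d+$" above
```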
+# + +case $# in + 2) VERSION="$1" + RC_NUMBER="$2" + ;; + *) echo "Usage: $0 X.Y.Z RC_NUMBER" + exit 1 + ;; +esac + +set -e +set -x +set -o pipefail + +SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)" +ARROW_DIR="$(dirname $(dirname ${SOURCE_DIR}))" +ARROW_DIST_URL='https://dist.apache.org/repos/dist/dev/arrow' + +download_dist_file() { + curl \ + --silent \ + --show-error \ + --fail \ + --location \ + --remote-name $ARROW_DIST_URL/$1 +} + +download_rc_file() { + download_dist_file apache-arrow-object-store-rs-${VERSION}-rc${RC_NUMBER}/$1 +} + +import_gpg_keys() { + download_dist_file KEYS + gpg --import KEYS +} + +if type shasum >/dev/null 2>&1; then + sha256_verify="shasum -a 256 -c" + sha512_verify="shasum -a 512 -c" +else + sha256_verify="sha256sum -c" + sha512_verify="sha512sum -c" +fi + +fetch_archive() { + local dist_name=$1 + download_rc_file ${dist_name}.tar.gz + download_rc_file ${dist_name}.tar.gz.asc + download_rc_file ${dist_name}.tar.gz.sha256 + download_rc_file ${dist_name}.tar.gz.sha512 + gpg --verify ${dist_name}.tar.gz.asc ${dist_name}.tar.gz + ${sha256_verify} ${dist_name}.tar.gz.sha256 + ${sha512_verify} ${dist_name}.tar.gz.sha512 +} + +setup_tempdir() { + cleanup() { + if [ "${TEST_SUCCESS}" = "yes" ]; then + rm -fr "${ARROW_TMPDIR}" + else + echo "Failed to verify release candidate. See ${ARROW_TMPDIR} for details." + fi + } + + if [ -z "${ARROW_TMPDIR}" ]; then + # clean up automatically if ARROW_TMPDIR is not defined + ARROW_TMPDIR=$(mktemp -d -t "$1.XXXXX") + trap cleanup EXIT + else + # don't clean up automatically + mkdir -p "${ARROW_TMPDIR}" + fi +} + +test_source_distribution() { + # install rust toolchain in a similar fashion to test-miniconda + export RUSTUP_HOME=$PWD/test-rustup + export CARGO_HOME=$PWD/test-rustup + + curl https://sh.rustup.rs -sSf | sh -s -- -y --no-modify-path + + export PATH=$RUSTUP_HOME/bin:$PATH + source $RUSTUP_HOME/env + + # build and test rust + cargo build + cargo test --all + + # verify that the crate can be published to crates.io + cargo publish --dry-run +} + +TEST_SUCCESS=no + +setup_tempdir "arrow-${VERSION}" +echo "Working in sandbox ${ARROW_TMPDIR}" +cd ${ARROW_TMPDIR} + +dist_name="apache-arrow-object-store-rs-${VERSION}" +import_gpg_keys +fetch_archive ${dist_name} +tar xf ${dist_name}.tar.gz +pushd ${dist_name} +test_source_distribution +popd + +TEST_SUCCESS=yes +echo 'Release candidate looks good!' +exit 0 diff --git a/object_store/src/aws.rs b/object_store/src/aws.rs index 3606a3806f99..bcb294c00373 100644 --- a/object_store/src/aws.rs +++ b/object_store/src/aws.rs @@ -48,6 +48,7 @@ use futures::{ Future, Stream, StreamExt, TryStreamExt, }; use hyper::client::Builder as HyperBuilder; +use percent_encoding::{percent_encode, AsciiSet, NON_ALPHANUMERIC}; use rusoto_core::ByteStream; use rusoto_credential::{InstanceMetadataProvider, StaticProvider}; use rusoto_s3::S3; @@ -62,6 +63,17 @@ use tokio::io::AsyncWrite; use tokio::sync::{OwnedSemaphorePermit, Semaphore}; use tracing::{debug, warn}; +// Do not URI-encode any of the unreserved characters that RFC 3986 defines: +// A-Z, a-z, 0-9, hyphen ( - ), underscore ( _ ), period ( . ), and tilde ( ~ ).
+const STRICT_ENCODE_SET: AsciiSet = NON_ALPHANUMERIC + .remove(b'-') + .remove(b'.') + .remove(b'_') + .remove(b'~'); + +/// This struct is used to maintain the URI path encoding +const STRICT_PATH_ENCODE_SET: AsciiSet = STRICT_ENCODE_SET.remove(b'/'); + /// The maximum number of times a request will be retried in the case of an AWS server error pub const MAX_NUM_RETRIES: u32 = 3; @@ -228,6 +240,14 @@ enum Error { source: rusoto_core::region::ParseRegionError, }, + #[snafu(display( + "Region must be specified for AWS S3. Regions should look like `us-east-2`" + ))] + MissingRegion {}, + + #[snafu(display("Missing bucket name"))] + MissingBucketName {}, + #[snafu(display("Missing aws-access-key"))] MissingAccessKey, @@ -252,7 +272,7 @@ impl From for super::Error { } } -/// Configuration for connecting to [Amazon S3](https://aws.amazon.com/s3/). +/// Interface for [Amazon S3](https://aws.amazon.com/s3/). pub struct AmazonS3 { /// S3 client w/o any connection limit. /// @@ -533,9 +553,15 @@ impl ObjectStore for AmazonS3 { let to = to.as_ref(); let bucket_name = self.bucket_name.clone(); + let copy_source = format!( + "{}/{}", + &bucket_name, + percent_encode(from.as_ref(), &STRICT_PATH_ENCODE_SET) + ); + let request_factory = move || rusoto_s3::CopyObjectRequest { bucket: bucket_name.clone(), - copy_source: format!("{}/{}", &bucket_name, from), + copy_source, key: to.to_string(), ..Default::default() }; @@ -584,99 +610,197 @@ fn convert_object_meta(object: rusoto_s3::Object, bucket: &str) -> Result>, - secret_access_key: Option>, - region: impl Into, - bucket_name: impl Into, - endpoint: Option>, - session_token: Option>, +/// +/// # Example +/// ``` +/// # let REGION = "foo"; +/// # let BUCKET_NAME = "foo"; +/// # let ACCESS_KEY_ID = "foo"; +/// # let SECRET_KEY = "foo"; +/// # use object_store::aws::AmazonS3Builder; +/// let s3 = AmazonS3Builder::new() +/// .with_region(REGION) +/// .with_bucket_name(BUCKET_NAME) +/// .with_access_key_id(ACCESS_KEY_ID) +/// .with_secret_access_key(SECRET_KEY) +/// .build(); +/// ``` +#[derive(Debug)] +pub struct AmazonS3Builder { + access_key_id: Option, + secret_access_key: Option, + region: Option, + bucket_name: Option, + endpoint: Option, + token: Option, max_connections: NonZeroUsize, allow_http: bool, -) -> Result { - let region = region.into(); - let region: rusoto_core::Region = match endpoint { - None => region.parse().context(InvalidRegionSnafu { region })?, - Some(endpoint) => rusoto_core::Region::Custom { - name: region, - endpoint: endpoint.into(), - }, - }; +} - let mut builder = HyperBuilder::default(); - builder.pool_max_idle_per_host(max_connections.get()); - - let connector = if allow_http { - hyper_rustls::HttpsConnectorBuilder::new() - .with_webpki_roots() - .https_or_http() - .enable_http1() - .enable_http2() - .build() - } else { - hyper_rustls::HttpsConnectorBuilder::new() - .with_webpki_roots() - .https_only() - .enable_http1() - .enable_http2() - .build() - }; +impl Default for AmazonS3Builder { + fn default() -> Self { + Self { + access_key_id: None, + secret_access_key: None, + region: None, + bucket_name: None, + endpoint: None, + token: None, + max_connections: NonZeroUsize::new(16).unwrap(), + allow_http: false, + } + } +} - let http_client = rusoto_core::request::HttpClient::from_builder(builder, connector); +impl AmazonS3Builder { + /// Create a new [`AmazonS3Builder`] with default values. 
+ pub fn new() -> Self { + Default::default() + } - - let client = match (access_key_id, secret_access_key, session_token) { - (Some(access_key_id), Some(secret_access_key), Some(session_token)) => { - let credentials_provider = StaticProvider::new( - access_key_id.into(), - secret_access_key.into(), - Some(session_token.into()), - None, - ); - rusoto_s3::S3Client::new_with(http_client, credentials_provider, region) - } - (Some(access_key_id), Some(secret_access_key), None) => { - let credentials_provider = StaticProvider::new_minimal( - access_key_id.into(), - secret_access_key.into(), - ); - rusoto_s3::S3Client::new_with(http_client, credentials_provider, region) - } - (None, Some(_), _) => return Err(Error::MissingAccessKey.into()), - (Some(_), None, _) => return Err(Error::MissingSecretAccessKey.into()), - _ if std::env::var_os("AWS_WEB_IDENTITY_TOKEN_FILE").is_some() => { - rusoto_s3::S3Client::new_with( - http_client, - WebIdentityProvider::from_k8s_env(), - region, - ) - } - _ => rusoto_s3::S3Client::new_with( - http_client, - InstanceMetadataProvider::new(), + /// Set the AWS Access Key (required) + pub fn with_access_key_id(mut self, access_key_id: impl Into) -> Self { + self.access_key_id = Some(access_key_id.into()); + self + } + + /// Set the AWS Secret Access Key (required) + pub fn with_secret_access_key( + mut self, + secret_access_key: impl Into, + ) -> Self { + self.secret_access_key = Some(secret_access_key.into()); + self + } + + /// Set the region (e.g. `us-east-1`) (required) + pub fn with_region(mut self, region: impl Into) -> Self { + self.region = Some(region.into()); + self + } + + /// Set the bucket_name (required) + pub fn with_bucket_name(mut self, bucket_name: impl Into) -> Self { + self.bucket_name = Some(bucket_name.into()); + self + } + + /// Sets the endpoint for communicating with AWS S3. Default value + /// is based on region. + /// + /// For example, this might be set to `"http://localhost:4566"` + /// for testing against a localstack instance. + pub fn with_endpoint(mut self, endpoint: impl Into) -> Self { + self.endpoint = Some(endpoint.into()); + self + } + + /// Set the token to use for requests (passed to underlying provider) + pub fn with_token(mut self, token: impl Into) -> Self { + self.token = Some(token.into()); + self + } + + /// Sets the maximum number of concurrent outstanding + /// connections. Default is `16`. + #[deprecated(note = "use LimitStore instead")] + pub fn with_max_connections(mut self, max_connections: NonZeroUsize) -> Self { + self.max_connections = max_connections; + self + } + + /// Sets what protocol is allowed. If `allow_http` is: + /// * false (default): Only HTTPS is allowed + /// * true: HTTP and HTTPS are allowed + pub fn with_allow_http(mut self, allow_http: bool) -> Self { + self.allow_http = allow_http; + self + } + + /// Create an [`AmazonS3`] instance from the provided values, + /// consuming `self`.
+
+    /// Create a [`AmazonS3`] instance from the provided values,
+    /// consuming `self`.
+    pub fn build(self) -> Result<AmazonS3> {
+        let Self {
+            access_key_id,
+            secret_access_key,
             region,
-            bucket_name,
+            endpoint,
+            token,
+            max_connections,
+            allow_http,
+        } = self;
+
+        let region = region.ok_or(Error::MissingRegion {})?;
+        let bucket_name = bucket_name.ok_or(Error::MissingBucketName {})?;
+
+        let region: rusoto_core::Region = match endpoint {
+            None => region.parse().context(InvalidRegionSnafu { region })?,
+            Some(endpoint) => rusoto_core::Region::Custom {
+                name: region,
+                endpoint,
+            },
+        };
-
+        let mut builder = HyperBuilder::default();
+        builder.pool_max_idle_per_host(max_connections.get());
+
+        let connector = if allow_http {
+            hyper_rustls::HttpsConnectorBuilder::new()
+                .with_webpki_roots()
+                .https_or_http()
+                .enable_http1()
+                .enable_http2()
+                .build()
+        } else {
+            hyper_rustls::HttpsConnectorBuilder::new()
+                .with_webpki_roots()
+                .https_only()
+                .enable_http1()
+                .enable_http2()
+                .build()
+        };
+
+        let http_client =
+            rusoto_core::request::HttpClient::from_builder(builder, connector);
+
+        let client = match (access_key_id, secret_access_key, token) {
+            (Some(access_key_id), Some(secret_access_key), Some(token)) => {
+                let credentials_provider = StaticProvider::new(
+                    access_key_id,
+                    secret_access_key,
+                    Some(token),
+                    None,
+                );
+                rusoto_s3::S3Client::new_with(http_client, credentials_provider, region)
+            }
+            (Some(access_key_id), Some(secret_access_key), None) => {
+                let credentials_provider =
+                    StaticProvider::new_minimal(access_key_id, secret_access_key);
+                rusoto_s3::S3Client::new_with(http_client, credentials_provider, region)
+            }
+            (None, Some(_), _) => return Err(Error::MissingAccessKey.into()),
+            (Some(_), None, _) => return Err(Error::MissingSecretAccessKey.into()),
+            _ if std::env::var_os("AWS_WEB_IDENTITY_TOKEN_FILE").is_some() => {
+                rusoto_s3::S3Client::new_with(
+                    http_client,
+                    WebIdentityProvider::from_k8s_env(),
+                    region,
+                )
+            }
+            _ => rusoto_s3::S3Client::new_with(
+                http_client,
+                InstanceMetadataProvider::new(),
+                region,
+            ),
+        };
 
-/// Create a new [`AmazonS3`] that always errors
-pub fn new_failing_s3() -> Result<AmazonS3> {
-    new_s3(
-        Some("foo"),
-        Some("bar"),
-        "us-east-1",
-        "bucket",
-        None as Option<&str>,
-        None as Option<&str>,
-        NonZeroUsize::new(16).unwrap(),
-        true,
-    )
+        Ok(AmazonS3 {
+            client_unrestricted: client,
+            connection_semaphore: Arc::new(Semaphore::new(max_connections.get())),
+            bucket_name,
+        })
+    }
 }
 
 /// S3 client bundled w/ a semaphore permit.
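Because `build` validates the config up front, a missing required field now fails at construction time rather than on the first request. A sketch, assuming the `MissingRegion` display text above is surfaced through the crate's error wrapper:

```rust
use object_store::aws::AmazonS3Builder;

// Building without a region should fail fast with the new
// `MissingRegion` error rather than failing later on a request.
let res = AmazonS3Builder::new()
    .with_bucket_name("test-bucket")
    .build();
match res {
    Ok(_) => unreachable!("no region was provided"),
    Err(e) => assert!(e.to_string().contains("Region must be specified")),
}
```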
@@ -903,34 +1027,6 @@ where
     }
 }
 
-impl Error {
-    #[cfg(test)]
-    fn s3_error_due_to_credentials(&self) -> bool {
-        use rusoto_core::RusotoError;
-        use Error::*;
-
-        matches!(
-            self,
-            UnableToPutData {
-                source: RusotoError::Credentials(_),
-                bucket: _,
-                path: _,
-            } | UnableToGetData {
-                source: RusotoError::Credentials(_),
-                bucket: _,
-                path: _,
-            } | UnableToDeleteData {
-                source: RusotoError::Credentials(_),
-                bucket: _,
-                path: _,
-            } | UnableToListData {
-                source: RusotoError::Credentials(_),
-                bucket: _,
-            }
-        )
-    }
-}
-
 struct S3MultiPartUpload {
     bucket: String,
     key: String,
@@ -1057,27 +1153,16 @@ mod tests {
             get_nonexistent_object, list_uses_directories_correctly, list_with_delimiter,
             put_get_delete_list, rename_and_copy, stream_get,
         },
-        Error as ObjectStoreError, ObjectStore,
+        Error as ObjectStoreError,
     };
     use bytes::Bytes;
     use std::env;
 
-    type TestError = Box<dyn std::error::Error + Send + Sync + 'static>;
-    type Result<T, E = TestError> = std::result::Result<T, E>;
-
     const NON_EXISTENT_NAME: &str = "nonexistentname";
 
-    #[derive(Debug)]
-    struct AwsConfig {
-        access_key_id: String,
-        secret_access_key: String,
-        region: String,
-        bucket: String,
-        endpoint: Option<String>,
-        token: Option<String>,
-    }
-
-    // Helper macro to skip tests if TEST_INTEGRATION and the AWS environment variables are not set.
+    // Helper macro to skip tests if TEST_INTEGRATION and the AWS
+    // environment variables are not set. Returns a configured
+    // AmazonS3Builder
     macro_rules! maybe_skip_integration {
         () => {{
             dotenv::dotenv().ok();
@@ -1116,68 +1201,58 @@ mod tests {
                 );
                 return;
            } else {
-                AwsConfig {
-                    access_key_id: env::var("AWS_ACCESS_KEY_ID")
-                        .expect("already checked AWS_ACCESS_KEY_ID"),
-                    secret_access_key: env::var("AWS_SECRET_ACCESS_KEY")
-                        .expect("already checked AWS_SECRET_ACCESS_KEY"),
-                    region: env::var("AWS_DEFAULT_REGION")
-                        .expect("already checked AWS_DEFAULT_REGION"),
-                    bucket: env::var("OBJECT_STORE_BUCKET")
-                        .expect("already checked OBJECT_STORE_BUCKET"),
-                    endpoint: env::var("AWS_ENDPOINT").ok(),
-                    token: env::var("AWS_SESSION_TOKEN").ok(),
-                }
-            }
-        }};
-    }
-
-    fn check_credentials<T>(r: Result<T>) -> Result<T> {
-        if let Err(e) = &r {
-            let e = &**e;
-            if let Some(e) = e.downcast_ref::<ObjectStoreError>() {
-                if e.s3_error_due_to_credentials() {
-                    eprintln!(
-                        "Try setting the AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY \
-                        environment variables"
-                    );
-                }
-            }
-        }
+                let config = AmazonS3Builder::new()
+                    .with_access_key_id(
+                        env::var("AWS_ACCESS_KEY_ID")
+                            .expect("already checked AWS_ACCESS_KEY_ID"),
+                    )
+                    .with_secret_access_key(
+                        env::var("AWS_SECRET_ACCESS_KEY")
+                            .expect("already checked AWS_SECRET_ACCESS_KEY"),
+                    )
+                    .with_region(
+                        env::var("AWS_DEFAULT_REGION")
+                            .expect("already checked AWS_DEFAULT_REGION"),
+                    )
+                    .with_bucket_name(
+                        env::var("OBJECT_STORE_BUCKET")
+                            .expect("already checked OBJECT_STORE_BUCKET"),
+                    )
+                    .with_allow_http(true);
+
+                let config = if let Some(endpoint) = env::var("AWS_ENDPOINT").ok() {
+                    config.with_endpoint(endpoint)
+                } else {
+                    config
+                };
 
-        r
-    }
+                let config = if let Some(token) = env::var("AWS_SESSION_TOKEN").ok() {
+                    config.with_token(token)
+                } else {
+                    config
+                };
 
-    fn make_integration(config: AwsConfig) -> AmazonS3 {
-        new_s3(
-            Some(config.access_key_id),
-            Some(config.secret_access_key),
-            config.region,
-            config.bucket,
-            config.endpoint,
-            config.token,
-            NonZeroUsize::new(16).unwrap(),
-            true,
-        )
-        .expect("Valid S3 config")
+                config
+            }
+        }};
     }
 
     #[tokio::test]
     async fn s3_test() {
         let config = maybe_skip_integration!();
-        let integration = make_integration(config);
+        let integration = config.build().unwrap();
-        check_credentials(put_get_delete_list(&integration).await).unwrap();
-        check_credentials(list_uses_directories_correctly(&integration).await).unwrap();
-        check_credentials(list_with_delimiter(&integration).await).unwrap();
-        check_credentials(rename_and_copy(&integration).await).unwrap();
-        check_credentials(stream_get(&integration).await).unwrap();
+        put_get_delete_list(&integration).await;
+        list_uses_directories_correctly(&integration).await;
+        list_with_delimiter(&integration).await;
+        rename_and_copy(&integration).await;
+        stream_get(&integration).await;
     }
 
     #[tokio::test]
     async fn s3_test_get_nonexistent_location() {
         let config = maybe_skip_integration!();
-        let integration = make_integration(config);
+        let integration = config.build().unwrap();
 
         let location = Path::from_iter([NON_EXISTENT_NAME]);
 
@@ -1204,9 +1279,8 @@ mod tests {
 
     #[tokio::test]
     async fn s3_test_get_nonexistent_bucket() {
-        let mut config = maybe_skip_integration!();
-        config.bucket = NON_EXISTENT_NAME.into();
-        let integration = make_integration(config);
+        let config = maybe_skip_integration!().with_bucket_name(NON_EXISTENT_NAME);
+        let integration = config.build().unwrap();
 
         let location = Path::from_iter([NON_EXISTENT_NAME]);
 
@@ -1220,9 +1294,9 @@ mod tests {
 
     #[tokio::test]
     async fn s3_test_put_nonexistent_bucket() {
-        let mut config = maybe_skip_integration!();
-        config.bucket = NON_EXISTENT_NAME.into();
-        let integration = make_integration(config);
+        let config = maybe_skip_integration!().with_bucket_name(NON_EXISTENT_NAME);
+
+        let integration = config.build().unwrap();
 
         let location = Path::from_iter([NON_EXISTENT_NAME]);
         let data = Bytes::from("arbitrary data");
@@ -1244,7 +1318,7 @@ mod tests {
     #[tokio::test]
     async fn s3_test_delete_nonexistent_location() {
         let config = maybe_skip_integration!();
-        let integration = make_integration(config);
+        let integration = config.build().unwrap();
 
         let location = Path::from_iter([NON_EXISTENT_NAME]);
 
@@ -1253,9 +1327,8 @@ mod tests {
 
     #[tokio::test]
     async fn s3_test_delete_nonexistent_bucket() {
-        let mut config = maybe_skip_integration!();
-        config.bucket = NON_EXISTENT_NAME.into();
-        let integration = make_integration(config);
+        let config = maybe_skip_integration!().with_bucket_name(NON_EXISTENT_NAME);
+        let integration = config.build().unwrap();
 
         let location = Path::from_iter([NON_EXISTENT_NAME]);
diff --git a/object_store/src/azure.rs b/object_store/src/azure.rs
index 25f311a9a39d..6a5f537997cd 100644
--- a/object_store/src/azure.rs
+++ b/object_store/src/azure.rs
@@ -185,6 +185,15 @@ enum Error {
         env_value: String,
         source: url::ParseError,
     },
+
+    #[snafu(display("Account must be specified"))]
+    MissingAccount {},
+
+    #[snafu(display("Access key must be specified"))]
+    MissingAccessKey {},
+
+    #[snafu(display("Container name must be specified"))]
+    MissingContainerName {},
 }
 
 impl From<Error> for super::Error {
@@ -200,7 +209,7 @@ impl From<Error> for super::Error {
     }
 }
 
-/// Configuration for connecting to [Microsoft Azure Blob Storage](https://azure.microsoft.com/en-us/services/storage/blobs/).
+/// Interface for [Microsoft Azure Blob Storage](https://azure.microsoft.com/en-us/services/storage/blobs/).
 #[derive(Debug)]
 pub struct MicrosoftAzure {
     container_client: Arc<ContainerClient>,
@@ -461,14 +470,15 @@ impl ObjectStore for MicrosoftAzure {
 impl MicrosoftAzure {
     /// helper function to create a source url for copy function
-    fn get_copy_from_url(&self, from: &Path) -> Result<reqwest::Url> {
-        Ok(reqwest::Url::parse(&format!(
-            "{}/{}/{}",
-            &self.blob_base_url, self.container_name, from
-        ))
-        .context(UnableToParseUrlSnafu {
-            container: &self.container_name,
-        })?)
+    fn get_copy_from_url(&self, from: &Path) -> Result<Url> {
+        let mut url =
+            Url::parse(&format!("{}/{}", &self.blob_base_url, self.container_name))
+                .context(UnableToParseUrlSnafu {
+                    container: &self.container_name,
+                })?;
+
+        url.path_segments_mut().unwrap().extend(from.parts());
+        Ok(url)
     }
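The rewritten `get_copy_from_url` delegates segment escaping to the `url` crate, which the old `format!`-based version never did. A standalone sketch of the difference (account and container names are placeholders):

```rust
use url::Url;

fn main() {
    let mut url =
        Url::parse("https://account.blob.core.windows.net/container").unwrap();
    // `extend` percent-encodes each appended path segment
    url.path_segments_mut()
        .unwrap()
        .extend(["dir", "file name.parquet"]);
    assert_eq!(
        url.as_str(),
        "https://account.blob.core.windows.net/container/dir/file%20name.parquet"
    );
}
```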
 
     async fn list_impl(
@@ -570,73 +580,126 @@ fn url_from_env(env_name: &str, default_url: &str) -> Result<Url> {
     Ok(url)
 }
 
-/// Configure a connection to container with given name on Microsoft Azure
-/// Blob store.
+/// Configure a connection to a Microsoft Azure Blob Storage container using
+/// the specified credentials.
 ///
-/// The credentials `account` and `access_key` must provide access to the
-/// store.
-pub fn new_azure(
-    account: impl Into<String>,
-    access_key: impl Into<String>,
-    container_name: impl Into<String>,
+/// # Example
+/// ```
+/// # let ACCOUNT = "foo";
+/// # let BUCKET_NAME = "foo";
+/// # let ACCESS_KEY = "foo";
+/// # use object_store::azure::MicrosoftAzureBuilder;
+/// let azure = MicrosoftAzureBuilder::new()
+///     .with_account(ACCOUNT)
+///     .with_access_key(ACCESS_KEY)
+///     .with_container_name(BUCKET_NAME)
+///     .build();
+/// ```
+#[derive(Debug, Default)]
+pub struct MicrosoftAzureBuilder {
+    account: Option<String>,
+    access_key: Option<String>,
+    container_name: Option<String>,
     use_emulator: bool,
-) -> Result<MicrosoftAzure> {
-    let account = account.into();
-    let access_key = access_key.into();
-    let http_client: Arc<dyn HttpClient> = Arc::new(reqwest::Client::new());
-
-    let (is_emulator, storage_account_client) = if use_emulator {
-        check_if_emulator_works()?;
-        // Allow overriding defaults. Values taken from
-        // from https://docs.rs/azure_storage/0.2.0/src/azure_storage/core/clients/storage_account_client.rs.html#129-141
-        let http_client = azure_core::new_http_client();
-        let blob_storage_url =
-            url_from_env("AZURITE_BLOB_STORAGE_URL", "http://127.0.0.1:10000")?;
-        let queue_storage_url =
-            url_from_env("AZURITE_QUEUE_STORAGE_URL", "http://127.0.0.1:10001")?;
-        let table_storage_url =
-            url_from_env("AZURITE_TABLE_STORAGE_URL", "http://127.0.0.1:10002")?;
-        let filesystem_url =
-            url_from_env("AZURITE_TABLE_STORAGE_URL", "http://127.0.0.1:10004")?;
-
-        let storage_client = StorageAccountClient::new_emulator(
-            http_client,
-            &blob_storage_url,
-            &table_storage_url,
-            &queue_storage_url,
-            &filesystem_url,
-        );
-
-        (true, storage_client)
-    } else {
-        (
-            false,
-            StorageAccountClient::new_access_key(
-                Arc::clone(&http_client),
-                &account,
-                &access_key,
-            ),
-        )
-    };
+}
 
-    let storage_client = storage_account_client.as_storage_client();
-    let blob_base_url = storage_account_client
-        .blob_storage_url()
-        .as_ref()
-        // make url ending consistent between the emulator and remote storage account
-        .trim_end_matches('/')
-        .to_string();
+impl MicrosoftAzureBuilder {
+    /// Create a new [`MicrosoftAzureBuilder`] with default values.
+    pub fn new() -> Self {
+        Default::default()
+    }
 
-    let container_name = container_name.into();
+    /// Set the Azure Account (required)
+    pub fn with_account(mut self, account: impl Into<String>) -> Self {
+        self.account = Some(account.into());
+        self
+    }
 
-    let container_client = storage_client.as_container_client(&container_name);
+    /// Set the Azure Access Key (required)
+    pub fn with_access_key(mut self, access_key: impl Into<String>) -> Self {
+        self.access_key = Some(access_key.into());
+        self
+    }
 
-    Ok(MicrosoftAzure {
-        container_client,
-        container_name,
-        blob_base_url,
-        is_emulator,
-    })
+    /// Set the Azure Container Name (required)
+    pub fn with_container_name(mut self, container_name: impl Into<String>) -> Self {
+        self.container_name = Some(container_name.into());
+        self
+    }
+
+    /// Set if the Azure emulator should be used (defaults to false)
+    pub fn with_use_emulator(mut self, use_emulator: bool) -> Self {
+        self.use_emulator = use_emulator;
+        self
+    }
+
+    /// Configure a connection to container with given name on Microsoft Azure
+    /// Blob store.
+    pub fn build(self) -> Result<MicrosoftAzure> {
+        let Self {
+            account,
+            access_key,
+            container_name,
+            use_emulator,
+        } = self;
+
+        let account = account.ok_or(Error::MissingAccount {})?;
+        let access_key = access_key.ok_or(Error::MissingAccessKey {})?;
+        let container_name = container_name.ok_or(Error::MissingContainerName {})?;
+
+        let http_client: Arc<dyn HttpClient> = Arc::new(reqwest::Client::new());
+
+        let (is_emulator, storage_account_client) = if use_emulator {
+            check_if_emulator_works()?;
+            // Allow overriding defaults. Values taken from
+            // from https://docs.rs/azure_storage/0.2.0/src/azure_storage/core/clients/storage_account_client.rs.html#129-141
+            let http_client = azure_core::new_http_client();
+            let blob_storage_url =
+                url_from_env("AZURITE_BLOB_STORAGE_URL", "http://127.0.0.1:10000")?;
+            let queue_storage_url =
+                url_from_env("AZURITE_QUEUE_STORAGE_URL", "http://127.0.0.1:10001")?;
+            let table_storage_url =
+                url_from_env("AZURITE_TABLE_STORAGE_URL", "http://127.0.0.1:10002")?;
+            let filesystem_url =
+                url_from_env("AZURITE_TABLE_STORAGE_URL", "http://127.0.0.1:10004")?;
+
+            let storage_client = StorageAccountClient::new_emulator(
+                http_client,
+                &blob_storage_url,
+                &table_storage_url,
+                &queue_storage_url,
+                &filesystem_url,
+            );
+
+            (true, storage_client)
+        } else {
+            (
+                false,
+                StorageAccountClient::new_access_key(
+                    Arc::clone(&http_client),
+                    &account,
+                    &access_key,
+                ),
+            )
+        };
+
+        let storage_client = storage_account_client.as_storage_client();
+        let blob_base_url = storage_account_client
+            .blob_storage_url()
+            .as_ref()
+            // make url ending consistent between the emulator and remote storage account
+            .trim_end_matches('/')
+            .to_string();
+
+        let container_client = storage_client.as_container_client(&container_name);
+
+        Ok(MicrosoftAzure {
+            container_client,
+            container_name,
+            blob_base_url,
+            is_emulator,
+        })
+    }
 }
 
 // Relevant docs: https://azure.github.io/Storage/docs/application-and-user-data/basics/azure-blob-storage-upload-apis/
@@ -729,21 +792,13 @@ impl CloudMultiPartUploadImpl for AzureMultiPartUpload {
 
 #[cfg(test)]
 mod tests {
-    use crate::azure::new_azure;
+    use super::*;
     use crate::tests::{
         copy_if_not_exists, list_uses_directories_correctly, list_with_delimiter,
         put_get_delete_list, rename_and_copy,
     };
     use std::env;
 
-    #[derive(Debug)]
-    struct AzureConfig {
-        storage_account: String,
-        access_key: String,
-        bucket: String,
-        use_emulator: bool,
-    }
-
     // Helper macro to skip tests if TEST_INTEGRATION and the Azure environment
    // variables are not set.
    macro_rules! maybe_skip_integration {
@@ -785,33 +840,28 @@ mod tests {
                );
                return;
            } else {
-                AzureConfig {
-                    storage_account: env::var("AZURE_STORAGE_ACCOUNT")
-                        .unwrap_or_default(),
-                    access_key: env::var("AZURE_STORAGE_ACCESS_KEY").unwrap_or_default(),
-                    bucket: env::var("OBJECT_STORE_BUCKET")
-                        .expect("already checked OBJECT_STORE_BUCKET"),
-                    use_emulator,
-                }
+                MicrosoftAzureBuilder::new()
+                    .with_account(env::var("AZURE_STORAGE_ACCOUNT").unwrap_or_default())
+                    .with_access_key(
+                        env::var("AZURE_STORAGE_ACCESS_KEY").unwrap_or_default(),
+                    )
+                    .with_container_name(
+                        env::var("OBJECT_STORE_BUCKET")
+                            .expect("already checked OBJECT_STORE_BUCKET"),
+                    )
+                    .with_use_emulator(use_emulator)
            }
        }};
    }
 
     #[tokio::test]
     async fn azure_blob_test() {
-        let config = maybe_skip_integration!();
-        let integration = new_azure(
-            config.storage_account,
-            config.access_key,
-            config.bucket,
-            config.use_emulator,
-        )
-        .unwrap();
-
-        put_get_delete_list(&integration).await.unwrap();
-        list_uses_directories_correctly(&integration).await.unwrap();
-        list_with_delimiter(&integration).await.unwrap();
-        rename_and_copy(&integration).await.unwrap();
-        copy_if_not_exists(&integration).await.unwrap();
+        let integration = maybe_skip_integration!().build().unwrap();
+
+        put_get_delete_list(&integration).await;
+        list_uses_directories_correctly(&integration).await;
+        list_with_delimiter(&integration).await;
+        rename_and_copy(&integration).await;
+        copy_if_not_exists(&integration).await;
     }
 }
diff --git a/object_store/src/client/backoff.rs b/object_store/src/client/backoff.rs
new file mode 100644
index 000000000000..5a6126cc45c6
--- /dev/null
+++ b/object_store/src/client/backoff.rs
@@ -0,0 +1,156 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use rand::prelude::*;
+use std::time::Duration;
+
+/// Exponential backoff with jitter
+///
+/// See <https://aws.amazon.com/blogs/architecture/exponential-backoff-and-jitter/>
+#[allow(missing_copy_implementations)]
+#[derive(Debug, Clone)]
+pub struct BackoffConfig {
+    /// The initial backoff duration
+    pub init_backoff: Duration,
+    /// The maximum backoff duration
+    pub max_backoff: Duration,
+    /// The base of the exponential to use
+    pub base: f64,
+}
+
+impl Default for BackoffConfig {
+    fn default() -> Self {
+        Self {
+            init_backoff: Duration::from_millis(100),
+            max_backoff: Duration::from_secs(15),
+            base: 2.,
+        }
+    }
+}
+
+/// [`Backoff`] can be created from a [`BackoffConfig`]
+///
+/// Consecutive calls to [`Backoff::next`] will return the next backoff interval
+///
+pub struct Backoff {
+    init_backoff: f64,
+    next_backoff_secs: f64,
+    max_backoff_secs: f64,
+    base: f64,
+    rng: Option<Box<dyn RngCore + Sync + Send>>,
+}
+
+impl std::fmt::Debug for Backoff {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("Backoff")
+            .field("init_backoff", &self.init_backoff)
+            .field("next_backoff_secs", &self.next_backoff_secs)
+            .field("max_backoff_secs", &self.max_backoff_secs)
+            .field("base", &self.base)
+            .finish()
+    }
+}
+
+impl Backoff {
+    /// Create a new [`Backoff`] from the provided [`BackoffConfig`]
+    pub fn new(config: &BackoffConfig) -> Self {
+        Self::new_with_rng(config, None)
+    }
+
+    /// Creates a new `Backoff` with the optional `rng`
+    ///
+    /// Uses [`rand::thread_rng()`] if no rng is provided
+    pub fn new_with_rng(
+        config: &BackoffConfig,
+        rng: Option<Box<dyn RngCore + Sync + Send>>,
+    ) -> Self {
+        let init_backoff = config.init_backoff.as_secs_f64();
+        Self {
+            init_backoff,
+            next_backoff_secs: init_backoff,
+            max_backoff_secs: config.max_backoff.as_secs_f64(),
+            base: config.base,
+            rng,
+        }
+    }
+
+    /// Returns the next backoff duration to wait for
+    pub fn next(&mut self) -> Duration {
+        let range = self.init_backoff..(self.next_backoff_secs * self.base);
+
+        let rand_backoff = match self.rng.as_mut() {
+            Some(rng) => rng.gen_range(range),
+            None => thread_rng().gen_range(range),
+        };
+
+        let next_backoff = self.max_backoff_secs.min(rand_backoff);
+        Duration::from_secs_f64(std::mem::replace(
+            &mut self.next_backoff_secs,
+            next_backoff,
+        ))
+    }
+}
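For intuition about the worst case of `Backoff::next`: if the maximum of the sampling range were drawn every time, the sequence is `base^i * init` capped at `max_backoff`, which is exactly what the `StepRng` test below asserts. A quick sketch with the default parameters:

```rust
use std::time::Duration;

fn main() {
    // Defaults from `BackoffConfig`: 100ms initial, base 2, 15s cap
    let (init, base, max) = (0.1_f64, 2.0_f64, 15.0_f64);
    for i in 0..10 {
        // Upper envelope of the i-th sampled interval
        let upper = (base.powi(i) * init).min(max);
        println!("retry {}: worst-case backoff {:?}", i, Duration::from_secs_f64(upper));
    }
}
```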
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use rand::rngs::mock::StepRng;
+
+    #[test]
+    fn test_backoff() {
+        let init_backoff_secs = 1.;
+        let max_backoff_secs = 500.;
+        let base = 3.;
+
+        let config = BackoffConfig {
+            init_backoff: Duration::from_secs_f64(init_backoff_secs),
+            max_backoff: Duration::from_secs_f64(max_backoff_secs),
+            base,
+        };
+
+        let assert_fuzzy_eq =
+            |a: f64, b: f64| assert!((b - a).abs() < 0.0001, "{} != {}", a, b);
+
+        // Create a static rng that takes the minimum of the range
+        let rng = Box::new(StepRng::new(0, 0));
+        let mut backoff = Backoff::new_with_rng(&config, Some(rng));
+
+        for _ in 0..20 {
+            assert_eq!(backoff.next().as_secs_f64(), init_backoff_secs);
+        }
+
+        // Create a static rng that takes the maximum of the range
+        let rng = Box::new(StepRng::new(u64::MAX, 0));
+        let mut backoff = Backoff::new_with_rng(&config, Some(rng));
+
+        for i in 0..20 {
+            let value = (base.powi(i) * init_backoff_secs).min(max_backoff_secs);
+            assert_fuzzy_eq(backoff.next().as_secs_f64(), value);
+        }
+
+        // Create a static rng that takes the mid point of the range
+        let rng = Box::new(StepRng::new(u64::MAX / 2, 0));
+        let mut backoff = Backoff::new_with_rng(&config, Some(rng));
+
+        let mut value = init_backoff_secs;
+        for _ in 0..20 {
+            assert_fuzzy_eq(backoff.next().as_secs_f64(), value);
+            value = (init_backoff_secs + (value * base - init_backoff_secs) / 2.)
+                .min(max_backoff_secs);
+        }
+    }
+}
diff --git a/object_store/src/client/mod.rs b/object_store/src/client/mod.rs
new file mode 100644
index 000000000000..1166ebe7a525
--- /dev/null
+++ b/object_store/src/client/mod.rs
@@ -0,0 +1,23 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Generic utilities for reqwest-based ObjectStore implementations
+
+pub mod backoff;
+pub mod oauth;
+pub mod retry;
+pub mod token;
diff --git a/object_store/src/oauth.rs b/object_store/src/client/oauth.rs
similarity index 96%
rename from object_store/src/oauth.rs
rename to object_store/src/client/oauth.rs
index 273e37b64922..88e7a7b0f9e8 100644
--- a/object_store/src/oauth.rs
+++ b/object_store/src/client/oauth.rs
@@ -15,7 +15,9 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::token::TemporaryToken;
+use crate::client::retry::RetryExt;
+use crate::client::token::TemporaryToken;
+use crate::RetryConfig;
 use reqwest::{Client, Method};
 use ring::signature::RsaKeyPair;
 use snafu::{ResultExt, Snafu};
@@ -133,7 +135,11 @@ impl OAuthProvider {
     }
 
     /// Fetch a fresh token
-    pub async fn fetch_token(&self, client: &Client) -> Result<TemporaryToken<String>> {
+    pub async fn fetch_token(
+        &self,
+        client: &Client,
+        retry: &RetryConfig,
+    ) -> Result<TemporaryToken<String>> {
         let now = seconds_since_epoch();
         let exp = now + 3600;
 
@@ -168,7 +174,7 @@ impl OAuthProvider {
         let response: TokenResponse = client
             .request(Method::POST, &self.audience)
             .form(&body)
-            .send()
+            .send_retry(retry)
             .await
             .context(TokenRequestSnafu)?
            .error_for_status()
diff --git a/object_store/src/client/retry.rs b/object_store/src/client/retry.rs
new file mode 100644
index 000000000000..c4dd6ee934cb
--- /dev/null
+++ b/object_store/src/client/retry.rs
@@ -0,0 +1,106 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! A shared HTTP client implementation incorporating retries
+
+use crate::client::backoff::{Backoff, BackoffConfig};
+use futures::future::BoxFuture;
+use futures::FutureExt;
+use reqwest::{Response, Result};
+use std::time::{Duration, Instant};
+use tracing::info;
+
+/// Contains the configuration for how to respond to server errors
+///
+/// By default requests will be retried up to some limit, using exponential
+/// backoff with jitter. See [`BackoffConfig`] for more information
+///
+#[derive(Debug, Clone)]
+pub struct RetryConfig {
+    /// The backoff configuration
+    pub backoff: BackoffConfig,
+
+    /// The maximum number of times to retry a request
+    ///
+    /// Set to 0 to disable retries
+    pub max_retries: usize,
+
+    /// The maximum length of time from the initial request
+    /// after which no further retries will be attempted
+    ///
+    /// This not only bounds the length of time before a server
+    /// error will be surfaced to the application, but also bounds
+    /// the length of time a request's credentials must remain valid.
+    ///
+    /// As requests are retried without renewing credentials or
+    /// regenerating request payloads, this number should be kept
+    /// below 5 minutes to avoid errors due to expired credentials
+    /// and/or request payloads
+    pub retry_timeout: Duration,
+}
+
+impl Default for RetryConfig {
+    fn default() -> Self {
+        Self {
+            backoff: Default::default(),
+            max_retries: 10,
+            retry_timeout: Duration::from_secs(3 * 60),
+        }
+    }
+}
+
+pub trait RetryExt {
+    /// Dispatch a request with the given retry configuration
+    ///
+    /// # Panics
+    ///
+    /// This will panic if the request body is a stream
+    fn send_retry(self, config: &RetryConfig) -> BoxFuture<'static, Result<Response>>;
+}
+
+impl RetryExt for reqwest::RequestBuilder {
+    fn send_retry(self, config: &RetryConfig) -> BoxFuture<'static, Result<Response>> {
+        let mut backoff = Backoff::new(&config.backoff);
+        let max_retries = config.max_retries;
+        let retry_timeout = config.retry_timeout;
+
+        async move {
+            let mut retries = 0;
+            let now = Instant::now();
+
+            loop {
+                let s = self.try_clone().expect("request body must be cloneable");
+                match s.send().await {
+                    Err(e)
+                        if retries < max_retries
+                            && now.elapsed() < retry_timeout
+                            && e.status()
+                                .map(|s| s.is_server_error())
+                                .unwrap_or(false) =>
+                    {
+                        let sleep = backoff.next();
+                        retries += 1;
+                        info!("Encountered server error, backing off for {} seconds, retry {} of {}", sleep.as_secs_f32(), retries, max_retries);
+                        tokio::time::sleep(sleep).await;
+                    }
+                    r => return r,
+                }
+            }
+        }
+        .boxed()
+    }
+}
diff --git a/object_store/src/token.rs b/object_store/src/client/token.rs
similarity index 100%
rename from object_store/src/token.rs
rename to object_store/src/client/token.rs
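`RetryConfig` and `BackoffConfig` are re-exported from the crate root later in this diff (currently behind the `gcp` feature), so a caller can tune the policy. A sketch of a more aggressive policy than the default:

```rust
use object_store::{BackoffConfig, RetryConfig};
use std::time::Duration;

// Up to 5 retries within 30 seconds, starting from a 50ms backoff.
let retry = RetryConfig {
    backoff: BackoffConfig {
        init_backoff: Duration::from_millis(50),
        max_backoff: Duration::from_secs(5),
        base: 2.0,
    },
    max_retries: 5,
    retry_timeout: Duration::from_secs(30),
};
```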
diff --git a/object_store/src/gcp.rs b/object_store/src/gcp.rs
index d740625bd92f..0dc5a956ac08 100644
--- a/object_store/src/gcp.rs
+++ b/object_store/src/gcp.rs
@@ -46,14 +46,13 @@
 use reqwest::{header, Client, Method, Response, StatusCode};
 use snafu::{ResultExt, Snafu};
 use tokio::io::AsyncWrite;
 
-use crate::multipart::{CloudMultiPartUpload, CloudMultiPartUploadImpl, UploadPart};
-use crate::util::format_http_range;
+use crate::client::retry::RetryExt;
 use crate::{
-    oauth::OAuthProvider,
+    client::{oauth::OAuthProvider, token::TokenCache},
+    multipart::{CloudMultiPartUpload, CloudMultiPartUploadImpl, UploadPart},
     path::{Path, DELIMITER},
-    token::TokenCache,
-    util::format_prefix,
-    GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Result,
+    util::{format_http_range, format_prefix},
+    GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Result, RetryConfig,
 };
 
 #[derive(Debug, Snafu)]
 enum Error {
@@ -98,6 +97,12 @@ enum Error {
 
     #[snafu(display("Error decoding object size: {}", source))]
     InvalidSize { source: std::num::ParseIntError },
+
+    #[snafu(display("Missing bucket name"))]
+    MissingBucketName {},
+
+    #[snafu(display("Missing service account path"))]
+    MissingServiceAccountPath,
 }
 
 impl From<Error> for super::Error {
@@ -186,7 +191,7 @@ struct CompleteMultipartUpload {
     parts: Vec<Part>,
 }
 
-/// Configuration for connecting to [Google Cloud Storage](https://cloud.google.com/storage/).
+/// Interface for [Google Cloud Storage](https://cloud.google.com/storage/).
 #[derive(Debug)]
 pub struct GoogleCloudStorage {
     client: Arc<GoogleCloudStorageClient>,
@@ -209,6 +214,8 @@ struct GoogleCloudStorageClient {
     bucket_name: String,
     bucket_name_encoded: String,
 
+    retry_config: RetryConfig,
+
     // TODO: Hook this up in tests
     max_list_results: Option<usize>,
 }
@@ -218,7 +225,9 @@ impl GoogleCloudStorageClient {
         if let Some(oauth_provider) = &self.oauth_provider {
             Ok(self
                 .token_cache
-                .get_or_insert_with(|| oauth_provider.fetch_token(&self.client))
+                .get_or_insert_with(|| {
+                    oauth_provider.fetch_token(&self.client, &self.retry_config)
+                })
                 .await?)
         } else {
             Ok("".to_owned())
@@ -258,7 +267,7 @@ impl GoogleCloudStorageClient {
         let response = builder
             .bearer_auth(token)
             .query(&[("alt", alt)])
-            .send()
+            .send_retry(&self.retry_config)
             .await
             .context(GetRequestSnafu {
                 path: path.as_ref(),
@@ -286,7 +295,7 @@ impl GoogleCloudStorageClient {
             .header(header::CONTENT_LENGTH, payload.len())
             .query(&[("uploadType", "media"), ("name", path.as_ref())])
             .body(payload)
-            .send()
+            .send_retry(&self.retry_config)
             .await
             .context(PutRequestSnafu)?
            .error_for_status()
@@ -307,7 +316,7 @@ impl GoogleCloudStorageClient {
             .header(header::CONTENT_TYPE, "application/octet-stream")
             .header(header::CONTENT_LENGTH, "0")
             .query(&[("uploads", "")])
-            .send()
+            .send_retry(&self.retry_config)
             .await
             .context(PutRequestSnafu)?
            .error_for_status()
@@ -341,7 +350,7 @@ impl GoogleCloudStorageClient {
             .header(header::CONTENT_TYPE, "application/octet-stream")
             .header(header::CONTENT_LENGTH, "0")
             .query(&[("uploadId", multipart_id)])
-            .send()
+            .send_retry(&self.retry_config)
             .await
             .context(PutRequestSnafu)?
            .error_for_status()
@@ -358,7 +367,7 @@ impl GoogleCloudStorageClient {
         let builder = self.client.request(Method::DELETE, url);
         builder
             .bearer_auth(token)
-            .send()
+            .send_retry(&self.retry_config)
             .await
             .context(DeleteRequestSnafu {
                 path: path.as_ref(),
@@ -401,7 +410,7 @@ impl GoogleCloudStorageClient {
 
         builder
             .bearer_auth(token)
-            .send()
+            .send_retry(&self.retry_config)
             .await
             .context(CopyRequestSnafu {
                 path: from.as_ref(),
@@ -450,7 +459,7 @@ impl GoogleCloudStorageClient {
             .request(Method::GET, url)
             .query(&query)
             .bearer_auth(token)
-            .send()
+            .send_retry(&self.retry_config)
             .await
             .context(ListRequestSnafu)?
            .error_for_status()
@@ -566,7 +575,7 @@ impl CloudMultiPartUploadImpl for GCSMultipartUpload {
             .header(header::CONTENT_TYPE, "application/octet-stream")
             .header(header::CONTENT_LENGTH, format!("{}", buf.len()))
             .body(buf)
-            .send()
+            .send_retry(&client.retry_config)
             .await
             .map_err(reqwest_error_as_io)?
            .error_for_status()
@@ -637,7 +646,7 @@ impl CloudMultiPartUploadImpl for GCSMultipartUpload {
             .bearer_auth(token)
             .query(&[("uploadId", upload_id)])
             .body(data)
-            .send()
+            .send_retry(&client.retry_config)
             .await
             .map_err(reqwest_error_as_io)?
            .error_for_status()
@@ -779,55 +788,131 @@ fn reader_credentials_file(
     Ok(serde_json::from_reader(reader).context(DecodeCredentialsSnafu)?)
 }
 
-/// Configure a connection to Google Cloud Storage.
-pub fn new_gcs(
-    service_account_path: impl AsRef<std::path::Path>,
-    bucket_name: impl Into<String>,
-) -> Result<GoogleCloudStorage> {
-    new_gcs_with_client(service_account_path, bucket_name, Client::new())
+/// Configure a connection to Google Cloud Storage using the specified
+/// credentials.
+///
+/// # Example
+/// ```
+/// # let BUCKET_NAME = "foo";
+/// # let SERVICE_ACCOUNT_PATH = "/tmp/foo.json";
+/// # use object_store::gcp::GoogleCloudStorageBuilder;
+/// let gcs = GoogleCloudStorageBuilder::new()
+///     .with_service_account_path(SERVICE_ACCOUNT_PATH)
+///     .with_bucket_name(BUCKET_NAME)
+///     .build();
+/// ```
+#[derive(Debug, Default)]
+pub struct GoogleCloudStorageBuilder {
+    bucket_name: Option<String>,
+    service_account_path: Option<String>,
+    client: Option<Client>,
+    retry_config: RetryConfig,
 }
 
-/// Configure a connection to Google Cloud Storage with the specified HTTP client.
-pub fn new_gcs_with_client(
-    service_account_path: impl AsRef<std::path::Path>,
-    bucket_name: impl Into<String>,
-    client: Client,
-) -> Result<GoogleCloudStorage> {
-    let credentials = reader_credentials_file(service_account_path)?;
-
-    // TODO: https://cloud.google.com/storage/docs/authentication#oauth-scopes
-    let scope = "https://www.googleapis.com/auth/devstorage.full_control";
-    let audience = "https://www.googleapis.com/oauth2/v4/token".to_string();
-
-    let oauth_provider = (!credentials.disable_oauth)
-        .then(|| {
-            OAuthProvider::new(
-                credentials.client_email,
-                credentials.private_key,
-                scope.to_string(),
-                audience,
-            )
-        })
-        .transpose()?;
+impl GoogleCloudStorageBuilder {
+    /// Create a new [`GoogleCloudStorageBuilder`] with default values.
+    pub fn new() -> Self {
+        Default::default()
+    }
 
-    let bucket_name = bucket_name.into();
-    let encoded_bucket_name =
-        percent_encode(bucket_name.as_bytes(), NON_ALPHANUMERIC).to_string();
+    /// Set the bucket name (required)
+    pub fn with_bucket_name(mut self, bucket_name: impl Into<String>) -> Self {
+        self.bucket_name = Some(bucket_name.into());
+        self
+    }
 
-    // The cloud storage crate currently only supports authentication via
-    // environment variables. Set the environment variable explicitly so
-    // that we can optionally accept command line arguments instead.
-    Ok(GoogleCloudStorage {
-        client: Arc::new(GoogleCloudStorageClient {
-            client,
-            base_url: credentials.gcs_base_url,
-            oauth_provider,
-            token_cache: Default::default(),
+    /// Set the path to the service account file (required). Example
+    /// `"/tmp/gcs.json"`
+    ///
+    /// Example contents of `gcs.json`:
+    ///
+    /// ```json
+    /// {
+    ///    "gcs_base_url": "https://localhost:4443",
+    ///    "disable_oauth": true,
+    ///    "client_email": "",
+    ///    "private_key": ""
+    /// }
+    /// ```
+    pub fn with_service_account_path(
+        mut self,
+        service_account_path: impl Into<String>,
+    ) -> Self {
+        self.service_account_path = Some(service_account_path.into());
+        self
+    }
+
+    /// Set the retry configuration
+    pub fn with_retry(mut self, retry_config: RetryConfig) -> Self {
+        self.retry_config = retry_config;
+        self
+    }
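Continuing the `RetryConfig` sketch from the retry module above, `with_retry` is how the policy reaches the GCS client; path and bucket here are placeholders:

```rust
use object_store::gcp::GoogleCloudStorageBuilder;

// Every request issued by this client goes through `send_retry`
// with the supplied policy (`retry` as constructed in the earlier sketch).
let gcs = GoogleCloudStorageBuilder::new()
    .with_service_account_path("/tmp/gcs.json")
    .with_bucket_name("test-bucket")
    .with_retry(retry)
    .build();
```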
+
+    /// Use the specified http [`Client`] (defaults to [`Client::new`])
+    ///
+    /// This allows you to set custom client options such as allowing
+    /// non secure connections or custom headers.
+    ///
+    /// NOTE: Currently only available in `test`s to facilitate
+    /// testing, to avoid leaking details and preserve our ability to
+    /// make changes to the implementation.
+    #[cfg(test)]
+    pub fn with_client(mut self, client: Client) -> Self {
+        self.client = Some(client);
+        self
+    }
+
+    /// Configure a connection to Google Cloud Storage, returning a
+    /// new [`GoogleCloudStorage`] and consuming `self`
+    pub fn build(self) -> Result<GoogleCloudStorage> {
+        let Self {
             bucket_name,
-            bucket_name_encoded: encoded_bucket_name,
-            max_list_results: None,
-        }),
-    })
+            service_account_path,
+            client,
+            retry_config,
+        } = self;
+
+        let bucket_name = bucket_name.ok_or(Error::MissingBucketName {})?;
+        let service_account_path =
+            service_account_path.ok_or(Error::MissingServiceAccountPath)?;
+        let client = client.unwrap_or_else(Client::new);
+
+        let credentials = reader_credentials_file(service_account_path)?;
+
+        // TODO: https://cloud.google.com/storage/docs/authentication#oauth-scopes
+        let scope = "https://www.googleapis.com/auth/devstorage.full_control";
+        let audience = "https://www.googleapis.com/oauth2/v4/token".to_string();
+
+        let oauth_provider = (!credentials.disable_oauth)
+            .then(|| {
+                OAuthProvider::new(
+                    credentials.client_email,
+                    credentials.private_key,
+                    scope.to_string(),
+                    audience,
+                )
+            })
+            .transpose()?;
+
+        let encoded_bucket_name =
+            percent_encode(bucket_name.as_bytes(), NON_ALPHANUMERIC).to_string();
+
+        // The cloud storage crate currently only supports authentication via
+        // environment variables. Set the environment variable explicitly so
+        // that we can optionally accept command line arguments instead.
+        Ok(GoogleCloudStorage {
+            client: Arc::new(GoogleCloudStorageClient {
+                client,
+                base_url: credentials.gcs_base_url,
+                oauth_provider,
+                token_cache: Default::default(),
+                bucket_name,
+                bucket_name_encoded: encoded_bucket_name,
+                retry_config,
+                max_list_results: None,
+            }),
+        })
+    }
 }
 
 fn convert_object_meta(object: &Object) -> Result<ObjectMeta> {
@@ -860,24 +945,6 @@ mod test {
 
     const NON_EXISTENT_NAME: &str = "nonexistentname";
 
-    #[derive(Debug)]
-    struct GoogleCloudConfig {
-        bucket: String,
-        service_account: String,
-    }
-
-    impl GoogleCloudConfig {
-        fn build_test(self) -> Result<GoogleCloudStorage> {
-            // ignore HTTPS errors in tests so we can use fake-gcs server
-            let client = Client::builder()
-                .danger_accept_invalid_certs(true)
-                .build()
-                .expect("Error creating http client for testing");
-
-            new_gcs_with_client(self.service_account, self.bucket, client)
-        }
-    }
-
     // Helper macro to skip tests if TEST_INTEGRATION and the GCP environment variables are not set.
    macro_rules! maybe_skip_integration {
maybe_skip_integration { () => {{ @@ -912,36 +979,44 @@ mod test { ); return; } else { - GoogleCloudConfig { - bucket: env::var("OBJECT_STORE_BUCKET") - .expect("already checked OBJECT_STORE_BUCKET"), - service_account: env::var("GOOGLE_SERVICE_ACCOUNT") - .expect("already checked GOOGLE_SERVICE_ACCOUNT"), - } + GoogleCloudStorageBuilder::new() + .with_bucket_name( + env::var("OBJECT_STORE_BUCKET") + .expect("already checked OBJECT_STORE_BUCKET") + ) + .with_service_account_path( + env::var("GOOGLE_SERVICE_ACCOUNT") + .expect("already checked GOOGLE_SERVICE_ACCOUNT") + ) + .with_client( + // ignore HTTPS errors in tests so we can use fake-gcs server + Client::builder() + .danger_accept_invalid_certs(true) + .build() + .expect("Error creating http client for testing") + ) } }}; } #[tokio::test] async fn gcs_test() { - let config = maybe_skip_integration!(); - let integration = config.build_test().unwrap(); + let integration = maybe_skip_integration!().build().unwrap(); - put_get_delete_list(&integration).await.unwrap(); - list_uses_directories_correctly(&integration).await.unwrap(); - list_with_delimiter(&integration).await.unwrap(); - rename_and_copy(&integration).await.unwrap(); + put_get_delete_list(&integration).await; + list_uses_directories_correctly(&integration).await; + list_with_delimiter(&integration).await; + rename_and_copy(&integration).await; if integration.client.base_url == default_gcs_base_url() { // Fake GCS server does not yet implement XML Multipart uploads // https://github.com/fsouza/fake-gcs-server/issues/852 - stream_get(&integration).await.unwrap(); + stream_get(&integration).await; } } #[tokio::test] async fn gcs_test_get_nonexistent_location() { - let config = maybe_skip_integration!(); - let integration = config.build_test().unwrap(); + let integration = maybe_skip_integration!().build().unwrap(); let location = Path::from_iter([NON_EXISTENT_NAME]); @@ -956,9 +1031,10 @@ mod test { #[tokio::test] async fn gcs_test_get_nonexistent_bucket() { - let mut config = maybe_skip_integration!(); - config.bucket = NON_EXISTENT_NAME.into(); - let integration = config.build_test().unwrap(); + let integration = maybe_skip_integration!() + .with_bucket_name(NON_EXISTENT_NAME) + .build() + .unwrap(); let location = Path::from_iter([NON_EXISTENT_NAME]); @@ -975,8 +1051,7 @@ mod test { #[tokio::test] async fn gcs_test_delete_nonexistent_location() { - let config = maybe_skip_integration!(); - let integration = config.build_test().unwrap(); + let integration = maybe_skip_integration!().build().unwrap(); let location = Path::from_iter([NON_EXISTENT_NAME]); @@ -990,9 +1065,10 @@ mod test { #[tokio::test] async fn gcs_test_delete_nonexistent_bucket() { - let mut config = maybe_skip_integration!(); - config.bucket = NON_EXISTENT_NAME.into(); - let integration = config.build_test().unwrap(); + let integration = maybe_skip_integration!() + .with_bucket_name(NON_EXISTENT_NAME) + .build() + .unwrap(); let location = Path::from_iter([NON_EXISTENT_NAME]); @@ -1006,9 +1082,10 @@ mod test { #[tokio::test] async fn gcs_test_put_nonexistent_bucket() { - let mut config = maybe_skip_integration!(); - config.bucket = NON_EXISTENT_NAME.into(); - let integration = config.build_test().unwrap(); + let integration = maybe_skip_integration!() + .with_bucket_name(NON_EXISTENT_NAME) + .build() + .unwrap(); let location = Path::from_iter([NON_EXISTENT_NAME]); let data = Bytes::from("arbitrary data"); diff --git a/object_store/src/lib.rs b/object_store/src/lib.rs index 54d28273fa97..f7adedb2682c 
--- a/object_store/src/lib.rs
+++ b/object_store/src/lib.rs
@@ -28,15 +28,129 @@
 
 //! # object_store
 //!
-//! This crate provides APIs for interacting with object storage services.
+//! This crate provides a uniform API for interacting with object storage services and
+//! local files via the [`ObjectStore`] trait.
 //!
-//! It currently supports PUT (single or chunked/concurrent), GET, DELETE, HEAD and list for:
+//! # Create an [`ObjectStore`] implementation:
 //!
-//! * [Google Cloud Storage](https://cloud.google.com/storage/)
-//! * [Amazon S3](https://aws.amazon.com/s3/)
-//! * [Azure Blob Storage](https://azure.microsoft.com/en-gb/services/storage/blobs/#overview)
-//! * In-memory
-//! * Local file storage
+//! * [Google Cloud Storage](https://cloud.google.com/storage/): [`GoogleCloudStorageBuilder`](gcp::GoogleCloudStorageBuilder)
+//! * [Amazon S3](https://aws.amazon.com/s3/): [`AmazonS3Builder`](aws::AmazonS3Builder)
+//! * [Azure Blob Storage](https://azure.microsoft.com/en-gb/services/storage/blobs/): [`MicrosoftAzureBuilder`](azure::MicrosoftAzureBuilder)
+//! * In Memory: [`InMemory`](memory::InMemory)
+//! * Local filesystem: [`LocalFileSystem`](local::LocalFileSystem)
+//!
+//! # Adapters
+//!
+//! [`ObjectStore`] instances can be composed with various adapters
+//! which add additional functionality:
+//!
+//! * Rate Throttling: [`ThrottleConfig`](throttle::ThrottleConfig)
+//! * Concurrent Request Limit: [`LimitStore`](limit::LimitStore)
+//!
+//!
+//! # Listing objects:
+//!
+//! Use the [`ObjectStore::list`] method to iterate over objects in
+//! remote storage or files in the local filesystem:
+//!
+//! ```
+//! # use object_store::local::LocalFileSystem;
+//! # // use LocalFileSystem for example
+//! # fn get_object_store() -> LocalFileSystem {
+//! #   LocalFileSystem::new_with_prefix("/tmp").unwrap()
+//! # }
+//!
+//! # async fn example() {
+//! use std::sync::Arc;
+//! use object_store::{path::Path, ObjectStore};
+//! use futures::stream::StreamExt;
+//!
+//! // create an ObjectStore
+//! let object_store: Arc<dyn ObjectStore> = Arc::new(get_object_store());
+//!
+//! // Recursively list all files below the 'data' path.
+//! // 1. On AWS S3 this would be the 'data/' prefix
+//! // 2. On a local filesystem, this would be the 'data' directory
+//! let prefix: Path = "data".try_into().unwrap();
+//!
+//! // Get an `async` stream of Metadata objects:
+//! let list_stream = object_store
+//!     .list(Some(&prefix))
+//!     .await
+//!     .expect("Error listing files");
+//!
+//! // Print a line about each object based on its metadata
+//! // using for_each from `StreamExt` trait.
+//! list_stream
+//!     .for_each(move |meta|  {
+//!         async {
+//!             let meta = meta.expect("Error listing");
+//!             println!("Name: {}, size: {}", meta.location, meta.size);
+//!         }
+//!     })
+//!     .await;
+//! # }
+//! ```
+//!
+//! Which will print out something like the following:
+//!
+//! ```text
+//! Name: data/file01.parquet, size: 112832
+//! Name: data/file02.parquet, size: 143119
+//! Name: data/child/file03.parquet, size: 100
+//! ...
+//! ```
+//!
+//! # Fetching objects
+//!
+//! Use the [`ObjectStore::get`] method to fetch the data bytes
+//! from remote storage or files in the local filesystem as a stream.
+//!
+//! ```
+//! # use object_store::local::LocalFileSystem;
+//! # // use LocalFileSystem for example
+//! # fn get_object_store() -> LocalFileSystem {
+//! #   LocalFileSystem::new_with_prefix("/tmp").unwrap()
+//! # }
+//!
+//! # async fn example() {
+//! use std::sync::Arc;
+//! use object_store::{path::Path, ObjectStore};
+//! use futures::stream::StreamExt;
+//!
+//! // create an ObjectStore
+//! let object_store: Arc<dyn ObjectStore> = Arc::new(get_object_store());
+//!
+//! // Retrieve a specific file
+//! let path: Path = "data/file01.parquet".try_into().unwrap();
+//!
+//! // fetch the bytes from object store
+//! let stream = object_store
+//!     .get(&path)
+//!     .await
+//!     .unwrap()
+//!     .into_stream();
+//!
+//! // Count the '0's using `map` from `StreamExt` trait
+//! let num_zeros = stream
+//!     .map(|bytes| {
+//!         let bytes = bytes.unwrap();
+//!         bytes.iter().filter(|b| **b == 0).count()
+//!     })
+//!     .collect::<Vec<usize>>()
+//!     .await
+//!     .into_iter()
+//!     .sum::<usize>();
+//!
+//! println!("Num zeros in {} is {}", path, num_zeros);
+//! # }
+//! ```
+//!
+//! Which will print out something like the following:
+//!
+//! ```text
+//! Num zeros in data/file01.parquet is 657
+//! ```
 //!
 
 #[cfg(feature = "aws")]
@@ -45,23 +159,26 @@ pub mod aws;
 pub mod azure;
 #[cfg(feature = "gcp")]
 pub mod gcp;
+pub mod limit;
 pub mod local;
 pub mod memory;
 pub mod path;
 pub mod throttle;
 
 #[cfg(feature = "gcp")]
-mod oauth;
+mod client;
 
 #[cfg(feature = "gcp")]
-mod token;
+pub use client::{backoff::BackoffConfig, retry::RetryConfig};
 
 #[cfg(any(feature = "azure", feature = "aws", feature = "gcp"))]
 mod multipart;
 mod util;
 
 use crate::path::Path;
-use crate::util::{collect_bytes, maybe_spawn_blocking};
+use crate::util::{
+    coalesce_ranges, collect_bytes, maybe_spawn_blocking, OBJECT_STORE_COALESCE_DEFAULT,
+};
 use async_trait::async_trait;
 use bytes::Bytes;
 use chrono::{DateTime, Utc};
@@ -116,6 +233,21 @@ pub trait ObjectStore: std::fmt::Display + Send + Sync + Debug + 'static {
     /// in the given byte range
     async fn get_range(&self, location: &Path, range: Range<usize>) -> Result<Bytes>;
 
+    /// Return the bytes that are stored at the specified location
+    /// in the given byte ranges
+    async fn get_ranges(
+        &self,
+        location: &Path,
+        ranges: &[Range<usize>],
+    ) -> Result<Vec<Bytes>> {
+        coalesce_ranges(
+            ranges,
+            |range| self.get_range(location, range),
+            OBJECT_STORE_COALESCE_DEFAULT,
+        )
+        .await
+    }
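The `get_ranges` default implementation added above fans out to `get_range`, coalescing nearby ranges (up to `OBJECT_STORE_COALESCE_DEFAULT`) into fewer requests. A usage sketch against any store (the path is illustrative):

```rust
use object_store::{path::Path, ObjectStore};

// Fetch several byte ranges of one object in a single call; the
// provided ranges may be coalesced into fewer underlying GETs.
async fn read_ranges(store: &dyn ObjectStore) -> object_store::Result<()> {
    let path = Path::from("data/file01.parquet");
    let ranges = vec![0..4, 4..8, 1000..1024];
    let chunks = store.get_ranges(&path, &ranges).await?;
    for (range, bytes) in ranges.iter().zip(&chunks) {
        println!("{:?} -> {} bytes", range, bytes.len());
    }
    Ok(())
}
```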
+
     /// Return the metadata for the specified location
     async fn head(&self, location: &Path) -> Result<ObjectMeta>;
 
@@ -181,7 +313,7 @@ pub struct ListResult {
 }
 
 /// The metadata that describes an object.
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Eq)]
 pub struct ObjectMeta {
     /// The full path to the object
     pub location: Path,
@@ -336,7 +468,7 @@ pub enum Error {
 
     #[cfg(feature = "gcp")]
     #[snafu(display("OAuth error: {}", source), context(false))]
-    OAuth { source: oauth::Error },
+    OAuth { source: client::oauth::Error },
 }
 
 #[cfg(test)]
@@ -363,15 +495,12 @@ mod tests {
     use crate::test_util::flatten_list_stream;
     use tokio::io::AsyncWriteExt;
 
-    type Error = Box<dyn std::error::Error + Send + Sync + 'static>;
-    type Result<T, E = Error> = std::result::Result<T, E>;
-
-    pub(crate) async fn put_get_delete_list(storage: &DynObjectStore) -> Result<()> {
+    pub(crate) async fn put_get_delete_list(storage: &DynObjectStore) {
         let store_str = storage.to_string();
 
         delete_fixtures(storage).await;
 
-        let content_list = flatten_list_stream(storage, None).await?;
+        let content_list = flatten_list_stream(storage, None).await.unwrap();
         assert!(
             content_list.is_empty(),
             "Expected list to be empty; found: {:?}",
@@ -382,16 +511,16 @@ mod tests {
         let data = Bytes::from("arbitrary data");
         let expected_data = data.clone();
-        storage.put(&location, data).await?;
+        storage.put(&location, data).await.unwrap();
 
         let root = Path::from("/");
 
         // List everything
-        let content_list = flatten_list_stream(storage, None).await?;
+        let content_list = flatten_list_stream(storage, None).await.unwrap();
         assert_eq!(content_list, &[location.clone()]);
 
         // Should behave the same as no prefix
-        let content_list = flatten_list_stream(storage, Some(&root)).await?;
+        let content_list = flatten_list_stream(storage, Some(&root)).await.unwrap();
         assert_eq!(content_list, &[location.clone()]);
 
         // List with delimiter
@@ -408,15 +537,15 @@ mod tests {
 
         // List everything starting with a prefix that should return results
         let prefix = Path::from("test_dir");
-        let content_list = flatten_list_stream(storage, Some(&prefix)).await?;
+        let content_list = flatten_list_stream(storage, Some(&prefix)).await.unwrap();
         assert_eq!(content_list, &[location.clone()]);
 
         // List everything starting with a prefix that shouldn't return results
         let prefix = Path::from("something");
-        let content_list = flatten_list_stream(storage, Some(&prefix)).await?;
+        let content_list = flatten_list_stream(storage, Some(&prefix)).await.unwrap();
         assert!(content_list.is_empty());
 
-        let read_data = storage.get(&location).await?.bytes().await?;
+        let read_data = storage.get(&location).await.unwrap().bytes().await.unwrap();
         assert_eq!(&*read_data, expected_data);
 
         // Test range request
@@ -440,14 +569,20 @@ mod tests {
 
             // Should be a non-fatal error
             out_of_range_result.unwrap_err();
+
+            let ranges = vec![0..1, 2..3, 0..5];
+            let bytes = storage.get_ranges(&location, &ranges).await.unwrap();
+            for (range, bytes) in ranges.iter().zip(bytes) {
+                assert_eq!(bytes, expected_data.slice(range.clone()))
+            }
         }
 
-        let head = storage.head(&location).await?;
+        let head = storage.head(&location).await.unwrap();
         assert_eq!(head.size, expected_data.len());
 
-        storage.delete(&location).await?;
+        storage.delete(&location).await.unwrap();
 
-        let content_list = flatten_list_stream(storage, None).await?;
+        let content_list = flatten_list_stream(storage, None).await.unwrap();
         assert!(content_list.is_empty());
 
         let err = storage.get(&location).await.unwrap_err();
@@ -520,13 +655,54 @@ mod tests {
 
         assert_eq!(files, vec![emoji_file.clone()]);
 
+        let dst = Path::from("foo.parquet");
+        storage.copy(&emoji_file, &dst).await.unwrap();
+        let mut files = flatten_list_stream(storage, None).await.unwrap();
+        files.sort_unstable();
+        assert_eq!(files, vec![emoji_file.clone(), dst.clone()]);
+
+        storage.delete(&emoji_file).await.unwrap();
+        storage.delete(&dst).await.unwrap();
         let files = flatten_list_stream(storage, Some(&emoji_prefix))
             .await
             .unwrap();
         assert!(files.is_empty());
 
-        Ok(())
+        // Test handling of paths containing percent-encoded sequences
+
+        // "HELLO" percent encoded
+        let hello_prefix = Path::parse("%48%45%4C%4C%4F").unwrap();
+        let path = hello_prefix.child("foo.parquet");
+
+        storage.put(&path, Bytes::from(vec![0, 1])).await.unwrap();
+        let files = flatten_list_stream(storage, Some(&hello_prefix))
+            .await
+            .unwrap();
+        assert_eq!(files, vec![path.clone()]);
+
+        // Cannot list by decoded representation
+        let files = flatten_list_stream(storage, Some(&Path::from("HELLO")))
+            .await
+            .unwrap();
+        assert!(files.is_empty());
+
+        // Cannot access by decoded representation
+        let err = storage
+            .head(&Path::from("HELLO/foo.parquet"))
+            .await
+            .unwrap_err();
+        assert!(matches!(err, crate::Error::NotFound { .. }), "{}", err);
+
+        storage.delete(&path).await.unwrap();
+
+        // Can also write non-percent encoded sequences
+        let path = Path::parse("%Q.parquet").unwrap();
+        storage.put(&path, Bytes::from(vec![0, 1])).await.unwrap();
+
+        let files = flatten_list_stream(storage, None).await.unwrap();
+        assert_eq!(files, vec![path.clone()]);
+
+        storage.delete(&path).await.unwrap();
     }
 
     fn get_vec_of_bytes(chunk_length: usize, num_chunks: usize) -> Vec<Bytes> {
@@ -535,15 +711,15 @@ mod tests {
             .collect()
     }
 
-    pub(crate) async fn stream_get(storage: &DynObjectStore) -> Result<()> {
+    pub(crate) async fn stream_get(storage: &DynObjectStore) {
         let location = Path::from("test_dir/test_upload_file.txt");
 
         // Can write to storage
         let data = get_vec_of_bytes(5_000_000, 10);
         let bytes_expected = data.concat();
-        let (_, mut writer) = storage.put_multipart(&location).await?;
+        let (_, mut writer) = storage.put_multipart(&location).await.unwrap();
         for chunk in &data {
-            writer.write_all(chunk).await?;
+            writer.write_all(chunk).await.unwrap();
         }
 
         // Object should not yet exist in store
        let get_res = storage.get(&location).await;
        assert!(get_res.is_err());
        assert!(matches!(
            get_res.unwrap_err(),
            crate::Error::NotFound { .. }
        ));
 
-        writer.shutdown().await?;
-        let bytes_written = storage.get(&location).await?.bytes().await?;
+        writer.shutdown().await.unwrap();
+        let bytes_written = storage.get(&location).await.unwrap().bytes().await.unwrap();
         assert_eq!(bytes_expected, bytes_written);
 
         // Can overwrite some storage
         let data = get_vec_of_bytes(5_000, 5);
         let bytes_expected = data.concat();
-        let (_, mut writer) = storage.put_multipart(&location).await?;
+        let (_, mut writer) = storage.put_multipart(&location).await.unwrap();
         for chunk in &data {
-            writer.write_all(chunk).await?;
+            writer.write_all(chunk).await.unwrap();
         }
-        writer.shutdown().await?;
-        let bytes_written = storage.get(&location).await?.bytes().await?;
+        writer.shutdown().await.unwrap();
+        let bytes_written = storage.get(&location).await.unwrap().bytes().await.unwrap();
         assert_eq!(bytes_expected, bytes_written);
 
         // We can abort an empty write
         let location = Path::from("test_dir/test_abort_upload.txt");
-        let (upload_id, writer) = storage.put_multipart(&location).await?;
+        let (upload_id, writer) = storage.put_multipart(&location).await.unwrap();
         drop(writer);
-        storage.abort_multipart(&location, &upload_id).await?;
+        storage
+            .abort_multipart(&location, &upload_id)
+            .await
+            .unwrap();
         let get_res = storage.get(&location).await;
         assert!(get_res.is_err());
         assert!(matches!(
             get_res.unwrap_err(),
             crate::Error::NotFound { .. }
         ));
 
         // We can abort an in-progress write
-        let (upload_id, mut writer) = storage.put_multipart(&location).await?;
+        let (upload_id, mut writer) = storage.put_multipart(&location).await.unwrap();
         if let Some(chunk) = data.get(0) {
-            writer.write_all(chunk).await?;
-            let _ = writer.write(chunk).await?;
+            writer.write_all(chunk).await.unwrap();
+            let _ = writer.write(chunk).await.unwrap();
         }
         drop(writer);
 
-        storage.abort_multipart(&location, &upload_id).await?;
+        storage
+            .abort_multipart(&location, &upload_id)
+            .await
+            .unwrap();
        let get_res = storage.get(&location).await;
        assert!(get_res.is_err());
        assert!(matches!(
            get_res.unwrap_err(),
            crate::Error::NotFound { .. }
        ));
-
-        Ok(())
     }
 
-    pub(crate) async fn list_uses_directories_correctly(
-        storage: &DynObjectStore,
-    ) -> Result<()> {
+    pub(crate) async fn list_uses_directories_correctly(storage: &DynObjectStore) {
         delete_fixtures(storage).await;
 
-        let content_list = flatten_list_stream(storage, None).await?;
+        let content_list = flatten_list_stream(storage, None).await.unwrap();
         assert!(
             content_list.is_empty(),
             "Expected list to be empty; found: {:?}",
@@ -616,25 +794,23 @@ mod tests {
         let location2 = Path::from("foo.bar/y.json");
 
         let data = Bytes::from("arbitrary data");
-        storage.put(&location1, data.clone()).await?;
-        storage.put(&location2, data).await?;
+        storage.put(&location1, data.clone()).await.unwrap();
+        storage.put(&location2, data).await.unwrap();
 
         let prefix = Path::from("foo");
-        let content_list = flatten_list_stream(storage, Some(&prefix)).await?;
+        let content_list = flatten_list_stream(storage, Some(&prefix)).await.unwrap();
         assert_eq!(content_list, &[location1.clone()]);
 
         let prefix = Path::from("foo/x");
-        let content_list = flatten_list_stream(storage, Some(&prefix)).await?;
+        let content_list = flatten_list_stream(storage, Some(&prefix)).await.unwrap();
         assert_eq!(content_list, &[]);
-
-        Ok(())
     }
 
-    pub(crate) async fn list_with_delimiter(storage: &DynObjectStore) -> Result<()> {
+    pub(crate) async fn list_with_delimiter(storage: &DynObjectStore) {
         delete_fixtures(storage).await;
 
         // ==================== check: store is empty ====================
-        let content_list = flatten_list_stream(storage, None).await?;
+        let content_list = flatten_list_stream(storage, None).await.unwrap();
         assert!(content_list.is_empty());
 
         // ==================== do: create files ====================
@@ -696,10 +872,8 @@ mod tests {
         }
 
         // ==================== check: store is empty ====================
-        let content_list = flatten_list_stream(storage, None).await?;
+        let content_list = flatten_list_stream(storage, None).await.unwrap();
         assert!(content_list.is_empty());
-
-        Ok(())
     }
 
     pub(crate) async fn get_nonexistent_object(
@@ -715,7 +889,7 @@
         storage.get(&location).await?.bytes().await
     }
 
-    pub(crate) async fn rename_and_copy(storage: &DynObjectStore) -> Result<()> {
+    pub(crate) async fn rename_and_copy(storage: &DynObjectStore) {
         // Create two objects
         let path1 = Path::from("test1");
         let path2 = Path::from("test2");
@@ -723,29 +897,27 @@ mod tests {
         let contents2 = Bytes::from("dogs");
 
         // copy() makes both objects identical
-        storage.put(&path1, contents1.clone()).await?;
-        storage.put(&path2, contents2.clone()).await?;
-        storage.copy(&path1, &path2).await?;
-        let new_contents = storage.get(&path2).await?.bytes().await?;
+        storage.put(&path1, contents1.clone()).await.unwrap();
+        storage.put(&path2, contents2.clone()).await.unwrap();
+        storage.copy(&path1, &path2).await.unwrap();
+        let new_contents = storage.get(&path2).await.unwrap().bytes().await.unwrap();
         assert_eq!(&new_contents, &contents1);
 
         // rename() copies contents and deletes original
-        storage.put(&path1, contents1.clone()).await?;
-        storage.put(&path2, contents2.clone()).await?;
-        storage.rename(&path1, &path2).await?;
-        let new_contents = storage.get(&path2).await?.bytes().await?;
+        storage.put(&path1, contents1.clone()).await.unwrap();
+        storage.put(&path2, contents2.clone()).await.unwrap();
+        storage.rename(&path1, &path2).await.unwrap();
+        let new_contents = storage.get(&path2).await.unwrap().bytes().await.unwrap();
         assert_eq!(&new_contents, &contents1);
 
         let result = storage.get(&path1).await;
         assert!(result.is_err());
assert!(matches!(result.unwrap_err(), crate::Error::NotFound { .. })); // Clean up - storage.delete(&path2).await?; - - Ok(()) + storage.delete(&path2).await.unwrap(); } - pub(crate) async fn copy_if_not_exists(storage: &DynObjectStore) -> Result<()> { + pub(crate) async fn copy_if_not_exists(storage: &DynObjectStore) { // Create two objects let path1 = Path::from("test1"); let path2 = Path::from("test2"); @@ -753,8 +925,8 @@ mod tests { let contents2 = Bytes::from("dogs"); // copy_if_not_exists() errors if destination already exists - storage.put(&path1, contents1.clone()).await?; - storage.put(&path2, contents2.clone()).await?; + storage.put(&path1, contents1.clone()).await.unwrap(); + storage.put(&path2, contents2.clone()).await.unwrap(); let result = storage.copy_if_not_exists(&path1, &path2).await; assert!(result.is_err()); assert!(matches!( @@ -763,19 +935,17 @@ mod tests { )); // copy_if_not_exists() copies contents and allows deleting original - storage.delete(&path2).await?; - storage.copy_if_not_exists(&path1, &path2).await?; - storage.delete(&path1).await?; - let new_contents = storage.get(&path2).await?.bytes().await?; + storage.delete(&path2).await.unwrap(); + storage.copy_if_not_exists(&path1, &path2).await.unwrap(); + storage.delete(&path1).await.unwrap(); + let new_contents = storage.get(&path2).await.unwrap().bytes().await.unwrap(); assert_eq!(&new_contents, &contents1); let result = storage.get(&path1).await; assert!(result.is_err()); assert!(matches!(result.unwrap_err(), crate::Error::NotFound { .. })); // Clean up - storage.delete(&path2).await?; - - Ok(()) + storage.delete(&path2).await.unwrap(); } async fn delete_fixtures(storage: &DynObjectStore) { diff --git a/object_store/src/limit.rs b/object_store/src/limit.rs new file mode 100644 index 000000000000..09c88aa2a4bc --- /dev/null +++ b/object_store/src/limit.rs @@ -0,0 +1,272 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! An object store that limits the maximum concurrency of the wrapped implementation + +use crate::{ + BoxStream, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Path, Result, + StreamExt, +}; +use async_trait::async_trait; +use bytes::Bytes; +use futures::Stream; +use std::io::{Error, IoSlice}; +use std::ops::Range; +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; +use tokio::io::AsyncWrite; +use tokio::sync::{OwnedSemaphorePermit, Semaphore}; + +/// Store wrapper that wraps an inner store and limits the maximum number of concurrent +/// object store operations. 
Each call to an [`ObjectStore`] member function is +/// counted as a single operation, even if it results in more than one network call. +/// +/// ``` +/// # use object_store::memory::InMemory; +/// # use object_store::limit::LimitStore; +/// +/// // Create an in-memory `ObjectStore` limited to 20 concurrent requests +/// let store = LimitStore::new(InMemory::new(), 20); +/// ``` +/// +#[derive(Debug)] +pub struct LimitStore<T: ObjectStore> { + inner: T, + max_requests: usize, + semaphore: Arc<Semaphore>, +} + +impl<T: ObjectStore> LimitStore<T> { + /// Create a new limit store that will limit the maximum + /// number of outstanding concurrent requests to + /// `max_requests` + pub fn new(inner: T, max_requests: usize) -> Self { + Self { + inner, + max_requests, + semaphore: Arc::new(Semaphore::new(max_requests)), + } + } +} + +impl<T: ObjectStore> std::fmt::Display for LimitStore<T> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "LimitStore({}, {})", self.max_requests, self.inner) + } +} + +#[async_trait] +impl<T: ObjectStore> ObjectStore for LimitStore<T> { + async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { + let _permit = self.semaphore.acquire().await.unwrap(); + self.inner.put(location, bytes).await + } + + async fn put_multipart( + &self, + location: &Path, + ) -> Result<(MultipartId, Box<dyn AsyncWrite + Unpin + Send>)> { + let permit = Arc::clone(&self.semaphore).acquire_owned().await.unwrap(); + let (id, write) = self.inner.put_multipart(location).await?; + Ok((id, Box::new(PermitWrapper::new(write, permit)))) + } + + async fn abort_multipart( + &self, + location: &Path, + multipart_id: &MultipartId, + ) -> Result<()> { + let _permit = self.semaphore.acquire().await.unwrap(); + self.inner.abort_multipart(location, multipart_id).await + } + + async fn get(&self, location: &Path) -> Result<GetResult> { + let permit = Arc::clone(&self.semaphore).acquire_owned().await.unwrap(); + match self.inner.get(location).await? { + r @ GetResult::File(_, _) => Ok(r), + GetResult::Stream(s) => { + Ok(GetResult::Stream(PermitWrapper::new(s, permit).boxed())) + } + } + } + + async fn get_range(&self, location: &Path, range: Range<usize>) -> Result<Bytes> { + let _permit = self.semaphore.acquire().await.unwrap(); + self.inner.get_range(location, range).await + } + + async fn get_ranges( + &self, + location: &Path, + ranges: &[Range<usize>], + ) -> Result<Vec<Bytes>> { + let _permit = self.semaphore.acquire().await.unwrap(); + self.inner.get_ranges(location, ranges).await + } + + async fn head(&self, location: &Path) -> Result<ObjectMeta> { + let _permit = self.semaphore.acquire().await.unwrap(); + self.inner.head(location).await + } + + async fn delete(&self, location: &Path) -> Result<()> { + let _permit = self.semaphore.acquire().await.unwrap(); + self.inner.delete(location).await + } + + async fn list( + &self, + prefix: Option<&Path>, + ) -> Result<BoxStream<'_, Result<ObjectMeta>>> { + let permit = Arc::clone(&self.semaphore).acquire_owned().await.unwrap(); + let s = self.inner.list(prefix).await?; + Ok(PermitWrapper::new(s, permit).boxed()) + } + + async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result<ListResult> { + let _permit = self.semaphore.acquire().await.unwrap(); + self.inner.list_with_delimiter(prefix).await + } + + async fn copy(&self, from: &Path, to: &Path) -> Result<()> { + let _permit = self.semaphore.acquire().await.unwrap(); + self.inner.copy(from, to).await + } + + async fn rename(&self, from: &Path, to: &Path) -> Result<()> { + let _permit = self.semaphore.acquire().await.unwrap(); + self.inner.rename(from, to).await + } + + async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { + let _permit = self.semaphore.acquire().await.unwrap(); + self.inner.copy_if_not_exists(from, to).await + } + + async fn rename_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { + let _permit = self.semaphore.acquire().await.unwrap(); + self.inner.rename_if_not_exists(from, to).await + } +} + +/// Combines an [`OwnedSemaphorePermit`] with some other type +struct PermitWrapper<T> { + inner: T, + #[allow(dead_code)] + permit: OwnedSemaphorePermit, +} + +impl<T> PermitWrapper<T> { + fn new(inner: T, permit: OwnedSemaphorePermit) -> Self { + Self { inner, permit } + } +} + +impl<T: Stream + Unpin> Stream for PermitWrapper<T> { + type Item = T::Item; + + fn poll_next( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll<Option<Self::Item>> { + Pin::new(&mut self.inner).poll_next(cx) + } + + fn size_hint(&self) -> (usize, Option<usize>) { + self.inner.size_hint() + } +} + +impl<T: AsyncWrite + Unpin> AsyncWrite for PermitWrapper<T> { + fn poll_write( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + buf: &[u8], + ) -> Poll<Result<usize, Error>> { + Pin::new(&mut self.inner).poll_write(cx, buf) + } + + fn poll_flush( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll<Result<(), Error>> { + Pin::new(&mut self.inner).poll_flush(cx) + } + + fn poll_shutdown( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll<Result<(), Error>> { + Pin::new(&mut self.inner).poll_shutdown(cx) + } + + fn poll_write_vectored( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + bufs: &[IoSlice<'_>], + ) -> Poll<Result<usize, Error>> { + Pin::new(&mut self.inner).poll_write_vectored(cx, bufs) + } + + fn is_write_vectored(&self) -> bool { + self.inner.is_write_vectored() + } +} + +#[cfg(test)] +mod tests { + use crate::limit::LimitStore; + use crate::memory::InMemory; + use crate::tests::{ + list_uses_directories_correctly, list_with_delimiter, put_get_delete_list, + rename_and_copy, stream_get, + }; + use crate::ObjectStore; + use std::time::Duration; + use tokio::time::timeout; + + #[tokio::test] + async fn
limit_test() { + let max_requests = 10; + let memory = InMemory::new(); + let integration = LimitStore::new(memory, max_requests); + + put_get_delete_list(&integration).await; + list_uses_directories_correctly(&integration).await; + list_with_delimiter(&integration).await; + rename_and_copy(&integration).await; + stream_get(&integration).await; + + let mut streams = Vec::with_capacity(max_requests); + for _ in 0..max_requests { + let stream = integration.list(None).await.unwrap(); + streams.push(stream); + } + + let t = Duration::from_millis(20); + + // Expect to not be able to make another request + assert!(timeout(t, integration.list(None)).await.is_err()); + + // Drop one of the streams + streams.pop(); + + // Can now make another request + integration.list(None).await.unwrap(); + } +} diff --git a/object_store/src/local.rs b/object_store/src/local.rs index e2f133e84d7f..fd3c3592ab56 100644 --- a/object_store/src/local.rs +++ b/object_store/src/local.rs @@ -18,7 +18,7 @@ //! An object store implementation for a local filesystem use crate::{ maybe_spawn_blocking, - path::{filesystem_path_to_url, Path}, + path::{absolute_path_to_url, Path}, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Result, }; use async_trait::async_trait; @@ -68,56 +68,56 @@ pub(crate) enum Error { #[snafu(display("Unable to create dir {}: {}", path.display(), source))] UnableToCreateDir { source: io::Error, - path: std::path::PathBuf, + path: PathBuf, }, #[snafu(display("Unable to create file {}: {}", path.display(), err))] UnableToCreateFile { - path: std::path::PathBuf, + path: PathBuf, err: io::Error, }, #[snafu(display("Unable to delete file {}: {}", path.display(), source))] UnableToDeleteFile { source: io::Error, - path: std::path::PathBuf, + path: PathBuf, }, #[snafu(display("Unable to open file {}: {}", path.display(), source))] UnableToOpenFile { source: io::Error, - path: std::path::PathBuf, + path: PathBuf, }, #[snafu(display("Unable to read data from file {}: {}", path.display(), source))] UnableToReadBytes { source: io::Error, - path: std::path::PathBuf, + path: PathBuf, }, #[snafu(display("Out of range of file {}, expected: {}, actual: {}", path.display(), expected, actual))] OutOfRange { - path: std::path::PathBuf, + path: PathBuf, expected: usize, actual: usize, }, #[snafu(display("Unable to copy file from {} to {}: {}", from.display(), to.display(), source))] UnableToCopyFile { - from: std::path::PathBuf, - to: std::path::PathBuf, + from: PathBuf, + to: PathBuf, source: io::Error, }, NotFound { - path: std::path::PathBuf, + path: PathBuf, source: io::Error, }, #[snafu(display("Error seeking file {}: {}", path.display(), source))] Seek { source: io::Error, - path: std::path::PathBuf, + path: PathBuf, }, #[snafu(display("Unable to convert URL \"{}\" to filesystem path", url))] @@ -129,6 +129,12 @@ pub(crate) enum Error { path: String, source: io::Error, }, + + #[snafu(display("Unable to canonicalize filesystem root: {}", path.display()))] + UnableToCanonicalize { + path: PathBuf, + source: io::Error, + }, } impl From<Error> for super::Error { @@ -170,6 +176,17 @@ /// /// If not called from a tokio context, this will perform IO on the current thread with /// no additional complexity or overheads +/// +/// # Symlinks +/// +/// [`LocalFileSystem`] will follow symlinks as normal; however, it is worth noting: +/// +/// * Broken symlinks will be silently ignored by listing operations +/// * No effort is made to prevent breaking symlinks when deleting files +/// * Symlinks
that resolve to paths outside the root **will** be followed +/// * Mutating a file through one or more symlinks will mutate the underlying file +/// * Deleting a path that resolves to a symlink will only delete the symlink +/// #[derive(Debug)] pub struct LocalFileSystem { config: Arc<Config>, } @@ -203,29 +220,40 @@ impl LocalFileSystem { } /// Create new filesystem storage with `prefix` applied to all paths + /// + /// Returns an error if the path does not exist + /// pub fn new_with_prefix(prefix: impl AsRef<std::path::Path>) -> Result<Self> { + let path = std::fs::canonicalize(&prefix).context(UnableToCanonicalizeSnafu { + path: prefix.as_ref(), + })?; + Ok(Self { config: Arc::new(Config { - root: filesystem_path_to_url(prefix)?, + root: absolute_path_to_url(path)?, }), }) } } impl Config { - /// Return filesystem path of the given location - fn path_to_filesystem(&self, location: &Path) -> Result<std::path::PathBuf> { + /// Return an absolute filesystem path of the given location + fn path_to_filesystem(&self, location: &Path) -> Result<std::path::PathBuf> { let mut url = self.root.clone(); url.path_segments_mut() .expect("url path") + // technically not necessary as Path ignores empty segments + // but avoids creating paths with "//" which look odd in error messages. + .pop_if_empty() .extend(location.parts()); url.to_file_path() .map_err(|_| Error::InvalidUrl { url }.into()) } + /// Resolves the provided absolute filesystem path to a [`Path`] prefix fn filesystem_to_path(&self, location: &std::path::Path) -> Result<Path> { - Ok(Path::from_filesystem_path_with_base( + Ok(Path::from_absolute_path_with_base( location, Some(&self.root), )?) @@ -308,26 +336,25 @@ impl ObjectStore for LocalFileSystem { let path = self.config.path_to_filesystem(location)?; maybe_spawn_blocking(move || { let mut file = open_file(&path)?; - let to_read = range.end - range.start; - file.seek(SeekFrom::Start(range.start as u64)) - .context(SeekSnafu { path: &path })?; - - let mut buf = Vec::with_capacity(to_read); - let read = file - .take(to_read as u64) - .read_to_end(&mut buf) - .context(UnableToReadBytesSnafu { path: &path })?; - - ensure!( - read == to_read, - OutOfRangeSnafu { - path: &path, - expected: to_read, - actual: read - } - ); + read_range(&mut file, &path, range) + }) + .await + } - Ok(buf.into()) + async fn get_ranges( + &self, + location: &Path, + ranges: &[Range<usize>], + ) -> Result<Vec<Bytes>> { + let path = self.config.path_to_filesystem(location)?; + let ranges = ranges.to_vec(); + maybe_spawn_blocking(move || { + // Vectored IO might be faster + let mut file = open_file(&path)?; + ranges + .into_iter() + .map(|r| read_range(&mut file, &path, r)) + .collect() }) .await } @@ -371,7 +398,8 @@ impl ObjectStore for LocalFileSystem { let walkdir = WalkDir::new(&root_path) // Don't include the root directory itself - .min_depth(1); + .min_depth(1) + .follow_links(true); let s = walkdir.into_iter().flat_map(move |result_dir_entry| { match convert_walkdir_result(result_dir_entry) { @@ -433,7 +461,10 @@ impl ObjectStore for LocalFileSystem { let resolved_prefix = config.path_to_filesystem(&prefix)?; maybe_spawn_blocking(move || { - let walkdir = WalkDir::new(&resolved_prefix).min_depth(1).max_depth(1); + let walkdir = WalkDir::new(&resolved_prefix) + .min_depth(1) + .max_depth(1) + .follow_links(true); let mut common_prefixes = BTreeSet::new(); let mut objects = Vec::new(); @@ -732,7 +763,29 @@ impl AsyncWrite for LocalUpload { } } -fn open_file(path: &std::path::PathBuf) -> Result<File> { +fn read_range(file: &mut File, path: &PathBuf, range: Range<usize>) -> Result<Bytes> { + let to_read = range.end -
range.start; + file.seek(SeekFrom::Start(range.start as u64)) + .context(SeekSnafu { path })?; + + let mut buf = Vec::with_capacity(to_read); + let read = file + .take(to_read as u64) + .read_to_end(&mut buf) + .context(UnableToReadBytesSnafu { path })?; + + ensure!( + read == to_read, + OutOfRangeSnafu { + path, + expected: to_read, + actual: read + } + ); + Ok(buf.into()) +} + +fn open_file(path: &PathBuf) -> Result<File> { let file = File::open(path).map_err(|e| { if e.kind() == std::io::ErrorKind::NotFound { Error::NotFound { @@ -749,7 +802,7 @@ fn open_file(path: &std::path::PathBuf) -> Result<File> { Ok(file) } -fn open_writable_file(path: &std::path::PathBuf) -> Result<File> { +fn open_writable_file(path: &PathBuf) -> Result<File> { match File::create(&path) { Ok(f) => Ok(f), Err(err) if err.kind() == std::io::ErrorKind::NotFound => { @@ -861,7 +914,8 @@ mod tests { }, Error as ObjectStoreError, ObjectStore, }; - use tempfile::TempDir; + use futures::TryStreamExt; + use tempfile::{NamedTempFile, TempDir}; use tokio::io::AsyncWriteExt; #[tokio::test] @@ -869,12 +923,12 @@ let root = TempDir::new().unwrap(); let integration = LocalFileSystem::new_with_prefix(root.path()).unwrap(); - put_get_delete_list(&integration).await.unwrap(); - list_uses_directories_correctly(&integration).await.unwrap(); - list_with_delimiter(&integration).await.unwrap(); - rename_and_copy(&integration).await.unwrap(); - copy_if_not_exists(&integration).await.unwrap(); - stream_get(&integration).await.unwrap(); + put_get_delete_list(&integration).await; + list_uses_directories_correctly(&integration).await; + list_with_delimiter(&integration).await; + rename_and_copy(&integration).await; + copy_if_not_exists(&integration).await; + stream_get(&integration).await; } #[test] @@ -882,10 +936,10 @@ let root = TempDir::new().unwrap(); let integration = LocalFileSystem::new_with_prefix(root.path()).unwrap(); futures::executor::block_on(async move { - put_get_delete_list(&integration).await.unwrap(); - list_uses_directories_correctly(&integration).await.unwrap(); - list_with_delimiter(&integration).await.unwrap(); - stream_get(&integration).await.unwrap(); + put_get_delete_list(&integration).await; + list_uses_directories_correctly(&integration).await; + list_with_delimiter(&integration).await; + stream_get(&integration).await; }); } @@ -1030,6 +1084,124 @@ } } + async fn check_list( + integration: &LocalFileSystem, + prefix: Option<&Path>, + expected: &[&str], + ) { + let result: Vec<_> = integration + .list(prefix) + .await + .unwrap() + .try_collect() + .await + .unwrap(); + + let mut strings: Vec<_> = result.iter().map(|x| x.location.as_ref()).collect(); + strings.sort_unstable(); + assert_eq!(&strings, expected) + } + + #[tokio::test] + #[cfg(target_family = "unix")] + async fn test_symlink() { + let root = TempDir::new().unwrap(); + let integration = LocalFileSystem::new_with_prefix(root.path()).unwrap(); + + let subdir = root.path().join("a"); + std::fs::create_dir(&subdir).unwrap(); + let file = subdir.join("file.parquet"); + std::fs::write(file, "test").unwrap(); + + check_list(&integration, None, &["a/file.parquet"]).await; + integration + .head(&Path::from("a/file.parquet")) + .await + .unwrap(); + + // Follow out of tree symlink + let other = NamedTempFile::new().unwrap(); + std::os::unix::fs::symlink(other.path(), root.path().join("test.parquet")) + .unwrap(); + + // Should return test.parquet even though out of tree + check_list(&integration, None, &["a/file.parquet", "test.parquet"]).await; +
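The `get_ranges` implementation added above reads every requested range through a single opened file handle rather than reopening the file per range. A usage sketch, assuming a store rooted at a pre-populated directory (the directory and file names are hypothetical):

```rust
use object_store::{local::LocalFileSystem, path::Path, ObjectStore};

#[tokio::main]
async fn main() -> object_store::Result<()> {
    // Hypothetical setup: a 1 KiB file under a temporary root
    let root = tempfile::tempdir().unwrap();
    std::fs::write(root.path().join("data.bin"), vec![0u8; 1024]).unwrap();

    let store = LocalFileSystem::new_with_prefix(root.path())?;

    // One call fetches several byte ranges; each is seeked and read in turn
    let ranges = [0..16, 16..32, 512..1024];
    let chunks = store.get_ranges(&Path::from("data.bin"), &ranges).await?;
    assert_eq!(chunks.len(), 3);
    assert_eq!(chunks[2].len(), 512);
    Ok(())
}
```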
+ // Can fetch test.parquet + integration.head(&Path::from("test.parquet")).await.unwrap(); + + // Follow in tree symlink + std::os::unix::fs::symlink(&subdir, root.path().join("b")).unwrap(); + check_list( + &integration, + None, + &["a/file.parquet", "b/file.parquet", "test.parquet"], + ) + .await; + check_list(&integration, Some(&Path::from("b")), &["b/file.parquet"]).await; + + // Can fetch through symlink + integration + .head(&Path::from("b/file.parquet")) + .await + .unwrap(); + + // Ignore broken symlink + std::os::unix::fs::symlink( + root.path().join("foo.parquet"), + root.path().join("c"), + ) + .unwrap(); + + check_list( + &integration, + None, + &["a/file.parquet", "b/file.parquet", "test.parquet"], + ) + .await; + + let mut r = integration.list_with_delimiter(None).await.unwrap(); + r.common_prefixes.sort_unstable(); + assert_eq!(r.common_prefixes.len(), 2); + assert_eq!(r.common_prefixes[0].as_ref(), "a"); + assert_eq!(r.common_prefixes[1].as_ref(), "b"); + assert_eq!(r.objects.len(), 1); + assert_eq!(r.objects[0].location.as_ref(), "test.parquet"); + + let r = integration + .list_with_delimiter(Some(&Path::from("a"))) + .await + .unwrap(); + assert_eq!(r.common_prefixes.len(), 0); + assert_eq!(r.objects.len(), 1); + assert_eq!(r.objects[0].location.as_ref(), "a/file.parquet"); + + // Deleting a symlink doesn't delete the source file + integration + .delete(&Path::from("test.parquet")) + .await + .unwrap(); + assert!(other.path().exists()); + + check_list(&integration, None, &["a/file.parquet", "b/file.parquet"]).await; + + // Deleting through a symlink deletes both files + integration + .delete(&Path::from("b/file.parquet")) + .await + .unwrap(); + + check_list(&integration, None, &[]).await; + + // Adding a file through a symlink creates in both paths + integration + .put(&Path::from("b/file.parquet"), Bytes::from(vec![0, 1, 2])) + .await + .unwrap(); + + check_list(&integration, None, &["a/file.parquet", "b/file.parquet"]).await; + } + #[tokio::test] async fn invalid_path() { let root = TempDir::new().unwrap(); @@ -1075,7 +1247,7 @@ mod tests { .to_string(); assert!( - err.contains("Invalid path segment - got \"💀\" expected: \"%F0%9F%92%80\""), + err.contains("Encountered illegal character sequence \"💀\" whilst parsing path segment \"💀\""), "{}", err ); @@ -1110,4 +1282,33 @@ mod tests { 0 ); } + + #[tokio::test] + async fn filesystem_filename_with_percent() { + let temp_dir = TempDir::new().unwrap(); + let integration = LocalFileSystem::new_with_prefix(temp_dir.path()).unwrap(); + let filename = "L%3ABC.parquet"; + + std::fs::write(temp_dir.path().join(filename), "foo").unwrap(); + + let list_stream = integration.list(None).await.unwrap(); + let res: Vec<_> = list_stream.try_collect().await.unwrap(); + assert_eq!(res.len(), 1); + assert_eq!(res[0].location.as_ref(), filename); + + let res = integration.list_with_delimiter(None).await.unwrap(); + assert_eq!(res.objects.len(), 1); + assert_eq!(res.objects[0].location.as_ref(), filename); + } + + #[tokio::test] + async fn relative_paths() { + LocalFileSystem::new_with_prefix(".").unwrap(); + LocalFileSystem::new_with_prefix("..").unwrap(); + LocalFileSystem::new_with_prefix("../..").unwrap(); + + let integration = LocalFileSystem::new(); + let path = Path::from_filesystem_path(".").unwrap(); + integration.list_with_delimiter(Some(&path)).await.unwrap(); + } } diff --git a/object_store/src/memory.rs b/object_store/src/memory.rs index dc3967d9915f..e4be5b2afddf 100644 --- a/object_store/src/memory.rs +++ 
b/object_store/src/memory.rs @@ -128,6 +128,22 @@ impl ObjectStore for InMemory { Ok(data.slice(range)) } + async fn get_ranges( + &self, + location: &Path, + ranges: &[Range<usize>], + ) -> Result<Vec<Bytes>> { + let data = self.get_bytes(location).await?; + ranges + .iter() + .map(|range| { + ensure!(range.end <= data.len(), OutOfRangeSnafu); + ensure!(range.start <= range.end, BadRangeSnafu); + Ok(data.slice(range.clone())) + }) + .collect() + } + async fn head(&self, location: &Path) -> Result<ObjectMeta> { let last_modified = Utc::now(); let bytes = self.get_bytes(location).await?; @@ -305,12 +321,12 @@ mod tests { async fn in_memory_test() { let integration = InMemory::new(); - put_get_delete_list(&integration).await.unwrap(); - list_uses_directories_correctly(&integration).await.unwrap(); - list_with_delimiter(&integration).await.unwrap(); - rename_and_copy(&integration).await.unwrap(); - copy_if_not_exists(&integration).await.unwrap(); - stream_get(&integration).await.unwrap(); + put_get_delete_list(&integration).await; + list_uses_directories_correctly(&integration).await; + list_with_delimiter(&integration).await; + rename_and_copy(&integration).await; + copy_if_not_exists(&integration).await; + stream_get(&integration).await; } #[tokio::test] diff --git a/object_store/src/path/mod.rs b/object_store/src/path/mod.rs index 23488ef660c5..e5a7b6443bb1 100644 --- a/object_store/src/path/mod.rs +++ b/object_store/src/path/mod.rs @@ -126,7 +126,6 @@ pub enum Error { /// Path::parse("..").unwrap_err(); /// Path::parse("/foo//").unwrap_err(); /// Path::parse("😀").unwrap_err(); -/// Path::parse("%Q").unwrap_err(); /// ``` /// /// [RFC 1738]: https://www.ietf.org/rfc/rfc1738.txt @@ -163,24 +162,38 @@ impl Path { /// Convert a filesystem path to a [`Path`] relative to the filesystem root /// - /// This will return an error if the path does not exist, or contains illegal - /// character sequences as defined by [`Path::parse`] + /// This will return an error if the path contains illegal character sequences + /// as defined by [`Path::parse`] or does not exist + /// + /// Note: this will canonicalize the provided path, resolving any symlinks pub fn from_filesystem_path( path: impl AsRef<std::path::Path>, ) -> Result<Self, Error> { - Self::from_filesystem_path_with_base(path, None) + let absolute = std::fs::canonicalize(&path).context(CanonicalizeSnafu { + path: path.as_ref(), + })?; + + Self::from_absolute_path(absolute) + } + + /// Convert an absolute filesystem path to a [`Path`] relative to the filesystem root + /// + /// This will return an error if the path contains illegal character sequences + /// as defined by [`Path::parse`], or `path` is not an absolute path + pub fn from_absolute_path(path: impl AsRef<std::path::Path>) -> Result<Self, Error> { + Self::from_absolute_path_with_base(path, None) } /// Convert a filesystem path to a [`Path`] relative to the provided base /// - /// This will return an error if the path does not exist on the local filesystem, - /// contains illegal character sequences as defined by [`Path::parse`], or `base` - /// does not refer to a parent path of `path` - pub(crate) fn from_filesystem_path_with_base( + /// This will return an error if the path contains illegal character sequences + /// as defined by [`Path::parse`], or `base` does not refer to a parent path of `path`, + /// or `base` is not an absolute path + pub(crate) fn from_absolute_path_with_base( path: impl AsRef<std::path::Path>, base: Option<&Url>, ) -> Result<Self, Error> { - let url = filesystem_path_to_url(path)?; + let url = absolute_path_to_url(path)?; let path = match base { Some(prefix) => url.path().strip_prefix(prefix.path()).ok_or_else(|| { Error::PrefixMismatch { @@ -295,20 +308,13 @@ where } } -/// Given a filesystem path, convert it to its canonical URL representation, -/// returning an error if the file doesn't exist on the local filesystem -pub(crate) fn filesystem_path_to_url( - path: impl AsRef<std::path::Path>, -) -> Result<Url, Error> { - let path = path.as_ref().canonicalize().context(CanonicalizeSnafu { - path: path.as_ref(), - })?; - - match path.is_dir() { - true => Url::from_directory_path(&path), - false => Url::from_file_path(&path), - } - .map_err(|_| Error::InvalidPath { path }) +/// Given an absolute filesystem path, convert it to a URL representation without canonicalization +pub(crate) fn absolute_path_to_url( + path: impl AsRef<std::path::Path>, +) -> Result<Url, Error> { + Url::from_file_path(&path).map_err(|_| Error::InvalidPath { + path: path.as_ref().into(), + }) } #[cfg(test)] diff --git a/object_store/src/path/parts.rs b/object_store/src/path/parts.rs index e73b184fc15c..9da4815712db 100644 --- a/object_store/src/path/parts.rs +++ b/object_store/src/path/parts.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use percent_encoding::{percent_decode, percent_encode, AsciiSet, CONTROLS}; +use percent_encoding::{percent_encode, AsciiSet, CONTROLS}; use std::borrow::Cow; use crate::path::DELIMITER_BYTE; @@ -23,11 +23,15 @@ use snafu::Snafu; /// Error returned by [`PathPart::parse`] #[derive(Debug, Snafu)] -#[snafu(display("Invalid path segment - got \"{}\" expected: \"{}\"", actual, expected))] +#[snafu(display( + "Encountered illegal character sequence \"{}\" whilst parsing path segment \"{}\"", + illegal, + segment +))] #[allow(missing_copy_implementations)] pub struct InvalidPart { - actual: String, - expected: String, + segment: String, + illegal: String, } /// The PathPart type exists to validate the directory/file names that form part @@ -43,21 +47,40 @@ pub struct PathPart<'a> { impl<'a> PathPart<'a> { /// Parse the provided path segment as a [`PathPart`] returning an error if invalid pub fn parse(segment: &'a str) -> Result<Self, InvalidPart> { - let decoded: Cow<'a, [u8]> = percent_decode(segment.as_bytes()).into(); - let part = PathPart::from(decoded.as_ref()); - if segment != part.as_ref() { + if segment == "." || segment == ".." { return Err(InvalidPart { - actual: segment.to_string(), - expected: part.raw.to_string(), + segment: segment.to_string(), + illegal: segment.to_string(), }); } + for (idx, b) in segment.as_bytes().iter().cloned().enumerate() { + // A percent character is always valid, even if not + // followed by a valid 2-digit hex code + // https://url.spec.whatwg.org/#percent-encoded-bytes + if b == b'%' { + continue; + } + + if !b.is_ascii() || should_percent_encode(b) { + return Err(InvalidPart { + segment: segment.to_string(), + // This is correct as only single-byte characters can occur up to this point + illegal: segment.chars().nth(idx).unwrap().to_string(), + }); + } + } + Ok(Self { raw: segment.into(), }) } } +fn should_percent_encode(c: u8) -> bool { + percent_encode(&[c], INVALID).next().unwrap().len() != 1 +} + /// Characters we want to encode.
const INVALID: &AsciiSet = &CONTROLS // The delimiter we are reserving for internal hierarchy @@ -145,4 +168,18 @@ mod tests { let part: PathPart<'_> = "..".into(); assert_eq!(part.raw, "%2E%2E"); } + + #[test] + fn path_part_parse() { + PathPart::parse("foo").unwrap(); + PathPart::parse("foo/bar").unwrap_err(); + + // Test percent-encoded path + PathPart::parse("foo%2Fbar").unwrap(); + PathPart::parse("L%3ABC.parquet").unwrap(); + + // Test path containing bad escape sequence + PathPart::parse("%Z").unwrap(); + PathPart::parse("%%").unwrap(); + } } diff --git a/object_store/src/throttle.rs b/object_store/src/throttle.rs index 6789f0e68df9..90f427cc2651 100644 --- a/object_store/src/throttle.rs +++ b/object_store/src/throttle.rs @@ -197,7 +197,7 @@ impl<T: ObjectStore> ObjectStore for ThrottledStore<T> { async fn get_range(&self, location: &Path, range: Range<usize>) -> Result<Bytes> { let config = self.config(); - let sleep_duration = config.wait_delete_per_call + let sleep_duration = config.wait_get_per_call + config.wait_get_per_byte * (range.end - range.start) as u32; sleep(sleep_duration).await; @@ -205,6 +205,22 @@ self.inner.get_range(location, range).await } + async fn get_ranges( + &self, + location: &Path, + ranges: &[Range<usize>], + ) -> Result<Vec<Bytes>> { + let config = self.config(); + + let total_bytes: usize = ranges.iter().map(|range| range.end - range.start).sum(); + let sleep_duration = + config.wait_get_per_call + config.wait_get_per_byte * total_bytes as u32; + + sleep(sleep_duration).await; + + self.inner.get_ranges(location, ranges).await + } + async fn head(&self, location: &Path) -> Result<ObjectMeta> { sleep(self.config().wait_put_per_call).await; self.inner.head(location).await @@ -260,11 +276,23 @@ impl<T: ObjectStore> ObjectStore for ThrottledStore<T> { self.inner.copy(from, to).await } + async fn rename(&self, from: &Path, to: &Path) -> Result<()> { + sleep(self.config().wait_put_per_call).await; + + self.inner.rename(from, to).await + } + async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { sleep(self.config().wait_put_per_call).await; self.inner.copy_if_not_exists(from, to).await + } + + async fn rename_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { + sleep(self.config().wait_put_per_call).await; + + self.inner.rename_if_not_exists(from, to).await + } } /// Saturated `usize` to `u32` cast. @@ -308,11 +336,11 @@ mod tests { let inner = InMemory::new(); let store = ThrottledStore::new(inner, ThrottleConfig::default()); - put_get_delete_list(&store).await.unwrap(); - list_uses_directories_correctly(&store).await.unwrap(); - list_with_delimiter(&store).await.unwrap(); - rename_and_copy(&store).await.unwrap(); - copy_if_not_exists(&store).await.unwrap(); + put_get_delete_list(&store).await; + list_uses_directories_correctly(&store).await; + list_with_delimiter(&store).await; + rename_and_copy(&store).await; + copy_if_not_exists(&store).await; } #[tokio::test] diff --git a/object_store/src/util.rs b/object_store/src/util.rs index 4f3ed86fdc69..46e9e9ed8771 100644 --- a/object_store/src/util.rs +++ b/object_store/src/util.rs @@ -71,3 +71,98 @@ where Err(_) => f(), } } + +/// Range requests with a gap less than or equal to this +/// will be coalesced into a single request by [`coalesce_ranges`] +pub const OBJECT_STORE_COALESCE_DEFAULT: usize = 1024 * 1024; + +/// Takes a function to fetch ranges and coalesces adjacent ranges if they are +/// less than `coalesce` bytes apart.
Out of order `ranges` are not coalesced. +pub async fn coalesce_ranges<F, Fut>( + ranges: &[std::ops::Range<usize>], + mut fetch: F, + coalesce: usize, +) -> Result<Vec<Bytes>> +where + F: Send + FnMut(std::ops::Range<usize>) -> Fut, + Fut: std::future::Future<Output = Result<Bytes>> + Send, +{ + let mut ret = Vec::with_capacity(ranges.len()); + let mut start_idx = 0; + let mut end_idx = 1; + + while start_idx != ranges.len() { + while end_idx != ranges.len() + && ranges[end_idx] + .start + .checked_sub(ranges[start_idx].end) + .map(|delta| delta <= coalesce) + .unwrap_or(false) + { + end_idx += 1; + } + + let start = ranges[start_idx].start; + let end = ranges[end_idx - 1].end; + let bytes = fetch(start..end).await?; + for range in ranges.iter().take(end_idx).skip(start_idx) { + ret.push(bytes.slice(range.start - start..range.end - start)) + } + start_idx = end_idx; + end_idx += 1; + } + Ok(ret) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::ops::Range; + + #[tokio::test] + async fn test_coalesce_ranges() { + let do_fetch = |ranges: Vec<Range<usize>>, coalesce: usize| async move { + let max = ranges.iter().map(|x| x.end).max().unwrap_or(0); + let src: Vec<_> = (0..max).map(|x| x as u8).collect(); + + let mut fetches = vec![]; + let coalesced = coalesce_ranges( + &ranges, + |range| { + fetches.push(range.clone()); + futures::future::ready(Ok(Bytes::from(src[range].to_vec()))) + }, + coalesce, + ) + .await + .unwrap(); + + assert_eq!(ranges.len(), coalesced.len()); + for (range, bytes) in ranges.iter().zip(coalesced) { + assert_eq!(bytes.as_ref(), &src[range.clone()]); + } + fetches + }; + + let fetches = do_fetch(vec![], 0).await; + assert_eq!(fetches, vec![]); + + let fetches = do_fetch(vec![0..3], 0).await; + assert_eq!(fetches, vec![0..3]); + + let fetches = do_fetch(vec![0..2, 3..5], 0).await; + assert_eq!(fetches, vec![0..2, 3..5]); + + let fetches = do_fetch(vec![0..1, 1..2], 0).await; + assert_eq!(fetches, vec![0..2]); + + let fetches = do_fetch(vec![0..1, 2..72], 1).await; + assert_eq!(fetches, vec![0..72]); + + let fetches = do_fetch(vec![0..1, 56..72, 73..75], 1).await; + assert_eq!(fetches, vec![0..1, 56..75]); + + let fetches = do_fetch(vec![0..1, 5..6, 7..9, 2..3, 4..6], 1).await; + assert_eq!(fetches, vec![0..1, 5..9, 2..6]); + } +} diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml index a0e164150aa8..5a8e4c485328 100644 --- a/parquet/Cargo.toml +++ b/parquet/Cargo.toml @@ -17,7 +17,7 @@ [package] name = "parquet" -version = "19.0.0" +version = "20.0.0" license = "Apache-2.0" description = "Apache Parquet implementation in Rust" homepage = "https://github.com/apache/arrow-rs" @@ -30,7 +30,7 @@ edition = "2021" rust-version = "1.62" [dependencies] -ahash = "0.7" +ahash = "0.8" parquet-format = { version = "4.0.0", default-features = false } bytes = { version = "1.1", default-features = false, features = ["std"] } byteorder = { version = "1", default-features = false } @@ -43,10 +43,11 @@ zstd = { version = "0.11.1", optional = true, default-features = false } chrono = { version = "0.4", default-features = false, features = ["alloc"] } num = { version = "0.4", default-features = false } num-bigint = { version = "0.4", default-features = false } -arrow = { path = "../arrow", version = "19.0.0", optional = true, default-features = false, features = ["ipc"] } +arrow = { path = "../arrow", version = "20.0.0", optional = true, default-features = false, features = ["ipc"] } base64 = { version = "0.13", default-features = false, features = ["std"], optional = true } clap = { version = "3", default-features = false, features =
["std", "derive", "env"], optional = true } serde_json = { version = "1.0", default-features = false, features = ["std"], optional = true } +seq-macro = { version = "0.3", default-features = false } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] } futures = { version = "0.3", default-features = false, features = ["std"], optional = true } tokio = { version = "1.0", optional = true, default-features = false, features = ["macros", "fs", "rt", "io-util"] } @@ -62,7 +63,7 @@ flate2 = { version = "1.0", default-features = false, features = ["rust_backend" lz4 = { version = "1.23", default-features = false } zstd = { version = "0.11", default-features = false } serde_json = { version = "1.0", features = ["std"], default-features = false } -arrow = { path = "../arrow", version = "19.0.0", default-features = false, features = ["ipc", "test_utils", "prettyprint"] } +arrow = { path = "../arrow", version = "20.0.0", default-features = false, features = ["ipc", "test_utils", "prettyprint"] } [package.metadata.docs.rs] all-features = true @@ -72,7 +73,9 @@ default = ["arrow", "snap", "brotli", "flate2", "lz4", "zstd", "base64"] # Enable arrow reader/writer APIs arrow = ["dep:arrow", "base64"] # Enable CLI tools -cli = ["serde_json", "base64", "clap", "arrow/csv"] +cli = ["json", "base64", "clap", "arrow/csv"] +# Enable JSON APIs +json = ["serde_json"] # Enable internal testing APIs test_common = ["arrow/test_utils"] # Experimental, unstable functionality primarily used for testing diff --git a/parquet/README.md b/parquet/README.md index fbb6e3e1b5d5..689a664b6326 100644 --- a/parquet/README.md +++ b/parquet/README.md @@ -19,17 +19,38 @@ # Apache Parquet Official Native Rust Implementation -[![Crates.io](https://img.shields.io/crates/v/parquet.svg)](https://crates.io/crates/parquet) +[![crates.io](https://img.shields.io/crates/v/parquet.svg)](https://crates.io/crates/parquet) +[![docs.rs](https://img.shields.io/docsrs/parquet.svg)](https://docs.rs/parquet/latest/parquet/) This crate contains the official Native Rust implementation of [Apache Parquet](https://parquet.apache.org/), which is part of the [Apache Arrow](https://arrow.apache.org/) project. See [crate documentation](https://docs.rs/parquet/latest/parquet/) for examples and the full API. -## Rust Version Compatbility +## Rust Version Compatibility This crate is tested with the latest stable version of Rust. We do not currently test against other, older versions of the Rust compiler. -## Features +## Versioning / Releases + +The parquet crate follows the [SemVer standard](https://doc.rust-lang.org/cargo/reference/semver.html) defined by Cargo and works well within the Rust crate ecosystem. + +However, for historical reasons, this crate uses versions with major numbers greater than `0.x` (e.g. `19.0.0`), unlike many other crates in the Rust ecosystem which spend extended time releasing versions `0.x` to signal planned ongoing API changes. Minor parquet releases contain only compatible changes, while major releases may contain breaking API changes.
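One concrete example of such a breaking change appears later in this same diff: arrow `20.0.0` renames `DataType::Decimal` to `DataType::Decimal128`. A sketch of the downstream impact, using a hypothetical helper:

```rust
use arrow::datatypes::DataType;

// Before arrow 20.0.0 this variant was spelled `DataType::Decimal`; any
// downstream match like this must be updated when taking the major bump
fn is_decimal(data_type: &DataType) -> bool {
    matches!(data_type, DataType::Decimal128(_, _))
}
```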
+ +## Feature Flags + +The `parquet` crate provides the following features which may be enabled in your `Cargo.toml`: + +- `arrow` (default) - support for reading / writing [`arrow`](https://crates.io/crates/arrow) arrays to / from parquet +- `async` - support `async` APIs for reading parquet +- `json` - support for reading / writing `json` data to / from parquet +- `brotli` (default) - support for parquet using `brotli` compression +- `flate2` (default) - support for parquet using `gzip` compression +- `lz4` (default) - support for parquet using `lz4` compression +- `zstd` (default) - support for parquet using `zstd` compression +- `cli` - parquet [CLI tools](https://github.com/apache/arrow-rs/tree/master/parquet/src/bin) +- `experimental` - Experimental APIs which may change, even between minor releases + +## Parquet Feature Status - [x] All encodings supported - [x] All compression codecs supported diff --git a/parquet/benches/arrow_reader.rs b/parquet/benches/arrow_reader.rs index dc2ed8355659..a3c904505c25 100644 --- a/parquet/benches/arrow_reader.rs +++ b/parquet/benches/arrow_reader.rs @@ -300,6 +300,26 @@ fn bench_array_reader(mut array_reader: Box) -> usize { total_count } +fn bench_array_reader_skip(mut array_reader: Box) -> usize { + // test procedure: read data in batches of 8192 until no more data + let mut total_count = 0; + let mut skip = false; + let mut array_len; + loop { + if skip { + array_len = array_reader.skip_records(BATCH_SIZE).unwrap(); + } else { + let array = array_reader.next_batch(BATCH_SIZE); + array_len = array.unwrap().len(); + } + total_count += array_len; + skip = !skip; + if array_len < BATCH_SIZE { + break; + } + } + total_count +} fn create_primitive_array_reader( page_iterator: impl PageIterator + 'static, column_desc: ColumnDescPtr, @@ -445,6 +465,39 @@ fn bench_primitive( assert_eq!(count, EXPECTED_VALUE_COUNT); }); + // binary packed skip , no NULLs + let data = build_encoded_primitive_page_iterator::( + schema.clone(), + mandatory_column_desc.clone(), + 0.0, + Encoding::DELTA_BINARY_PACKED, + ); + group.bench_function("binary packed skip, mandatory, no NULLs", |b| { + b.iter(|| { + let array_reader = create_primitive_array_reader( + data.clone(), + mandatory_column_desc.clone(), + ); + count = bench_array_reader_skip(array_reader); + }); + assert_eq!(count, EXPECTED_VALUE_COUNT); + }); + + let data = build_encoded_primitive_page_iterator::( + schema.clone(), + optional_column_desc.clone(), + 0.0, + Encoding::DELTA_BINARY_PACKED, + ); + group.bench_function("binary packed skip, optional, no NULLs", |b| { + b.iter(|| { + let array_reader = + create_primitive_array_reader(data.clone(), optional_column_desc.clone()); + count = bench_array_reader_skip(array_reader); + }); + assert_eq!(count, EXPECTED_VALUE_COUNT); + }); + // binary packed, half NULLs let data = build_encoded_primitive_page_iterator::( schema.clone(), diff --git a/parquet/benches/arrow_writer.rs b/parquet/benches/arrow_writer.rs index 25ff1ca90dc6..ddca1e53c6de 100644 --- a/parquet/benches/arrow_writer.rs +++ b/parquet/benches/arrow_writer.rs @@ -92,6 +92,25 @@ fn create_string_bench_batch( )?) } +fn create_string_dictionary_bench_batch( + size: usize, + null_density: f32, + true_density: f32, +) -> Result { + let fields = vec![Field::new( + "_1", + DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), + true, + )]; + let schema = Schema::new(fields); + Ok(create_random_batch( + Arc::new(schema), + size, + null_density, + true_density, + )?) 
+} + fn create_string_bench_batch_non_null( size: usize, null_density: f32, @@ -346,6 +365,18 @@ fn bench_primitive_writer(c: &mut Criterion) { b.iter(|| write_batch(&batch).unwrap()) }); + let batch = create_string_dictionary_bench_batch(4096, 0.25, 0.75).unwrap(); + group.throughput(Throughput::Bytes( + batch + .columns() + .iter() + .map(|f| f.get_array_memory_size() as u64) + .sum(), + )); + group.bench_function("4096 values string dictionary", |b| { + b.iter(|| write_batch(&batch).unwrap()) + }); + let batch = create_string_bench_batch_non_null(4096, 0.25, 0.75).unwrap(); group.throughput(Throughput::Bytes( batch diff --git a/parquet/src/arrow/array_reader/builder.rs b/parquet/src/arrow/array_reader/builder.rs index 7a19d5fbc601..e389158a1931 100644 --- a/parquet/src/arrow/array_reader/builder.rs +++ b/parquet/src/arrow/array_reader/builder.rs @@ -25,26 +25,32 @@ use crate::arrow::array_reader::{ ComplexObjectArrayReader, ListArrayReader, MapArrayReader, NullArrayReader, PrimitiveArrayReader, RowGroupCollection, StructArrayReader, }; -use crate::arrow::buffer::converter::{DecimalArrayConverter, DecimalByteArrayConvert, DecimalFixedLengthByteArrayConverter, FixedLenBinaryConverter, FixedSizeArrayConverter, Int96ArrayConverter, Int96Converter, IntervalDayTimeArrayConverter, IntervalDayTimeConverter, IntervalYearMonthArrayConverter, IntervalYearMonthConverter}; +use crate::arrow::buffer::converter::{ + DecimalArrayConverter, DecimalByteArrayConvert, DecimalFixedLengthByteArrayConverter, + FixedLenBinaryConverter, FixedSizeArrayConverter, Int96ArrayConverter, + Int96Converter, IntervalDayTimeArrayConverter, IntervalDayTimeConverter, + IntervalYearMonthArrayConverter, IntervalYearMonthConverter, +}; use crate::arrow::schema::{convert_schema, ParquetField, ParquetFieldType}; use crate::arrow::ProjectionMask; use crate::basic::Type as PhysicalType; -use crate::data_type::{BoolType, ByteArrayType, DoubleType, FixedLenByteArrayType, FloatType, Int32Type, Int64Type, Int96Type}; +use crate::data_type::{ + BoolType, ByteArrayType, DoubleType, FixedLenByteArrayType, FloatType, Int32Type, + Int64Type, Int96Type, +}; use crate::errors::Result; -use crate::schema::types::{ColumnDescriptor, ColumnPath, SchemaDescPtr, Type}; +use crate::schema::types::{ColumnDescriptor, ColumnPath, Type}; /// Create array reader from parquet schema, projection mask, and parquet file reader. 
pub fn build_array_reader( - parquet_schema: SchemaDescPtr, arrow_schema: SchemaRef, mask: ProjectionMask, - row_groups: Box, + row_groups: &dyn RowGroupCollection, ) -> Result> { - let field = - convert_schema(parquet_schema.as_ref(), mask, Some(arrow_schema.as_ref()))?; + let field = convert_schema(&row_groups.schema(), mask, Some(arrow_schema.as_ref()))?; match &field { - Some(field) => build_reader(field, row_groups.as_ref()), + Some(field) => build_reader(field, row_groups), None => Ok(make_empty_array_reader(row_groups.num_rows())), } } @@ -96,13 +102,11 @@ fn build_list_reader( let data_type = field.arrow_type.clone(); let item_reader = build_reader(&children[0], row_groups)?; - let item_type = item_reader.get_data_type().clone(); match is_large { false => Ok(Box::new(ListArrayReader::::new( item_reader, data_type, - item_type, field.def_level, field.rep_level, field.nullable, @@ -110,7 +114,6 @@ fn build_list_reader( true => Ok(Box::new(ListArrayReader::::new( item_reader, data_type, - item_type, field.def_level, field.rep_level, field.nullable, @@ -155,13 +158,11 @@ fn build_primitive_reader( let arrow_type = Some(field.arrow_type.clone()); match physical_type { - PhysicalType::BOOLEAN => Ok(Box::new( - PrimitiveArrayReader::::new( - page_iterator, - column_desc, - arrow_type, - )?, - )), + PhysicalType::BOOLEAN => Ok(Box::new(PrimitiveArrayReader::::new( + page_iterator, + column_desc, + arrow_type, + )?)), PhysicalType::INT32 => { if let Some(DataType::Null) = arrow_type { Ok(Box::new(NullArrayReader::::new( @@ -169,22 +170,18 @@ fn build_primitive_reader( column_desc, )?)) } else { - Ok(Box::new( - PrimitiveArrayReader::::new( - page_iterator, - column_desc, - arrow_type, - )?, - )) + Ok(Box::new(PrimitiveArrayReader::::new( + page_iterator, + column_desc, + arrow_type, + )?)) } } - PhysicalType::INT64 => Ok(Box::new( - PrimitiveArrayReader::::new( - page_iterator, - column_desc, - arrow_type, - )?, - )), + PhysicalType::INT64 => Ok(Box::new(PrimitiveArrayReader::::new( + page_iterator, + column_desc, + arrow_type, + )?)), PhysicalType::INT96 => { // get the optional timezone information from arrow type let timezone = arrow_type.as_ref().and_then(|data_type| { @@ -205,50 +202,40 @@ fn build_primitive_reader( arrow_type, )?)) } - PhysicalType::FLOAT => Ok(Box::new( - PrimitiveArrayReader::::new( - page_iterator, - column_desc, - arrow_type, - )?, - )), - PhysicalType::DOUBLE => Ok(Box::new( - PrimitiveArrayReader::::new( - page_iterator, - column_desc, - arrow_type, - )?, - )), + PhysicalType::FLOAT => Ok(Box::new(PrimitiveArrayReader::::new( + page_iterator, + column_desc, + arrow_type, + )?)), + PhysicalType::DOUBLE => Ok(Box::new(PrimitiveArrayReader::::new( + page_iterator, + column_desc, + arrow_type, + )?)), PhysicalType::BYTE_ARRAY => match arrow_type { - Some(DataType::Dictionary(_, _)) => make_byte_array_dictionary_reader( - page_iterator, - column_desc, - arrow_type, - ), - Some(DataType::Decimal(precision, scale)) => { + Some(DataType::Dictionary(_, _)) => { + make_byte_array_dictionary_reader(page_iterator, column_desc, arrow_type) + } + Some(DataType::Decimal128(precision, scale)) => { // read decimal data from parquet binary physical type - let convert = DecimalByteArrayConvert::new(DecimalArrayConverter::new(precision as i32, scale as i32)); - Ok(Box::new( - ComplexObjectArrayReader::::new( - page_iterator, - column_desc, - convert, - arrow_type - )? 
- )) - }, - _ => make_byte_array_reader( - page_iterator, - column_desc, - arrow_type, - ), - }, - PhysicalType::FIXED_LEN_BYTE_ARRAY => match field.arrow_type { - DataType::Decimal(precision, scale) => { - let converter = DecimalFixedLengthByteArrayConverter::new(DecimalArrayConverter::new( + let convert = DecimalByteArrayConvert::new(DecimalArrayConverter::new( precision as i32, scale as i32, )); + Ok(Box::new(ComplexObjectArrayReader::< + ByteArrayType, + DecimalByteArrayConvert, + >::new( + page_iterator, column_desc, convert, arrow_type + )?)) + } + _ => make_byte_array_reader(page_iterator, column_desc, arrow_type), + }, + PhysicalType::FIXED_LEN_BYTE_ARRAY => match field.arrow_type { + DataType::Decimal128(precision, scale) => { + let converter = DecimalFixedLengthByteArrayConverter::new( + DecimalArrayConverter::new(precision as i32, scale as i32), + ); Ok(Box::new(ComplexObjectArrayReader::< FixedLenByteArrayType, DecimalFixedLengthByteArrayConverter, @@ -326,7 +313,7 @@ mod tests { use super::*; use crate::arrow::parquet_to_arrow_schema; use crate::file::reader::{FileReader, SerializedFileReader}; - use crate::util::test_common::get_test_file; + use crate::util::test_common::file_util::get_test_file; use arrow::datatypes::Field; use std::sync::Arc; @@ -344,13 +331,8 @@ mod tests { ) .unwrap(); - let array_reader = build_array_reader( - file_reader.metadata().file_metadata().schema_descr_ptr(), - Arc::new(arrow_schema), - mask, - Box::new(file_reader), - ) - .unwrap(); + let array_reader = + build_array_reader(Arc::new(arrow_schema), mask, &file_reader).unwrap(); // Create arrow types let arrow_type = DataType::Struct(vec![Field::new( diff --git a/parquet/src/arrow/array_reader/byte_array.rs b/parquet/src/arrow/array_reader/byte_array.rs index ec4188890ef5..172aeb96d6d1 100644 --- a/parquet/src/arrow/array_reader/byte_array.rs +++ b/parquet/src/arrow/array_reader/byte_array.rs @@ -108,8 +108,11 @@ impl ArrayReader for ByteArrayReader { &self.data_type } - fn next_batch(&mut self, batch_size: usize) -> Result { - read_records(&mut self.record_reader, self.pages.as_mut(), batch_size)?; + fn read_records(&mut self, batch_size: usize) -> Result { + read_records(&mut self.record_reader, self.pages.as_mut(), batch_size) + } + + fn consume_batch(&mut self) -> Result { let buffer = self.record_reader.consume_record_data(); let null_buffer = self.record_reader.consume_bitmap_buffer(); self.def_levels_buffer = self.record_reader.consume_def_levels(); diff --git a/parquet/src/arrow/array_reader/byte_array_dictionary.rs b/parquet/src/arrow/array_reader/byte_array_dictionary.rs index 51ef38d0d073..0a5d94fa6ae8 100644 --- a/parquet/src/arrow/array_reader/byte_array_dictionary.rs +++ b/parquet/src/arrow/array_reader/byte_array_dictionary.rs @@ -25,7 +25,7 @@ use arrow::buffer::Buffer; use arrow::datatypes::{ArrowNativeType, DataType as ArrowType}; use crate::arrow::array_reader::byte_array::{ByteArrayDecoder, ByteArrayDecoderPlain}; -use crate::arrow::array_reader::{read_records, ArrayReader, skip_records}; +use crate::arrow::array_reader::{read_records, skip_records, ArrayReader}; use crate::arrow::buffer::{ dictionary_buffer::DictionaryBuffer, offset_buffer::OffsetBuffer, }; @@ -167,8 +167,11 @@ where &self.data_type } - fn next_batch(&mut self, batch_size: usize) -> Result { - read_records(&mut self.record_reader, self.pages.as_mut(), batch_size)?; + fn read_records(&mut self, batch_size: usize) -> Result { + read_records(&mut self.record_reader, self.pages.as_mut(), batch_size) + } + + fn 
consume_batch(&mut self) -> Result { let buffer = self.record_reader.consume_record_data(); let null_buffer = self.record_reader.consume_bitmap_buffer(); let array = buffer.into_array(null_buffer, &self.data_type)?; diff --git a/parquet/src/arrow/array_reader/complex_object_array.rs b/parquet/src/arrow/array_reader/complex_object_array.rs index 1390866cf6a5..4f958fea446f 100644 --- a/parquet/src/arrow/array_reader/complex_object_array.rs +++ b/parquet/src/arrow/array_reader/complex_object_array.rs @@ -39,9 +39,13 @@ where pages: Box, def_levels_buffer: Option>, rep_levels_buffer: Option>, + data_buffer: Vec, column_desc: ColumnDescPtr, column_reader: Option>, converter: C, + in_progress_def_levels_buffer: Option>, + in_progress_rep_levels_buffer: Option>, + before_consume: bool, _parquet_type_marker: PhantomData, _converter_marker: PhantomData, } @@ -59,7 +63,10 @@ where &self.data_type } - fn next_batch(&mut self, batch_size: usize) -> Result { + fn read_records(&mut self, batch_size: usize) -> Result { + if !self.before_consume { + self.before_consume = true; + } // Try to initialize column reader if self.column_reader.is_none() { self.next_column_reader()?; @@ -126,7 +133,6 @@ where break; } } - data_buffer.truncate(num_read); def_levels_buffer .iter_mut() @@ -135,13 +141,35 @@ where .iter_mut() .for_each(|buf| buf.truncate(num_read)); - self.def_levels_buffer = def_levels_buffer; - self.rep_levels_buffer = rep_levels_buffer; + if let Some(mut def_levels_buffer) = def_levels_buffer { + match &mut self.in_progress_def_levels_buffer { + None => { + self.in_progress_def_levels_buffer = Some(def_levels_buffer); + } + Some(buf) => buf.append(&mut def_levels_buffer), + } + } + + if let Some(mut rep_levels_buffer) = rep_levels_buffer { + match &mut self.in_progress_rep_levels_buffer { + None => { + self.in_progress_rep_levels_buffer = Some(rep_levels_buffer); + } + Some(buf) => buf.append(&mut rep_levels_buffer), + } + } + + self.data_buffer.append(&mut data_buffer); + + Ok(num_read) + } - let data: Vec> = if self.def_levels_buffer.is_some() { + fn consume_batch(&mut self) -> Result { + let data: Vec> = if self.in_progress_def_levels_buffer.is_some() { + let data_buffer = std::mem::take(&mut self.data_buffer); data_buffer .into_iter() - .zip(self.def_levels_buffer.as_ref().unwrap().iter()) + .zip(self.in_progress_def_levels_buffer.as_ref().unwrap().iter()) .map(|(t, def_level)| { if *def_level == self.column_desc.max_def_level() { Some(t) @@ -151,7 +179,7 @@ where }) .collect() } else { - data_buffer.into_iter().map(Some).collect() + self.data_buffer.iter().map(|x| Some(x.clone())).collect() }; let mut array = self.converter.convert(data)?; @@ -160,28 +188,49 @@ where array = arrow::compute::cast(&array, &self.data_type)?; } + self.data_buffer = vec![]; + self.def_levels_buffer = std::mem::take(&mut self.in_progress_def_levels_buffer); + self.rep_levels_buffer = std::mem::take(&mut self.in_progress_rep_levels_buffer); + self.before_consume = false; + Ok(array) } fn skip_records(&mut self, num_records: usize) -> Result { - match self.column_reader.as_mut() { - Some(reader) => reader.skip_records(num_records), - None => { - if self.next_column_reader()? { - self.column_reader.as_mut().unwrap().skip_records(num_records) - }else { - Ok(0) - } + let mut num_read = 0; + while (self.column_reader.is_some() || self.next_column_reader()?) 
+ && num_read < num_records + { + let remain_to_skip = num_records - num_read; + let skip = self + .column_reader + .as_mut() + .unwrap() + .skip_records(remain_to_skip)?; + num_read += skip; + // skip < remain_to_skip means end of row group + // self.next_column_reader() == false means end of file + if skip < remain_to_skip && !self.next_column_reader()? { + break; } } + Ok(num_read) } fn get_def_levels(&self) -> Option<&[i16]> { - self.def_levels_buffer.as_deref() + if self.before_consume { + self.in_progress_def_levels_buffer.as_deref() + } else { + self.def_levels_buffer.as_deref() + } } fn get_rep_levels(&self) -> Option<&[i16]> { - self.rep_levels_buffer.as_deref() + if self.before_consume { + self.in_progress_rep_levels_buffer.as_deref() + } else { + self.rep_levels_buffer.as_deref() + } } } @@ -208,9 +257,13 @@ where pages, def_levels_buffer: None, rep_levels_buffer: None, + data_buffer: vec![], column_desc, column_reader: None, converter, + in_progress_def_levels_buffer: None, + in_progress_rep_levels_buffer: None, + before_consume: true, _parquet_type_marker: PhantomData, _converter_marker: PhantomData, }) @@ -349,30 +402,32 @@ mod tests { let mut accu_len: usize = 0; - let array = array_reader.next_batch(values_per_page / 2).unwrap(); - assert_eq!(array.len(), values_per_page / 2); + let len = array_reader.read_records(values_per_page / 2).unwrap(); + assert_eq!(len, values_per_page / 2); assert_eq!( - Some(&def_levels[accu_len..(accu_len + array.len())]), + Some(&def_levels[accu_len..(accu_len + len)]), array_reader.get_def_levels() ); assert_eq!( - Some(&rep_levels[accu_len..(accu_len + array.len())]), + Some(&rep_levels[accu_len..(accu_len + len)]), array_reader.get_rep_levels() ); - accu_len += array.len(); + accu_len += len; + array_reader.consume_batch().unwrap(); // Read next values_per_page values, the first values_per_page/2 ones are from the first column chunk, // and the last values_per_page/2 ones are from the second column chunk - let array = array_reader.next_batch(values_per_page).unwrap(); - assert_eq!(array.len(), values_per_page); + let len = array_reader.read_records(values_per_page).unwrap(); + assert_eq!(len, values_per_page); assert_eq!( - Some(&def_levels[accu_len..(accu_len + array.len())]), + Some(&def_levels[accu_len..(accu_len + len)]), array_reader.get_def_levels() ); assert_eq!( - Some(&rep_levels[accu_len..(accu_len + array.len())]), + Some(&rep_levels[accu_len..(accu_len + len)]), array_reader.get_rep_levels() ); + let array = array_reader.consume_batch().unwrap(); let strings = array.as_any().downcast_ref::().unwrap(); for i in 0..array.len() { if array.is_valid(i) { @@ -384,19 +439,20 @@ mod tests { assert_eq!(all_values[i + accu_len], None) } } - accu_len += array.len(); + accu_len += len; // Try to read values_per_page values, however there are only values_per_page/2 values - let array = array_reader.next_batch(values_per_page).unwrap(); - assert_eq!(array.len(), values_per_page / 2); + let len = array_reader.read_records(values_per_page).unwrap(); + assert_eq!(len, values_per_page / 2); assert_eq!( - Some(&def_levels[accu_len..(accu_len + array.len())]), + Some(&def_levels[accu_len..(accu_len + len)]), array_reader.get_def_levels() ); assert_eq!( - Some(&rep_levels[accu_len..(accu_len + array.len())]), + Some(&rep_levels[accu_len..(accu_len + len)]), array_reader.get_rep_levels() ); + array_reader.consume_batch().unwrap(); } #[test] @@ -491,31 +547,34 @@ mod tests { let mut accu_len: usize = 0; // println!("---------- reading a batch of {} 
values ----------", values_per_page / 2); - let array = array_reader.next_batch(values_per_page / 2).unwrap(); - assert_eq!(array.len(), values_per_page / 2); + let len = array_reader.read_records(values_per_page / 2).unwrap(); + assert_eq!(len, values_per_page / 2); assert_eq!( - Some(&def_levels[accu_len..(accu_len + array.len())]), + Some(&def_levels[accu_len..(accu_len + len)]), array_reader.get_def_levels() ); assert_eq!( - Some(&rep_levels[accu_len..(accu_len + array.len())]), + Some(&rep_levels[accu_len..(accu_len + len)]), array_reader.get_rep_levels() ); - accu_len += array.len(); + accu_len += len; + array_reader.consume_batch().unwrap(); // Read next values_per_page values, the first values_per_page/2 ones are from the first column chunk, // and the last values_per_page/2 ones are from the second column chunk // println!("---------- reading a batch of {} values ----------", values_per_page); - let array = array_reader.next_batch(values_per_page).unwrap(); - assert_eq!(array.len(), values_per_page); + //let array = array_reader.next_batch(values_per_page).unwrap(); + let len = array_reader.read_records(values_per_page).unwrap(); + assert_eq!(len, values_per_page); assert_eq!( - Some(&def_levels[accu_len..(accu_len + array.len())]), + Some(&def_levels[accu_len..(accu_len + len)]), array_reader.get_def_levels() ); assert_eq!( - Some(&rep_levels[accu_len..(accu_len + array.len())]), + Some(&rep_levels[accu_len..(accu_len + len)]), array_reader.get_rep_levels() ); + let array = array_reader.consume_batch().unwrap(); let strings = array.as_any().downcast_ref::().unwrap(); for i in 0..array.len() { if array.is_valid(i) { @@ -527,19 +586,20 @@ mod tests { assert_eq!(all_values[i + accu_len], None) } } - accu_len += array.len(); + accu_len += len; // Try to read values_per_page values, however there are only values_per_page/2 values // println!("---------- reading a batch of {} values ----------", values_per_page); - let array = array_reader.next_batch(values_per_page).unwrap(); - assert_eq!(array.len(), values_per_page / 2); + let len = array_reader.read_records(values_per_page).unwrap(); + assert_eq!(len, values_per_page / 2); assert_eq!( - Some(&def_levels[accu_len..(accu_len + array.len())]), + Some(&def_levels[accu_len..(accu_len + len)]), array_reader.get_def_levels() ); assert_eq!( - Some(&rep_levels[accu_len..(accu_len + array.len())]), + Some(&rep_levels[accu_len..(accu_len + len)]), array_reader.get_rep_levels() ); + array_reader.consume_batch().unwrap(); } } diff --git a/parquet/src/arrow/array_reader/empty_array.rs b/parquet/src/arrow/array_reader/empty_array.rs index b06646cc1c6e..abe839b9dc29 100644 --- a/parquet/src/arrow/array_reader/empty_array.rs +++ b/parquet/src/arrow/array_reader/empty_array.rs @@ -33,6 +33,7 @@ pub fn make_empty_array_reader(row_count: usize) -> Box { struct EmptyArrayReader { data_type: ArrowType, remaining_rows: usize, + need_consume_records: usize, } impl EmptyArrayReader { @@ -40,6 +41,7 @@ impl EmptyArrayReader { Self { data_type: ArrowType::Struct(vec![]), remaining_rows: row_count, + need_consume_records: 0, } } } @@ -53,15 +55,19 @@ impl ArrayReader for EmptyArrayReader { &self.data_type } - fn next_batch(&mut self, batch_size: usize) -> Result { + fn read_records(&mut self, batch_size: usize) -> Result { let len = self.remaining_rows.min(batch_size); self.remaining_rows -= len; + self.need_consume_records += len; + Ok(len) + } + fn consume_batch(&mut self) -> Result { let data = ArrayDataBuilder::new(self.data_type.clone()) - .len(len) + 
.len(self.need_consume_records) .build() .unwrap(); - + self.need_consume_records = 0; Ok(Arc::new(StructArray::from(data))) } diff --git a/parquet/src/arrow/array_reader/list_array.rs b/parquet/src/arrow/array_reader/list_array.rs index 33bd9772a16e..d2fa94611906 100644 --- a/parquet/src/arrow/array_reader/list_array.rs +++ b/parquet/src/arrow/array_reader/list_array.rs @@ -34,7 +34,6 @@ use std::sync::Arc; pub struct ListArrayReader { item_reader: Box, data_type: ArrowType, - item_type: ArrowType, /// The definition level at which this list is not null def_level: i16, /// The repetition level that corresponds to a new value in this array @@ -49,7 +48,6 @@ impl ListArrayReader { pub fn new( item_reader: Box, data_type: ArrowType, - item_type: ArrowType, def_level: i16, rep_level: i16, nullable: bool, @@ -57,7 +55,6 @@ impl ListArrayReader { Self { item_reader, data_type, - item_type, def_level, rep_level, nullable, @@ -78,9 +75,13 @@ impl ArrayReader for ListArrayReader { &self.data_type } - fn next_batch(&mut self, batch_size: usize) -> Result { - let next_batch_array = self.item_reader.next_batch(batch_size)?; + fn read_records(&mut self, batch_size: usize) -> Result { + let size = self.item_reader.read_records(batch_size)?; + Ok(size) + } + fn consume_batch(&mut self) -> Result { + let next_batch_array = self.item_reader.consume_batch()?; if next_batch_array.len() == 0 { return Ok(new_empty_array(&self.data_type)); } @@ -264,10 +265,7 @@ mod tests { item_nullable: bool, ) -> ArrowType { let field = Box::new(Field::new("item", data_type, item_nullable)); - match OffsetSize::IS_LARGE { - true => ArrowType::LargeList(field), - false => ArrowType::List(field), - } + GenericListArray::::DATA_TYPE_CONSTRUCTOR(field) } fn downcast( @@ -303,13 +301,13 @@ mod tests { // ] let l3_item_type = ArrowType::Int32; - let l3_type = list_type::(l3_item_type.clone(), true); + let l3_type = list_type::(l3_item_type, true); let l2_item_type = l3_type.clone(); - let l2_type = list_type::(l2_item_type.clone(), true); + let l2_type = list_type::(l2_item_type, true); let l1_item_type = l2_type.clone(); - let l1_type = list_type::(l1_item_type.clone(), false); + let l1_type = list_type::(l1_item_type, false); let leaf = PrimitiveArray::::from_iter(vec![ Some(1), @@ -386,7 +384,6 @@ mod tests { let l3 = ListArrayReader::::new( Box::new(item_array_reader), l3_type, - l3_item_type, 5, 3, true, @@ -395,7 +392,6 @@ mod tests { let l2 = ListArrayReader::::new( Box::new(l3), l2_type, - l2_item_type, 3, 2, false, @@ -404,7 +400,6 @@ mod tests { let mut l1 = ListArrayReader::::new( Box::new(l2), l1_type, - l1_item_type, 2, 1, true, @@ -455,7 +450,6 @@ mod tests { let mut list_array_reader = ListArrayReader::::new( Box::new(item_array_reader), list_type::(ArrowType::Int32, true), - ArrowType::Int32, 1, 1, false, @@ -508,7 +502,6 @@ mod tests { let mut list_array_reader = ListArrayReader::::new( Box::new(item_array_reader), list_type::(ArrowType::Int32, true), - ArrowType::Int32, 2, 1, true, @@ -589,13 +582,9 @@ mod tests { let schema = file_metadata.schema_descr_ptr(); let mask = ProjectionMask::leaves(&schema, vec![0]); - let mut array_reader = build_array_reader( - schema, - Arc::new(arrow_schema), - mask, - Box::new(file_reader), - ) - .unwrap(); + let mut array_reader = + build_array_reader(Arc::new(arrow_schema), mask, &file_reader) + .unwrap(); let batch = array_reader.next_batch(100).unwrap(); assert_eq!(batch.data_type(), array_reader.get_data_type()); diff --git a/parquet/src/arrow/array_reader/map_array.rs 
b/parquet/src/arrow/array_reader/map_array.rs index 00c3db41a37c..3ba7f6960ec3 100644 --- a/parquet/src/arrow/array_reader/map_array.rs +++ b/parquet/src/arrow/array_reader/map_array.rs @@ -32,6 +32,7 @@ pub struct MapArrayReader { value_reader: Box, data_type: ArrowType, map_def_level: i16, + #[allow(unused)] map_rep_level: i16, } @@ -47,6 +48,7 @@ impl MapArrayReader { key_reader, value_reader, data_type, + // These are the wrong way round https://github.com/apache/arrow-rs/issues/1699 map_def_level: rep_level, map_rep_level: def_level, } @@ -62,9 +64,21 @@ impl ArrayReader for MapArrayReader { &self.data_type } - fn next_batch(&mut self, batch_size: usize) -> Result { - let key_array = self.key_reader.next_batch(batch_size)?; - let value_array = self.value_reader.next_batch(batch_size)?; + fn read_records(&mut self, batch_size: usize) -> Result { + let key_len = self.key_reader.read_records(batch_size)?; + let value_len = self.value_reader.read_records(batch_size)?; + // Check that key and value have the same lengths + if key_len != value_len { + return Err(general_err!( + "Map key and value should have the same lengths." + )); + } + Ok(key_len) + } + + fn consume_batch(&mut self) -> Result { + let key_array = self.key_reader.consume_batch()?; + let value_array = self.value_reader.consume_batch()?; // Check that key and value have the same lengths let key_length = key_array.len(); diff --git a/parquet/src/arrow/array_reader/mod.rs b/parquet/src/arrow/array_reader/mod.rs index 8bdd6c071c35..54c45a336a37 100644 --- a/parquet/src/arrow/array_reader/mod.rs +++ b/parquet/src/arrow/array_reader/mod.rs @@ -62,7 +62,20 @@ pub trait ArrayReader: Send { fn get_data_type(&self) -> &ArrowType; /// Reads at most `batch_size` records into an arrow array and return it. - fn next_batch(&mut self, batch_size: usize) -> Result; + fn next_batch(&mut self, batch_size: usize) -> Result { + self.read_records(batch_size)?; + self.consume_batch() + } + + /// Reads at most `batch_size` records' bytes into the buffer + /// + /// Returns the number of records read, which can be less than `batch_size` if + /// the pages are exhausted. + fn read_records(&mut self, batch_size: usize) -> Result; + + /// Consume all currently stored buffer data + /// into an arrow array and return it. + fn consume_batch(&mut self) -> Result; /// Skips over `num_records` records, returning the number of rows skipped fn skip_records(&mut self, num_records: usize) -> Result; @@ -87,7 +100,7 @@ pub trait ArrayReader: Send { /// A collection of row groups pub trait RowGroupCollection { /// Get schema of parquet file. - fn schema(&self) -> Result; + fn schema(&self) -> SchemaDescPtr; /// Get the number of rows in this collection fn num_rows(&self) -> usize; @@ -97,8 +110,8 @@ } impl RowGroupCollection for Arc { - fn schema(&self) -> Result { - Ok(self.metadata().file_metadata().schema_descr_ptr()) + fn schema(&self) -> SchemaDescPtr { + self.metadata().file_metadata().schema_descr_ptr() } fn num_rows(&self) -> usize { diff --git a/parquet/src/arrow/array_reader/null_array.rs b/parquet/src/arrow/array_reader/null_array.rs index 63f73d41e4f5..405633f0a823 100644 --- a/parquet/src/arrow/array_reader/null_array.rs +++ b/parquet/src/arrow/array_reader/null_array.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License.
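The trait change in mod.rs above replaces the single-call read path with a two-phase protocol: `read_records` may be invoked several times to buffer rows (for example while applying a row selection that spans row groups), and `consume_batch` then materializes everything buffered as one array. A minimal sketch of a caller driving the new contract; the helper name and the retry loop are illustrative assumptions, not part of this diff:

use arrow::array::ArrayRef;
use parquet::errors::Result;

// Hypothetical caller: keep calling `read_records` until `batch_size` rows
// are buffered or the pages are exhausted, then materialize them all with a
// single `consume_batch` call. `ArrayReader` is the trait defined above.
fn read_one_batch(
    reader: &mut dyn ArrayReader,
    batch_size: usize,
) -> Result<ArrayRef> {
    let mut buffered = 0;
    while buffered < batch_size {
        let read = reader.read_records(batch_size - buffered)?;
        if read == 0 {
            break; // no more pages to read
        }
        buffered += read;
    }
    // Everything accumulated by the calls above comes back as one array.
    reader.consume_batch()
}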
-use crate::arrow::array_reader::{read_records, ArrayReader, skip_records}; +use crate::arrow::array_reader::{read_records, skip_records, ArrayReader}; use crate::arrow::record_reader::buffer::ScalarValue; use crate::arrow::record_reader::RecordReader; use crate::column::page::PageIterator; @@ -39,7 +39,6 @@ where pages: Box, def_levels_buffer: Option, rep_levels_buffer: Option, - column_desc: ColumnDescPtr, record_reader: RecordReader, } @@ -50,14 +49,13 @@ where { /// Construct null array reader. pub fn new(pages: Box, column_desc: ColumnDescPtr) -> Result { - let record_reader = RecordReader::::new(column_desc.clone()); + let record_reader = RecordReader::::new(column_desc); Ok(Self { data_type: ArrowType::Null, pages, def_levels_buffer: None, rep_levels_buffer: None, - column_desc, record_reader, }) } @@ -78,10 +76,11 @@ where &self.data_type } - /// Reads at most `batch_size` records into array. - fn next_batch(&mut self, batch_size: usize) -> Result { - read_records(&mut self.record_reader, self.pages.as_mut(), batch_size)?; + fn read_records(&mut self, batch_size: usize) -> Result { + read_records(&mut self.record_reader, self.pages.as_mut(), batch_size) + } + fn consume_batch(&mut self) -> Result { // convert to arrays let array = arrow::array::NullArray::new(self.record_reader.num_values()); diff --git a/parquet/src/arrow/array_reader/primitive_array.rs b/parquet/src/arrow/array_reader/primitive_array.rs index 2a59f0326d33..35f523e3d0d7 100644 --- a/parquet/src/arrow/array_reader/primitive_array.rs +++ b/parquet/src/arrow/array_reader/primitive_array.rs @@ -25,8 +25,8 @@ use crate::data_type::DataType; use crate::errors::{ParquetError, Result}; use crate::schema::types::ColumnDescPtr; use arrow::array::{ - ArrayDataBuilder, ArrayRef, BooleanArray, BooleanBufferBuilder, Decimal128Array, - Float32Array, Float64Array, Int32Array, Int64Array, + ArrayDataBuilder, ArrayRef, BooleanArray, BooleanBufferBuilder, + Decimal128Array, Float32Array, Float64Array, Int32Array, Int64Array, }; use arrow::buffer::Buffer; use arrow::datatypes::DataType as ArrowType; @@ -36,22 +36,21 @@ use std::sync::Arc; /// Primitive array readers are leaves of array reader tree. They accept page iterator /// and read them into primitive arrays. pub struct PrimitiveArrayReader -where - T: DataType, - T::T: ScalarValue, + where + T: DataType, + T::T: ScalarValue, { data_type: ArrowType, pages: Box, def_levels_buffer: Option, rep_levels_buffer: Option, - column_desc: ColumnDescPtr, record_reader: RecordReader, } impl PrimitiveArrayReader -where - T: DataType, - T::T: ScalarValue, + where + T: DataType, + T::T: ScalarValue, { /// Construct primitive array reader. pub fn new( @@ -67,14 +66,13 @@ where .clone(), }; - let record_reader = RecordReader::::new(column_desc.clone()); + let record_reader = RecordReader::::new(column_desc); Ok(Self { data_type, pages, def_levels_buffer: None, rep_levels_buffer: None, - column_desc, record_reader, }) } @@ -82,9 +80,9 @@ where /// Implementation of primitive array reader. impl ArrayReader for PrimitiveArrayReader -where - T: DataType, - T::T: ScalarValue, + where + T: DataType, + T::T: ScalarValue, { fn as_any(&self) -> &dyn Any { self @@ -95,10 +93,11 @@ where &self.data_type } - /// Reads at most `batch_size` records into array. 
- fn next_batch(&mut self, batch_size: usize) -> Result { - read_records(&mut self.record_reader, self.pages.as_mut(), batch_size)?; + fn read_records(&mut self, batch_size: usize) -> Result { + read_records(&mut self.record_reader, self.pages.as_mut(), batch_size) + } + fn consume_batch(&mut self) -> Result { let target_type = self.get_data_type().clone(); let arrow_data_type = match T::get_physical_type() { PhysicalType::BOOLEAN => ArrowType::Boolean, @@ -177,13 +176,14 @@ where // are datatypes which we must convert explicitly. // These are: // - date64: we should cast int32 to date32, then date32 to date64. + // - decimal: cast in32 to decimal, int64 to decimal let array = match target_type { ArrowType::Date64 => { // this is cheap as it internally reinterprets the data let a = arrow::compute::cast(&array, &ArrowType::Date32)?; arrow::compute::cast(&a, &target_type)? } - ArrowType::Decimal(p, s) => { + ArrowType::Decimal128(p, s) => { let array = match array.data_type() { ArrowType::Int32 => array .as_any() @@ -204,10 +204,10 @@ where return Err(arrow_err!( "Cannot convert {:?} to decimal", array.data_type() - )) + )); } } - .with_precision_and_scale(p, s)?; + .with_precision_and_scale(p, s)?; Arc::new(array) as ArrayRef } @@ -240,17 +240,19 @@ mod tests { use crate::arrow::array_reader::test_util::EmptyPageIterator; use crate::basic::Encoding; use crate::column::page::Page; - use crate::data_type::Int32Type; + use crate::data_type::{Int32Type, Int64Type}; use crate::schema::parser::parse_message_type; use crate::schema::types::SchemaDescriptor; - use crate::util::test_common::make_pages; + use crate::util::test_common::rand_gen::make_pages; use crate::util::InMemoryPageIterator; - use arrow::array::PrimitiveArray; - use arrow::datatypes::ArrowPrimitiveType; + use arrow::array::{Array, PrimitiveArray}; + use arrow::datatypes::{ArrowPrimitiveType}; use rand::distributions::uniform::SampleUniform; use std::collections::VecDeque; + use arrow::datatypes::DataType::Decimal128; + #[allow(clippy::too_many_arguments)] fn make_column_chunks( column_desc: ColumnDescPtr, encoding: Encoding, @@ -314,7 +316,7 @@ mod tests { column_desc, None, ) - .unwrap(); + .unwrap(); // expect no values to be read let array = array_reader.next_batch(50).unwrap(); @@ -361,7 +363,7 @@ mod tests { column_desc, None, ) - .unwrap(); + .unwrap(); // Read first 50 values, which are all from the first column chunk let array = array_reader.next_batch(50).unwrap(); @@ -561,7 +563,7 @@ mod tests { column_desc, None, ) - .unwrap(); + .unwrap(); let mut accu_len: usize = 0; @@ -602,4 +604,110 @@ mod tests { ); } } + + + #[test] + fn test_primitive_array_reader_decimal_types() { + // parquet `INT32` to decimal + let message_type = " + message test_schema { + REQUIRED INT32 decimal1 (DECIMAL(8,2)); + } + "; + let schema = parse_message_type(message_type) + .map(|t| Arc::new(SchemaDescriptor::new(Arc::new(t)))) + .unwrap(); + let column_desc = schema.column(0); + + // create the array reader + { + let mut data = Vec::new(); + let mut page_lists = Vec::new(); + make_column_chunks::( + column_desc.clone(), + Encoding::PLAIN, + 100, + -99999999, + 99999999, + &mut Vec::new(), + &mut Vec::new(), + &mut data, + &mut page_lists, + true, + 2, + ); + let page_iterator = + InMemoryPageIterator::new(schema, column_desc.clone(), page_lists); + + let mut array_reader = PrimitiveArrayReader::::new( + Box::new(page_iterator), + column_desc, + None, + ) + .unwrap(); + + // read data from the reader + // the data type is decimal(8,2) + let 
array = array_reader.next_batch(50).unwrap(); + assert_eq!(array.data_type(), &Decimal128(8, 2)); + let array = array.as_any().downcast_ref::().unwrap(); + let data_decimal_array = data[0..50].iter().copied().map(|v| Some(v as i128)).collect::().with_precision_and_scale(8, 2).unwrap(); + assert_eq!(array, &data_decimal_array); + + // not equal with different data type(precision and scale) + let data_decimal_array = data[0..50].iter().copied().map(|v| Some(v as i128)).collect::().with_precision_and_scale(9, 0).unwrap(); + assert_ne!(array, &data_decimal_array) + } + + // parquet `INT64` to decimal + let message_type = " + message test_schema { + REQUIRED INT64 decimal1 (DECIMAL(18,4)); + } + "; + let schema = parse_message_type(message_type) + .map(|t| Arc::new(SchemaDescriptor::new(Arc::new(t)))) + .unwrap(); + let column_desc = schema.column(0); + + // create the array reader + { + let mut data = Vec::new(); + let mut page_lists = Vec::new(); + make_column_chunks::( + column_desc.clone(), + Encoding::PLAIN, + 100, + -999999999999999999, + 999999999999999999, + &mut Vec::new(), + &mut Vec::new(), + &mut data, + &mut page_lists, + true, + 2, + ); + let page_iterator = + InMemoryPageIterator::new(schema, column_desc.clone(), page_lists); + + let mut array_reader = PrimitiveArrayReader::::new( + Box::new(page_iterator), + column_desc, + None, + ) + .unwrap(); + + // read data from the reader + // the data type is decimal(18,4) + let array = array_reader.next_batch(50).unwrap(); + assert_eq!(array.data_type(), &Decimal128(18, 4)); + let array = array.as_any().downcast_ref::().unwrap(); + let data_decimal_array = data[0..50].iter().copied().map(|v| Some(v as i128)).collect::().with_precision_and_scale(18, 4).unwrap(); + assert_eq!(array, &data_decimal_array); + + // not equal with different data type(precision and scale) + let data_decimal_array = data[0..50].iter().copied().map(|v| Some(v as i128)).collect::().with_precision_and_scale(34, 0).unwrap(); + assert_ne!(array, &data_decimal_array) + } + } } diff --git a/parquet/src/arrow/array_reader/struct_array.rs b/parquet/src/arrow/array_reader/struct_array.rs index 602c598f8269..f682f146c721 100644 --- a/parquet/src/arrow/array_reader/struct_array.rs +++ b/parquet/src/arrow/array_reader/struct_array.rs @@ -63,7 +63,27 @@ impl ArrayReader for StructArrayReader { &self.data_type } - /// Read `batch_size` struct records. + fn read_records(&mut self, batch_size: usize) -> Result { + let mut read = None; + for child in self.children.iter_mut() { + let child_read = child.read_records(batch_size)?; + match read { + Some(expected) => { + if expected != child_read { + return Err(general_err!( + "StructArrayReader out of sync in read_records, expected {} skipped, got {}", + expected, + child_read + )); + } + } + None => read = Some(child_read), + } + } + Ok(read.unwrap_or(0)) + } + + /// Consume struct records. 
/// /// Definition levels of struct array is calculated as following: /// ```ignore @@ -80,7 +100,8 @@ impl ArrayReader for StructArrayReader { /// ```ignore /// null_bitmap[i] = (def_levels[i] >= self.def_level); /// ``` - fn next_batch(&mut self, batch_size: usize) -> Result { + /// + fn consume_batch(&mut self) -> Result { if self.children.is_empty() { return Ok(Arc::new(StructArray::from(Vec::new()))); } @@ -88,7 +109,7 @@ impl ArrayReader for StructArrayReader { let children_array = self .children .iter_mut() - .map(|reader| reader.next_batch(batch_size)) + .map(|reader| reader.consume_batch()) .collect::>>()?; // check that array child data has same size @@ -293,7 +314,6 @@ mod tests { let list_reader = ListArrayReader::::new( Box::new(reader), expected_l.data_type().clone(), - ArrowType::Int32, 3, 1, true, diff --git a/parquet/src/arrow/array_reader/test_util.rs b/parquet/src/arrow/array_reader/test_util.rs index 04c0f6c68f3f..ca1aabfd4aa1 100644 --- a/parquet/src/arrow/array_reader/test_util.rs +++ b/parquet/src/arrow/array_reader/test_util.rs @@ -48,8 +48,7 @@ pub fn utf8_column() -> ColumnDescPtr { /// Encode `data` with the provided `encoding` pub fn encode_byte_array(encoding: Encoding, data: &[ByteArray]) -> ByteBufferPtr { - let descriptor = utf8_column(); - let mut encoder = get_encoder::(descriptor, encoding).unwrap(); + let mut encoder = get_encoder::(encoding).unwrap(); encoder.put(data).unwrap(); encoder.flush_buffer().unwrap() @@ -101,6 +100,7 @@ pub struct InMemoryArrayReader { rep_levels: Option>, last_idx: usize, cur_idx: usize, + need_consume_records: usize, } impl InMemoryArrayReader { @@ -127,6 +127,7 @@ impl InMemoryArrayReader { rep_levels, cur_idx: 0, last_idx: 0, + need_consume_records: 0, } } } @@ -140,7 +141,7 @@ impl ArrayReader for InMemoryArrayReader { &self.data_type } - fn next_batch(&mut self, batch_size: usize) -> Result { + fn read_records(&mut self, batch_size: usize) -> Result { assert_ne!(batch_size, 0); // This replicates the logical normally performed by // RecordReader to delimit semantic records @@ -164,10 +165,17 @@ impl ArrayReader for InMemoryArrayReader { } None => batch_size.min(self.array.len() - self.cur_idx), }; + self.need_consume_records += read; + Ok(read) + } + fn consume_batch(&mut self) -> Result { + let batch_size = self.need_consume_records; + assert_ne!(batch_size, 0); self.last_idx = self.cur_idx; - self.cur_idx += read; - Ok(self.array.slice(self.last_idx, read)) + self.cur_idx += batch_size; + self.need_consume_records = 0; + Ok(self.array.slice(self.last_idx, batch_size)) } fn skip_records(&mut self, num_records: usize) -> Result { diff --git a/parquet/src/arrow/arrow_reader/filter.rs b/parquet/src/arrow/arrow_reader/filter.rs new file mode 100644 index 000000000000..8945ccde4248 --- /dev/null +++ b/parquet/src/arrow/arrow_reader/filter.rs @@ -0,0 +1,109 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::arrow::ProjectionMask; +use arrow::array::BooleanArray; +use arrow::error::Result as ArrowResult; +use arrow::record_batch::RecordBatch; + +/// A predicate operating on [`RecordBatch`] +pub trait ArrowPredicate: Send + 'static { + /// Returns the [`ProjectionMask`] that describes the columns required + /// to evaluate this predicate. All projected columns will be provided in the `batch` + /// passed to [`evaluate`](Self::evaluate) + fn projection(&self) -> &ProjectionMask; + + /// Evaluate this predicate for the given [`RecordBatch`] containing the columns + /// identified by [`Self::projection`] + /// + /// Rows that are `true` in the returned [`BooleanArray`] will be returned by the + /// parquet reader, whereas rows that are `false` or `Null` will not be + fn evaluate(&mut self, batch: RecordBatch) -> ArrowResult; +} + +/// An [`ArrowPredicate`] created from an [`FnMut`] +pub struct ArrowPredicateFn { + f: F, + projection: ProjectionMask, +} + +impl ArrowPredicateFn +where + F: FnMut(RecordBatch) -> ArrowResult + Send + 'static, +{ + /// Create a new [`ArrowPredicateFn`]. `f` will be passed batches + /// that contains the columns specified in `projection` + /// and returns a [`BooleanArray`] that describes which rows should + /// be passed along + pub fn new(projection: ProjectionMask, f: F) -> Self { + Self { f, projection } + } +} + +impl ArrowPredicate for ArrowPredicateFn +where + F: FnMut(RecordBatch) -> ArrowResult + Send + 'static, +{ + fn projection(&self) -> &ProjectionMask { + &self.projection + } + + fn evaluate(&mut self, batch: RecordBatch) -> ArrowResult { + (self.f)(batch) + } +} + +/// A [`RowFilter`] allows pushing down a filter predicate to skip IO and decode +/// +/// This consists of a list of [`ArrowPredicate`] where only the rows that satisfy all +/// of the predicates will be returned. Any [`RowSelection`] will be applied prior +/// to the first predicate, and each predicate in turn will then be used to compute +/// a more refined [`RowSelection`] to use when evaluating the subsequent predicates. +/// +/// Once all predicates have been evaluated, the final [`RowSelection`] is applied +/// to the top-level [`ProjectionMask`] to produce the final output [`RecordBatch`]. +/// +/// This design has a couple of implications: +/// +/// * [`RowFilter`] can be used to skip entire pages, and thus IO, in addition to CPU decode overheads +/// * Columns may be decoded multiple times if they appear in multiple [`ProjectionMask`] +/// * IO will be deferred until needed by a [`ProjectionMask`] +/// +/// As such there is a trade-off between a single large predicate, or multiple predicates, +/// that will depend on the shape of the data. Whilst multiple smaller predicates may +/// minimise the amount of data scanned/decoded, it may not be faster overall. +/// +/// For example, if a predicate that needs a single column of data filters out all but +/// 1% of the rows, applying it as one of the early `ArrowPredicateFn` will likely significantly +/// improve performance. 
+/// +/// As a counter example, if a predicate needs several columns of data to evaluate but +/// leaves 99% of the rows, it may be better to not filter the data from parquet and +/// apply the filter after the RecordBatch has been fully decoded. +/// +/// [`RowSelection`]: [super::selection::RowSelection] +pub struct RowFilter { + /// A list of [`ArrowPredicate`] + pub(crate) predicates: Vec>, +} + +impl RowFilter { + /// Create a new [`RowFilter`] from an array of [`ArrowPredicate`] + pub fn new(predicates: Vec>) -> Self { + Self { predicates } + } +} diff --git a/parquet/src/arrow/arrow_reader.rs b/parquet/src/arrow/arrow_reader/mod.rs similarity index 74% rename from parquet/src/arrow/arrow_reader.rs rename to parquet/src/arrow/arrow_reader/mod.rs index 19985818d174..e363919f6516 100644 --- a/parquet/src/arrow/arrow_reader.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -21,6 +21,7 @@ use std::collections::VecDeque; use std::sync::Arc; use arrow::array::Array; +use arrow::compute::prep_null_mask_filter; use arrow::datatypes::{DataType as ArrowType, Schema, SchemaRef}; use arrow::error::Result as ArrowResult; use arrow::record_batch::{RecordBatch, RecordBatchReader}; @@ -36,6 +37,17 @@ use crate::file::reader::{ChunkReader, FileReader, SerializedFileReader}; use crate::file::serialized_reader::ReadOptionsBuilder; use crate::schema::types::SchemaDescriptor; +#[allow(unused)] +mod filter; +#[allow(unused)] +mod selection; + +// TODO: Make these public once stable (#1792) +#[allow(unused_imports)] +pub(crate) use filter::{ArrowPredicate, ArrowPredicateFn, RowFilter}; +#[allow(unused_imports)] +pub(crate) use selection::{RowSelection, RowSelector}; + /// Arrow reader api. /// With this api, user can get arrow schema from parquet file, and read parquet data /// into arrow arrays. 
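Before the reader changes below, a usage sketch may help anchor the new filter module: an `ArrowPredicateFn` wraps a closure over a projected `RecordBatch`, and one or more boxed predicates form a `RowFilter`. This is illustrative only; these types are still crate-private pending #1792, and the column index, `Int64` leaf type, and function name are assumptions:

use crate::arrow::arrow_reader::{ArrowPredicateFn, RowFilter};
use crate::arrow::ProjectionMask;
use crate::schema::types::SchemaDescriptor;
use arrow::array::{Array, BooleanArray, Int64Array};
use arrow::record_batch::RecordBatch;

// Build a RowFilter with a single predicate keeping rows where the first
// projected leaf column (assumed to be Int64) is positive; rows holding a
// null value produce `false` and are therefore dropped.
fn positive_row_filter(schema: &SchemaDescriptor) -> RowFilter {
    let mask = ProjectionMask::leaves(schema, [0]);
    let predicate = ArrowPredicateFn::new(mask, |batch: RecordBatch| {
        let col = batch
            .column(0)
            .as_any()
            .downcast_ref::<Int64Array>()
            .expect("assumed Int64 leaf column");
        Ok(BooleanArray::from_iter(
            (0..col.len()).map(|i| Some(col.is_valid(i) && col.value(i) > 0)),
        ))
    });
    RowFilter::new(vec![Box::new(predicate)])
}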
@@ -72,44 +84,15 @@ pub trait ArrowReader { ) -> Result; } -/// [`RowSelection`] allows selecting or skipping a provided number of rows -/// when scanning the parquet file -#[derive(Debug, Clone, Copy)] -pub(crate) struct RowSelection { - /// The number of rows - pub row_count: usize, - - /// If true, skip `row_count` rows - pub skip: bool, -} - -impl RowSelection { - /// Select `row_count` rows - pub fn select(row_count: usize) -> Self { - Self { - row_count, - skip: false, - } - } - - /// Skip `row_count` rows - pub fn skip(row_count: usize) -> Self { - Self { - row_count, - skip: true, - } - } -} - #[derive(Debug, Clone, Default)] pub struct ArrowReaderOptions { skip_arrow_metadata: bool, - selection: Option>, + selection: Option, } impl ArrowReaderOptions { /// Create a new [`ArrowReaderOptions`] with the default settings - fn new() -> Self { + pub fn new() -> Self { Self::default() } @@ -128,11 +111,9 @@ impl ArrowReaderOptions { /// Scan rows from the parquet file according to the provided `selection` /// - /// TODO: Make public once row selection fully implemented (#1792) - pub(crate) fn with_row_selection( - self, - selection: impl Into>, - ) -> Self { + /// TODO: Revisit this API, as [`Self`] is provided before the file metadata is available + #[allow(unused)] + pub(crate) fn with_row_selection(self, selection: impl Into) -> Self { Self { selection: Some(selection.into()), ..self @@ -140,6 +121,9 @@ impl ArrowReaderOptions { } } +/// An `ArrowReader` that can be used to synchronously read parquet data as [`RecordBatch`] +/// +/// See [`crate::arrow::async_reader`] for an asynchronous interface pub struct ParquetFileArrowReader { file_reader: Arc, @@ -175,21 +159,13 @@ impl ArrowReader for ParquetFileArrowReader { mask: ProjectionMask, batch_size: usize, ) -> Result { - let array_reader = build_array_reader( - self.file_reader - .metadata() - .file_metadata() - .schema_descr_ptr(), - Arc::new(self.get_schema()?), - mask, - Box::new(self.file_reader.clone()), - )?; + let array_reader = + build_array_reader(Arc::new(self.get_schema()?), mask, &self.file_reader)?; - let selection = self.options.selection.clone().map(Into::into); Ok(ParquetRecordBatchReader::new( batch_size, array_reader, - selection, + self.options.selection.clone(), )) } } @@ -276,54 +252,68 @@ impl ParquetFileArrowReader { } } +/// An `Iterator>` that yields [`RecordBatch`] +/// read from a parquet data source pub struct ParquetRecordBatchReader { batch_size: usize, array_reader: Box, schema: SchemaRef, - selection: Option>, + selection: Option>, } impl Iterator for ParquetRecordBatchReader { type Item = ArrowResult; fn next(&mut self) -> Option { - let to_read = match self.selection.as_mut() { - Some(selection) => loop { - let front = selection.pop_front()?; - if front.skip { - let skipped = match self.array_reader.skip_records(front.row_count) { - Ok(skipped) => skipped, - Err(e) => return Some(Err(e.into())), - }; - - if skipped != front.row_count { - return Some(Err(general_err!( - "failed to skip rows, expected {}, got {}", - front.row_count, - skipped - ) - .into())); + let mut read_records = 0; + match self.selection.as_mut() { + Some(selection) => { + while read_records < self.batch_size && !selection.is_empty() { + let front = selection.pop_front().unwrap(); + if front.skip { + let skipped = + match self.array_reader.skip_records(front.row_count) { + Ok(skipped) => skipped, + Err(e) => return Some(Err(e.into())), + }; + + if skipped != front.row_count { + return Some(Err(general_err!( + "failed to skip 
rows, expected {}, got {}", + front.row_count, + skipped + ) + .into())); + } + continue; } - continue; - } - // try to read record - let to_read = match front.row_count.checked_sub(self.batch_size) { - Some(remaining) if remaining != 0 => { - // if page row count less than batch_size we must set batch size to page row count. - // add check avoid dead loop - selection.push_front(RowSelection::select(remaining)); - self.batch_size + // try to read record + let need_read = self.batch_size - read_records; + let to_read = match front.row_count.checked_sub(need_read) { + Some(remaining) if remaining != 0 => { + // if page row count less than batch_size we must set batch size to page row count. + // add check avoid dead loop + selection.push_front(RowSelector::select(remaining)); + need_read + } + _ => front.row_count, + }; + match self.array_reader.read_records(to_read) { + Ok(0) => break, + Ok(rec) => read_records += rec, + Err(error) => return Some(Err(error.into())), } - _ => front.row_count, - }; - - break to_read; - }, - None => self.batch_size, + } + } + None => { + if let Err(error) = self.array_reader.read_records(self.batch_size) { + return Some(Err(error.into())); + } + } }; - match self.array_reader.next_batch(to_read) { + match self.array_reader.consume_batch() { Err(error) => Some(Err(error.into())), Ok(array) => { let struct_array = @@ -349,22 +339,13 @@ impl RecordBatchReader for ParquetRecordBatchReader { } impl ParquetRecordBatchReader { - pub fn try_new( - batch_size: usize, - array_reader: Box, - ) -> Result { - Ok(Self::new(batch_size, array_reader, None)) - } - /// Create a new [`ParquetRecordBatchReader`] that will read at most `batch_size` rows at /// a time from [`ArrayReader`] based on the configured `selection`. If `selection` is `None` /// all rows will be returned - /// - /// TODO: Make public once row selection fully implemented (#1792) pub(crate) fn new( batch_size: usize, array_reader: Box, - selection: Option>, + selection: Option, ) -> Self { let schema = match array_reader.get_data_type() { ArrowType::Struct(ref fields) => Schema::new(fields.clone()), @@ -375,11 +356,41 @@ impl ParquetRecordBatchReader { batch_size, array_reader, schema: Arc::new(schema), - selection, + selection: selection.map(Into::into), } } } +/// Evaluates an [`ArrowPredicate`] returning the [`RowSelection`] +/// +/// If this [`ParquetRecordBatchReader`] has a [`RowSelection`], the +/// returned [`RowSelection`] will be the conjunction of this and +/// the rows selected by `predicate` +#[allow(unused)] +pub(crate) fn evaluate_predicate( + batch_size: usize, + array_reader: Box, + input_selection: Option, + predicate: &mut dyn ArrowPredicate, +) -> Result { + let reader = + ParquetRecordBatchReader::new(batch_size, array_reader, input_selection.clone()); + let mut filters = vec![]; + for maybe_batch in reader { + let filter = predicate.evaluate(maybe_batch?)?; + match filter.null_count() { + 0 => filters.push(filter), + _ => filters.push(prep_null_mask_filter(&filter)), + }; + } + + let raw = RowSelection::from_filters(&filters); + Ok(match input_selection { + Some(selection) => selection.and_then(&raw), + None => raw, + }) +} + #[cfg(test)] mod tests { use bytes::Bytes; @@ -391,23 +402,21 @@ mod tests { use std::path::PathBuf; use std::sync::Arc; - use rand::{thread_rng, RngCore}; - use serde_json::json; - use serde_json::Value::{Array as JArray, Null as JNull, Object as JObject}; + use rand::{thread_rng, Rng, RngCore}; use tempfile::tempfile; use arrow::array::*; + use 
arrow::buffer::Buffer; use arrow::datatypes::{DataType as ArrowDataType, Field, Schema}; use arrow::error::Result as ArrowResult; use arrow::record_batch::{RecordBatch, RecordBatchReader}; use crate::arrow::arrow_reader::{ ArrowReader, ArrowReaderOptions, ParquetFileArrowReader, - ParquetRecordBatchReader, RowSelection, + ParquetRecordBatchReader, RowSelection, RowSelector, }; use crate::arrow::buffer::converter::{ - BinaryArrayConverter, Converter, FixedSizeArrayConverter, FromConverter, - IntervalDayTimeArrayConverter, LargeUtf8ArrayConverter, Utf8ArrayConverter, + Converter, FixedSizeArrayConverter, IntervalDayTimeArrayConverter, }; use crate::arrow::schema::add_encoded_arrow_schema_to_metadata; use crate::arrow::{ArrowWriter, ProjectionMask}; @@ -422,54 +431,35 @@ mod tests { use crate::file::writer::SerializedFileWriter; use crate::schema::parser::parse_message_type; use crate::schema::types::{Type, TypePtr}; - use crate::util::test_common::RandGen; + use crate::util::test_common::rand_gen::RandGen; #[test] fn test_arrow_reader_all_columns() { - let json_values = get_json_array("parquet/generated_simple_numerics/blogs.json"); - let parquet_file_reader = get_test_reader("parquet/generated_simple_numerics/blogs.parquet"); - let max_len = parquet_file_reader.metadata().file_metadata().num_rows() as usize; - let mut arrow_reader = ParquetFileArrowReader::new(parquet_file_reader); - let mut record_batch_reader = arrow_reader + let record_batch_reader = arrow_reader .get_record_reader(60) .expect("Failed to read into array!"); // Verify that the schema was correctly parsed let original_schema = arrow_reader.get_schema().unwrap().fields().clone(); assert_eq!(original_schema, *record_batch_reader.schema().fields()); - - compare_batch_json(&mut record_batch_reader, json_values, max_len); } #[test] fn test_arrow_reader_single_column() { - let json_values = get_json_array("parquet/generated_simple_numerics/blogs.json"); - - let projected_json_values = json_values - .into_iter() - .map(|value| match value { - JObject(fields) => { - json!({ "blog_id": fields.get("blog_id").unwrap_or(&JNull).clone()}) - } - _ => panic!("Input should be json object array!"), - }) - .collect::>(); - let parquet_file_reader = get_test_reader("parquet/generated_simple_numerics/blogs.parquet"); let file_metadata = parquet_file_reader.metadata().file_metadata(); - let max_len = file_metadata.num_rows() as usize; let mask = ProjectionMask::leaves(file_metadata.schema_descr(), [2]); let mut arrow_reader = ParquetFileArrowReader::new(parquet_file_reader); - let mut record_batch_reader = arrow_reader + let record_batch_reader = arrow_reader .get_record_reader_by_columns(mask, 60) .expect("Failed to read into array!"); @@ -477,8 +467,6 @@ mod tests { let original_schema = arrow_reader.get_schema().unwrap().fields().clone(); assert_eq!(1, record_batch_reader.schema().fields().len()); assert_eq!(original_schema[1], record_batch_reader.schema().fields()[0]); - - compare_batch_json(&mut record_batch_reader, projected_json_values, max_len); } #[test] @@ -524,29 +512,29 @@ mod tests { #[test] fn test_primitive_single_column_reader_test() { - run_single_column_reader_tests::( + run_single_column_reader_tests::( 2, ConvertedType::NONE, None, - &FromConverter::new(), + |vals| Arc::new(BooleanArray::from_iter(vals.iter().cloned())), &[Encoding::PLAIN, Encoding::RLE, Encoding::RLE_DICTIONARY], ); - run_single_column_reader_tests::( + run_single_column_reader_tests::( 2, ConvertedType::NONE, None, - &FromConverter::new(), + |vals| 
Arc::new(Int32Array::from_iter(vals.iter().cloned())), &[ Encoding::PLAIN, Encoding::RLE_DICTIONARY, Encoding::DELTA_BINARY_PACKED, ], ); - run_single_column_reader_tests::( + run_single_column_reader_tests::( 2, ConvertedType::NONE, None, - &FromConverter::new(), + |vals| Arc::new(Int64Array::from_iter(vals.iter().cloned())), &[ Encoding::PLAIN, Encoding::RLE_DICTIONARY, @@ -568,16 +556,11 @@ mod tests { #[test] fn test_fixed_length_binary_column_reader() { let converter = FixedSizeArrayConverter::new(20); - run_single_column_reader_tests::< - FixedLenByteArrayType, - FixedSizeBinaryArray, - FixedSizeArrayConverter, - RandFixedLenGen, - >( + run_single_column_reader_tests::( 20, ConvertedType::NONE, None, - &converter, + |vals| Arc::new(converter.convert(vals.to_vec()).unwrap()), &[Encoding::PLAIN, Encoding::RLE_DICTIONARY], ); } @@ -585,16 +568,11 @@ mod tests { #[test] fn test_interval_day_time_column_reader() { let converter = IntervalDayTimeArrayConverter {}; - run_single_column_reader_tests::< - FixedLenByteArrayType, - IntervalDayTimeArray, - IntervalDayTimeArrayConverter, - RandFixedLenGen, - >( + run_single_column_reader_tests::( 12, ConvertedType::INTERVAL, None, - &converter, + |vals| Arc::new(converter.convert(vals.to_vec()).unwrap()), &[Encoding::PLAIN, Encoding::RLE_DICTIONARY], ); } @@ -609,6 +587,12 @@ mod tests { #[test] fn test_utf8_single_column_reader_test() { + fn string_converter(vals: &[Option]) -> ArrayRef { + Arc::new(GenericStringArray::::from_iter(vals.iter().map(|x| { + x.as_ref().map(|b| std::str::from_utf8(b.data()).unwrap()) + }))) + } + let encodings = &[ Encoding::PLAIN, Encoding::RLE_DICTIONARY, @@ -616,46 +600,39 @@ mod tests { Encoding::DELTA_BYTE_ARRAY, ]; - let converter = BinaryArrayConverter {}; - run_single_column_reader_tests::< - ByteArrayType, - BinaryArray, - BinaryArrayConverter, - RandUtf8Gen, - >(2, ConvertedType::NONE, None, &converter, encodings); - - let utf8_converter = Utf8ArrayConverter {}; - run_single_column_reader_tests::< - ByteArrayType, - StringArray, - Utf8ArrayConverter, - RandUtf8Gen, - >(2, ConvertedType::UTF8, None, &utf8_converter, encodings); - - run_single_column_reader_tests::< - ByteArrayType, - StringArray, - Utf8ArrayConverter, - RandUtf8Gen, - >( + run_single_column_reader_tests::( + 2, + ConvertedType::NONE, + None, + |vals| { + Arc::new(BinaryArray::from_iter( + vals.iter().map(|x| x.as_ref().map(|x| x.data())), + )) + }, + encodings, + ); + + run_single_column_reader_tests::( + 2, + ConvertedType::UTF8, + None, + string_converter::, + encodings, + ); + + run_single_column_reader_tests::( 2, ConvertedType::UTF8, Some(ArrowDataType::Utf8), - &utf8_converter, + string_converter::, encodings, ); - let large_utf8_converter = LargeUtf8ArrayConverter {}; - run_single_column_reader_tests::< - ByteArrayType, - LargeStringArray, - LargeUtf8ArrayConverter, - RandUtf8Gen, - >( + run_single_column_reader_tests::( 2, ConvertedType::UTF8, Some(ArrowDataType::LargeUtf8), - &large_utf8_converter, + string_converter::, encodings, ); @@ -665,21 +642,21 @@ mod tests { let mut opts = TestOptions::new(2, 20, 15).with_null_percent(50); opts.encoding = *encoding; + let data_type = ArrowDataType::Dictionary( + Box::new(key.clone()), + Box::new(ArrowDataType::Utf8), + ); + // Cannot run full test suite as keys overflow, run small test instead - single_column_reader_test::< - ByteArrayType, - StringArray, - Utf8ArrayConverter, - RandUtf8Gen, - >( + single_column_reader_test::( opts, 2, ConvertedType::UTF8, - Some(ArrowDataType::Dictionary( - 
Box::new(key.clone()), - Box::new(ArrowDataType::Utf8), - )), - &utf8_converter, + Some(data_type.clone()), + move |vals| { + let vals = string_converter::(vals); + arrow::compute::cast(&vals, &data_type).unwrap() + }, ); } } @@ -694,41 +671,160 @@ mod tests { ]; for key in &key_types { - run_single_column_reader_tests::< - ByteArrayType, - StringArray, - Utf8ArrayConverter, - RandUtf8Gen, - >( + let data_type = ArrowDataType::Dictionary( + Box::new(key.clone()), + Box::new(ArrowDataType::Utf8), + ); + + run_single_column_reader_tests::( 2, ConvertedType::UTF8, - Some(ArrowDataType::Dictionary( - Box::new(key.clone()), - Box::new(ArrowDataType::Utf8), - )), - &utf8_converter, + Some(data_type.clone()), + move |vals| { + let vals = string_converter::(vals); + arrow::compute::cast(&vals, &data_type).unwrap() + }, encodings, ); // https://github.com/apache/arrow-rs/issues/1179 - // run_single_column_reader_tests::< - // ByteArrayType, - // LargeStringArray, - // LargeUtf8ArrayConverter, - // RandUtf8Gen, - // >( + // let data_type = ArrowDataType::Dictionary( + // Box::new(key.clone()), + // Box::new(ArrowDataType::LargeUtf8), + // ); + // + // run_single_column_reader_tests::( // 2, // ConvertedType::UTF8, - // Some(ArrowDataType::Dictionary( - // Box::new(key.clone()), - // Box::new(ArrowDataType::LargeUtf8), - // )), - // &large_utf8_converter, - // encodings + // Some(data_type.clone()), + // move |vals| { + // let vals = string_converter::(vals); + // arrow::compute::cast(&vals, &data_type).unwrap() + // }, + // encodings, // ); } } + #[test] + fn test_decimal_nullable_struct() { + let decimals = Decimal128Array::from_iter_values([1, 2, 3, 4, 5, 6, 7, 8]); + + let data = ArrayDataBuilder::new(ArrowDataType::Struct(vec![Field::new( + "decimals", + decimals.data_type().clone(), + false, + )])) + .len(8) + .null_bit_buffer(Some(Buffer::from(&[0b11101111]))) + .child_data(vec![decimals.into_data()]) + .build() + .unwrap(); + + let written = RecordBatch::try_from_iter([( + "struct", + Arc::new(StructArray::from(data)) as ArrayRef, + )]) + .unwrap(); + + let mut buffer = Vec::with_capacity(1024); + let mut writer = + ArrowWriter::try_new(&mut buffer, written.schema(), None).unwrap(); + writer.write(&written).unwrap(); + writer.close().unwrap(); + + let read = ParquetFileArrowReader::try_new(Bytes::from(buffer)) + .unwrap() + .get_record_reader(3) + .unwrap() + .collect::>>() + .unwrap(); + + assert_eq!(&written.slice(0, 3), &read[0]); + assert_eq!(&written.slice(3, 3), &read[1]); + assert_eq!(&written.slice(6, 2), &read[2]); + } + + #[test] + fn test_int32_nullable_struct() { + let int32 = Int32Array::from_iter_values([1, 2, 3, 4, 5, 6, 7, 8]); + let data = ArrayDataBuilder::new(ArrowDataType::Struct(vec![Field::new( + "int32", + int32.data_type().clone(), + false, + )])) + .len(8) + .null_bit_buffer(Some(Buffer::from(&[0b11101111]))) + .child_data(vec![int32.into_data()]) + .build() + .unwrap(); + + let written = RecordBatch::try_from_iter([( + "struct", + Arc::new(StructArray::from(data)) as ArrayRef, + )]) + .unwrap(); + + let mut buffer = Vec::with_capacity(1024); + let mut writer = + ArrowWriter::try_new(&mut buffer, written.schema(), None).unwrap(); + writer.write(&written).unwrap(); + writer.close().unwrap(); + + let read = ParquetFileArrowReader::try_new(Bytes::from(buffer)) + .unwrap() + .get_record_reader(3) + .unwrap() + .collect::>>() + .unwrap(); + + assert_eq!(&written.slice(0, 3), &read[0]); + assert_eq!(&written.slice(3, 3), &read[1]); + assert_eq!(&written.slice(6, 2), 
&read[2]); + } + + #[test] + #[ignore] // https://github.com/apache/arrow-rs/issues/2253 + fn test_decimal_list() { + let decimals = Decimal128Array::from_iter_values([1, 2, 3, 4, 5, 6, 7, 8]); + + // [[], [1], [2, 3], null, [4], null, [6, 7, 8]] + let data = ArrayDataBuilder::new(ArrowDataType::List(Box::new(Field::new( + "item", + decimals.data_type().clone(), + false, + )))) + .len(7) + .add_buffer(Buffer::from_iter([0_i32, 0, 1, 3, 3, 4, 5, 8])) + .null_bit_buffer(Some(Buffer::from(&[0b01010111]))) + .child_data(vec![decimals.into_data()]) + .build() + .unwrap(); + + let written = RecordBatch::try_from_iter([( + "list", + Arc::new(ListArray::from(data)) as ArrayRef, + )]) + .unwrap(); + + let mut buffer = Vec::with_capacity(1024); + let mut writer = + ArrowWriter::try_new(&mut buffer, written.schema(), None).unwrap(); + writer.write(&written).unwrap(); + writer.close().unwrap(); + + let read = ParquetFileArrowReader::try_new(Bytes::from(buffer)) + .unwrap() + .get_record_reader(3) + .unwrap() + .collect::>>() + .unwrap(); + + assert_eq!(&written.slice(0, 3), &read[0]); + assert_eq!(&written.slice(3, 3), &read[1]); + assert_eq!(&written.slice(6, 1), &read[2]); + } + #[test] fn test_read_decimal_file() { use arrow::array::Decimal128Array; @@ -792,6 +888,8 @@ mod tests { enabled_statistics: EnabledStatistics, /// Encoding encoding: Encoding, + //row selections and total selected row count + row_selections: Option<(RowSelection, usize)>, } impl Default for TestOptions { @@ -807,6 +905,7 @@ mod tests { writer_version: WriterVersion::PARQUET_1_0, enabled_statistics: EnabledStatistics::Page, encoding: Encoding::PLAIN, + row_selections: None, } } } @@ -849,6 +948,20 @@ mod tests { } } + fn with_row_selections(self) -> Self { + let mut rng = thread_rng(); + let step = rng.gen_range(self.record_batch_size..self.num_rows); + let row_selections = create_test_selection( + step, + self.num_row_groups * self.num_rows, + rng.gen::(), + ); + Self { + row_selections: Some(row_selections), + ..self + } + } + fn writer_props(&self) -> WriterProperties { let builder = WriterProperties::builder() .set_data_pagesize_limit(self.max_data_page_size) @@ -875,19 +988,18 @@ mod tests { /// /// `rand_max` represents the maximum size of value to pass to to /// value generator - fn run_single_column_reader_tests( + fn run_single_column_reader_tests( rand_max: i32, converted_type: ConvertedType, arrow_type: Option, - converter: &C, + converter: F, encodings: &[Encoding], ) where T: DataType, G: RandGen, - A: Array + 'static, - C: Converter>, A> + 'static, + F: Fn(&[Option]) -> ArrayRef, { - let all_options = vec![ + let mut all_options = vec![ // choose record_batch_batch (15) so batches cross row // group boundaries (50 rows in 2 row groups) cases. TestOptions::new(2, 100, 15), @@ -918,6 +1030,39 @@ mod tests { .with_enabled_statistics(EnabledStatistics::None), ]; + let skip_options = vec![ + // choose record_batch_batch (15) so batches cross row + // group boundaries (50 rows in 2 row groups) cases. + TestOptions::new(2, 100, 15).with_row_selections(), + // choose record_batch_batch (5) so batches sometime fall + // on row group boundaries and (25 rows in 3 row groups + // --> row groups of 10, 10, and 5). Tests buffer + // refilling edge cases. + TestOptions::new(3, 25, 5).with_row_selections(), + // Choose record_batch_size (25) so all batches fall + // exactly on row group boundary (25). Tests buffer + // refilling edge cases. 
+ TestOptions::new(4, 100, 25).with_row_selections(), + // Set maximum page size so row groups have multiple pages + TestOptions::new(3, 256, 73) + .with_max_data_page_size(128) + .with_row_selections(), + // Set small dictionary page size to test dictionary fallback + TestOptions::new(3, 256, 57) + .with_max_dict_page_size(128) + .with_row_selections(), + // Test optional but with no nulls + TestOptions::new(2, 256, 127) + .with_null_percent(0) + .with_row_selections(), + // Test optional with nulls + TestOptions::new(2, 256, 93) + .with_null_percent(25) + .with_row_selections(), + ]; + + all_options.extend(skip_options); + all_options.into_iter().for_each(|opts| { for writer_version in [WriterVersion::PARQUET_1_0, WriterVersion::PARQUET_2_0] { @@ -925,15 +1070,15 @@ mod tests { let opts = TestOptions { writer_version, encoding: *encoding, - ..opts + ..opts.clone() }; - single_column_reader_test::( + single_column_reader_test::( opts, rand_max, converted_type, arrow_type.clone(), - converter, + &converter, ) } } @@ -943,24 +1088,24 @@ mod tests { /// Create a parquet file and then read it using /// `ParquetFileArrowReader` using the parameters described in /// `opts`. - fn single_column_reader_test( + fn single_column_reader_test( opts: TestOptions, rand_max: i32, converted_type: ConvertedType, arrow_type: Option, - converter: &C, + converter: F, ) where T: DataType, G: RandGen, - A: Array + 'static, - C: Converter>, A> + 'static, + F: Fn(&[Option]) -> ArrayRef, { // Print out options to facilitate debugging failures on CI println!( - "Running single_column_reader_test ConvertedType::{}/ArrowType::{:?} with Options: {:?}", - converted_type, arrow_type, opts + "Running type {:?} single_column_reader_test ConvertedType::{}/ArrowType::{:?} with Options: {:?}", + T::get_physical_type(), converted_type, arrow_type, opts ); + //according to null_percent generate def_levels let (repetition, def_levels) = match opts.null_percent.as_ref() { Some(null_percent) => { let mut rng = thread_rng(); @@ -979,6 +1124,7 @@ mod tests { None => (Repetition::REQUIRED, None), }; + //generate random table data let values: Vec> = (0..opts.num_row_groups) .map(|idx| { let null_count = match def_levels.as_ref() { @@ -1011,9 +1157,7 @@ mod tests { .unwrap(), ); - let arrow_field = arrow_type - .clone() - .map(|t| arrow::datatypes::Field::new("leaf", t, false)); + let arrow_field = arrow_type.map(|t| Field::new("leaf", t, false)); let mut file = tempfile::tempfile().unwrap(); @@ -1029,29 +1173,37 @@ mod tests { file.rewind().unwrap(); - let mut arrow_reader = ParquetFileArrowReader::try_new(file).unwrap(); + let mut arrow_reader; + let expected_data: Vec>; + if let Some((selections, row_count)) = opts.row_selections.clone() { + let options = + ArrowReaderOptions::new().with_row_selection(selections.clone()); + arrow_reader = + ParquetFileArrowReader::try_new_with_options(file, options).unwrap(); + let mut without_skip_data = gen_expected_data::(&def_levels, &values); + + let mut skip_data: Vec> = vec![]; + let selections: VecDeque = selections.into(); + for select in selections { + if select.skip { + without_skip_data.drain(0..select.row_count); + } else { + skip_data.extend(without_skip_data.drain(0..select.row_count)); + } + } + expected_data = skip_data; + assert_eq!(expected_data.len(), row_count); + } else { + arrow_reader = ParquetFileArrowReader::try_new(file).unwrap(); + //get flatten table data + expected_data = gen_expected_data::(&def_levels, &values); + assert_eq!(expected_data.len(), opts.num_rows * 
opts.num_row_groups); + } + let mut record_reader = arrow_reader .get_record_reader(opts.record_batch_size) .unwrap(); - let expected_data: Vec> = match def_levels { - Some(levels) => { - let mut values_iter = values.iter().flatten(); - levels - .iter() - .flatten() - .map(|d| match d { - 1 => Some(values_iter.next().cloned().unwrap()), - 0 => None, - _ => unreachable!(), - }) - .collect() - } - None => values.iter().flatten().map(|b| Some(b.clone())).collect(), - }; - - assert_eq!(expected_data.len(), opts.num_rows * opts.num_row_groups); - let mut total_read = 0; loop { let maybe_batch = record_reader.next(); @@ -1060,19 +1212,9 @@ mod tests { let batch = maybe_batch.unwrap().unwrap(); assert_eq!(end - total_read, batch.num_rows()); - let mut data = vec![]; - data.extend_from_slice(&expected_data[total_read..end]); + let a = converter(&expected_data[total_read..end]); + let b = Arc::clone(batch.column(0)); - let a = converter.convert(data).unwrap(); - let mut b = Arc::clone(batch.column(0)); - - if let Some(arrow_type) = arrow_type.as_ref() { - assert_eq!(b.data_type(), arrow_type); - if let ArrowDataType::Dictionary(_, v) = arrow_type { - assert_eq!(a.data_type(), v.as_ref()); - b = arrow::compute::cast(&b, v.as_ref()).unwrap() - } - } assert_eq!(a.data_type(), b.data_type()); assert_eq!(a.data(), b.data(), "{:#?} vs {:#?}", a.data(), b.data()); @@ -1084,17 +1226,39 @@ mod tests { } } + fn gen_expected_data( + def_levels: &Option>>, + values: &[Vec], + ) -> Vec> { + let data: Vec> = match def_levels { + Some(levels) => { + let mut values_iter = values.iter().flatten(); + levels + .iter() + .flatten() + .map(|d| match d { + 1 => Some(values_iter.next().cloned().unwrap()), + 0 => None, + _ => unreachable!(), + }) + .collect() + } + None => values.iter().flatten().map(|b| Some(b.clone())).collect(), + }; + data + } + fn generate_single_column_file_with_data( values: &[Vec], def_levels: Option<&Vec>>, file: File, schema: TypePtr, - field: Option, + field: Option, opts: &TestOptions, ) -> Result { let mut writer_props = opts.writer_props(); if let Some(field) = field { - let arrow_schema = arrow::datatypes::Schema::new(vec![field]); + let arrow_schema = Schema::new(vec![field]); add_encoded_arrow_schema_to_metadata(&arrow_schema, &mut writer_props); } @@ -1137,39 +1301,6 @@ mod tests { File::open(path.as_path()).expect("File not found!") } - fn get_json_array(filename: &str) -> Vec { - match serde_json::from_reader(get_test_file(filename)) - .expect("Failed to read json value from file!") - { - JArray(values) => values, - _ => panic!("Input should be json array!"), - } - } - - fn compare_batch_json( - record_batch_reader: &mut dyn RecordBatchReader, - json_values: Vec, - max_len: usize, - ) { - for i in 0..20 { - let array: Option = record_batch_reader - .next() - .map(|r| r.expect("Failed to read record batch!").into()); - - let (start, end) = (i * 60_usize, (i + 1) * 60_usize); - - if start < max_len { - assert!(array.is_some()); - assert_ne!(0, array.as_ref().unwrap().len()); - let end = min(end, max_len); - let json = JArray(Vec::from(&json_values[start..end])); - assert_eq!(array.unwrap(), json) - } else { - assert!(array.is_none()); - } - } - } - #[test] fn test_read_structs() { // This particular test file has columns of struct types where there is @@ -1629,12 +1760,12 @@ mod tests { /// a `batch_size` and `selection` fn get_expected_batches( column: &RecordBatch, - selection: &[RowSelection], + selection: &RowSelection, batch_size: usize, ) -> Vec { let mut expected_batches = vec![]; 
-        let mut selection: VecDeque<_> = selection.iter().cloned().collect();
+        let mut selection: VecDeque<_> = selection.clone().into();
         let mut row_offset = 0;
         let mut last_start = None;
         while row_offset < column.num_rows() && !selection.is_empty() {
@@ -1682,6 +1813,34 @@ mod tests {
         expected_batches
     }
 
+    fn create_test_selection(
+        step_len: usize,
+        total_len: usize,
+        skip_first: bool,
+    ) -> (RowSelection, usize) {
+        let mut remaining = total_len;
+        let mut skip = skip_first;
+        let mut vec = vec![];
+        let mut selected_count = 0;
+        while remaining != 0 {
+            let step = if remaining > step_len {
+                step_len
+            } else {
+                remaining
+            };
+            vec.push(RowSelector {
+                row_count: step,
+                skip,
+            });
+            remaining -= step;
+            if !skip {
+                selected_count += step;
+            }
+            skip = !skip;
+        }
+        (vec.into(), selected_count)
+    }
+
     #[test]
     fn test_scan_row_with_selection() {
         let testdata = arrow::util::test_util::parquet_test_data();
@@ -1696,7 +1855,7 @@ mod tests {
         let do_test = |batch_size: usize, selection_len: usize| {
             for skip_first in [false, true] {
                 let selections =
-                    create_test_selection(batch_size, data.num_rows(), skip_first);
+                    create_test_selection(batch_size, data.num_rows(), skip_first).0;
                 let expected = get_expected_batches(&data, &selections, batch_size);
                 let skip_reader = create_skip_reader(&test_file, batch_size, selections);
@@ -1728,7 +1887,7 @@ mod tests {
     fn create_skip_reader(
         test_file: &File,
         batch_size: usize,
-        selections: Vec<RowSelection>,
+        selections: RowSelection,
     ) -> ParquetRecordBatchReader {
         let arrow_reader_options =
             ArrowReaderOptions::new().with_row_selection(selections);
@@ -1740,29 +1899,5 @@ mod tests {
             .unwrap();
         skip_arrow_reader.get_record_reader(batch_size).unwrap()
     }
-
-    fn create_test_selection(
-        step_len: usize,
-        total_len: usize,
-        skip_first: bool,
-    ) -> Vec<RowSelection> {
-        let mut remaining = total_len;
-        let mut skip = skip_first;
-        let mut vec = vec![];
-        while remaining != 0 {
-            let step = if remaining > step_len {
-                step_len
-            } else {
-                remaining
-            };
-            vec.push(RowSelection {
-                row_count: step,
-                skip,
-            });
-            remaining -= step;
-            skip = !skip;
-        }
-        vec
-    }
 }
}
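For reference, a short worked example of the reworked test helper above (values chosen for illustration only): create_test_selection builds alternating selectors of at most `step_len` rows covering `total_len` rows, and now also returns how many rows end up selected.

    let (selection, selected) = create_test_selection(3, 10, false);
    // selectors: [select(3), skip(3), select(3), skip(1)]
    assert_eq!(selected, 6);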
diff --git a/parquet/src/arrow/arrow_reader/selection.rs b/parquet/src/arrow/arrow_reader/selection.rs
new file mode 100644
index 000000000000..8e129f5667ec
--- /dev/null
+++ b/parquet/src/arrow/arrow_reader/selection.rs
@@ -0,0 +1,426 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{Array, BooleanArray};
+use arrow::compute::SlicesIterator;
+use std::cmp::Ordering;
+use std::collections::VecDeque;
+use std::ops::Range;
+
+/// [`RowSelector`] represents a range of rows to scan from a parquet file
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub struct RowSelector {
+    /// The number of rows
+    pub row_count: usize,
+
+    /// If true, skip `row_count` rows
+    pub skip: bool,
+}
+
+impl RowSelector {
+    /// Select `row_count` rows
+    pub fn select(row_count: usize) -> Self {
+        Self {
+            row_count,
+            skip: false,
+        }
+    }
+
+    /// Skip `row_count` rows
+    pub fn skip(row_count: usize) -> Self {
+        Self {
+            row_count,
+            skip: true,
+        }
+    }
+}
+
+/// [`RowSelection`] allows selecting or skipping a provided number of rows
+/// when scanning the parquet file.
+///
+/// This is applied prior to reading column data, and can therefore
+/// be used to skip IO to fetch data into memory
+///
+/// A typical use-case would be using the [`PageIndex`] to filter out rows
+/// that don't satisfy a predicate
+///
+/// [`PageIndex`]: [crate::file::page_index::index::PageIndex]
+#[derive(Debug, Clone, Default, Eq, PartialEq)]
+pub struct RowSelection {
+    selectors: Vec<RowSelector>,
+}
+
+impl RowSelection {
+    /// Creates a [`RowSelection`] from a slice of [`BooleanArray`]
+    ///
+    /// # Panic
+    ///
+    /// Panics if any of the [`BooleanArray`] contain nulls
+    pub fn from_filters(filters: &[BooleanArray]) -> Self {
+        let mut next_offset = 0;
+        let total_rows = filters.iter().map(|x| x.len()).sum();
+
+        let iter = filters.iter().flat_map(|filter| {
+            let offset = next_offset;
+            next_offset += filter.len();
+            assert_eq!(filter.null_count(), 0);
+            SlicesIterator::new(filter)
+                .map(move |(start, end)| start + offset..end + offset)
+        });
+
+        Self::from_consecutive_ranges(iter, total_rows)
+    }
+
+    /// Creates a [`RowSelection`] from an iterator of consecutive ranges to keep
+    fn from_consecutive_ranges<I: Iterator<Item = Range<usize>>>(
+        ranges: I,
+        total_rows: usize,
+    ) -> Self {
+        let mut selectors: Vec<RowSelector> = Vec::with_capacity(ranges.size_hint().0);
+        let mut last_end = 0;
+        for range in ranges {
+            let len = range.end - range.start;
+
+            match range.start.cmp(&last_end) {
+                Ordering::Equal => match selectors.last_mut() {
+                    Some(last) => last.row_count += len,
+                    None => selectors.push(RowSelector::select(len)),
+                },
+                Ordering::Greater => {
+                    selectors.push(RowSelector::skip(range.start - last_end));
+                    selectors.push(RowSelector::select(len))
+                }
+                Ordering::Less => panic!("out of order"),
+            }
+            last_end = range.end;
+        }
+
+        if last_end != total_rows {
+            selectors.push(RowSelector::skip(total_rows - last_end))
+        }
+
+        Self { selectors }
+    }
+
+    /// Splits off the first `row_count` from this [`RowSelection`]
+    pub fn split_off(&mut self, row_count: usize) -> Self {
+        let mut total_count = 0;
+
+        // Find the index where the selector exceeds the row count
+        let find = self.selectors.iter().enumerate().find(|(_, selector)| {
+            total_count += selector.row_count;
+            total_count > row_count
+        });
+
+        let split_idx = match find {
+            Some((idx, _)) => idx,
+            None => {
+                let selectors = std::mem::take(&mut self.selectors);
+                return Self { selectors };
+            }
+        };
+
+        let mut remaining = self.selectors.split_off(split_idx);
+
+        // Always present as `split_idx < self.selectors.len`
+        let next = remaining.first_mut().unwrap();
+        let overflow = total_count - row_count;
+
+        if next.row_count != overflow {
+            self.selectors.push(RowSelector {
+                row_count: next.row_count - overflow,
+                skip: next.skip,
+            })
+        }
+        next.row_count = overflow;
+
+        std::mem::swap(&mut remaining, &mut self.selectors);
+        Self {
+            selectors: remaining,
+        }
+    }
+
+    /// Given a [`RowSelection`] computed under `self`, returns the [`RowSelection`]
+    /// representing their conjunction
+    ///
+    /// For example:
+    ///
+    /// self:     NNNNNNNNNNNNYYYYYYYYYYYYYYYYYYYYYYNNNYYYYY
+    /// other:                YYYYYNNNNYYYYYYYYYYYYY    YYNNN
+    ///
+    /// returned: NNNNNNNNNNNNYYYYYNNNNYYYYYYYYYYYYYYNNYNNNN
+    ///
+    ///
+    pub fn and_then(&self, other: &Self) -> Self {
+        let mut selectors = vec![];
+        let mut first = self.selectors.iter().cloned().peekable();
+        let mut second = other.selectors.iter().cloned().peekable();
+
+        let mut to_skip = 0;
+        while let Some(b) = second.peek_mut() {
+            let a = first.peek_mut().unwrap();
+
+            if b.row_count == 0 {
+                second.next().unwrap();
+                continue;
+            }
+
+            if a.row_count == 0 {
+                first.next().unwrap();
+                continue;
+            }
+
+            if a.skip {
+                // Records were skipped when producing second
+                to_skip += a.row_count;
+                first.next().unwrap();
+                continue;
+            }
+
+            let skip = b.skip;
+            let to_process = a.row_count.min(b.row_count);
+
+            a.row_count -= to_process;
+            b.row_count -= to_process;
+
+            match skip {
+                true => to_skip += to_process,
+                false => {
+                    if to_skip != 0 {
+                        selectors.push(RowSelector::skip(to_skip));
+                        to_skip = 0;
+                    }
+                    selectors.push(RowSelector::select(to_process))
+                }
+            }
+        }
+
+        for v in first {
+            if v.row_count != 0 {
+                assert!(v.skip);
+                to_skip += v.row_count
+            }
+        }
+
+        if to_skip != 0 {
+            selectors.push(RowSelector::skip(to_skip));
+        }
+
+        Self { selectors }
+    }
+
+    /// Returns `true` if this [`RowSelection`] selects any rows
+    pub fn selects_any(&self) -> bool {
+        self.selectors.iter().any(|x| !x.skip)
+    }
+}
+
+impl From<Vec<RowSelector>> for RowSelection {
+    fn from(selectors: Vec<RowSelector>) -> Self {
+        Self { selectors }
+    }
+}
+
+impl From<RowSelection> for VecDeque<RowSelector> {
+    fn from(r: RowSelection) -> Self {
+        r.selectors.into()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use rand::{thread_rng, Rng};
+
+    #[test]
+    fn test_from_filters() {
+        let filters = vec![
+            BooleanArray::from(vec![false, false, false, true, true, true, true]),
+            BooleanArray::from(vec![true, true, false, false, true, true, true]),
+            BooleanArray::from(vec![false, false, false, false]),
+            BooleanArray::from(Vec::<bool>::new()),
+        ];
+
+        let selection = RowSelection::from_filters(&filters[..1]);
+        assert!(selection.selects_any());
+        assert_eq!(
+            selection.selectors,
+            vec![RowSelector::skip(3), RowSelector::select(4)]
+        );
+
+        let selection = RowSelection::from_filters(&filters[..2]);
+        assert!(selection.selects_any());
+        assert_eq!(
+            selection.selectors,
+            vec![
+                RowSelector::skip(3),
+                RowSelector::select(6),
+                RowSelector::skip(2),
+                RowSelector::select(3)
+            ]
+        );
+
+        let selection = RowSelection::from_filters(&filters);
+        assert!(selection.selects_any());
+        assert_eq!(
+            selection.selectors,
+            vec![
+                RowSelector::skip(3),
+                RowSelector::select(6),
+                RowSelector::skip(2),
+                RowSelector::select(3),
+                RowSelector::skip(4)
+            ]
+        );
+
+        let selection = RowSelection::from_filters(&filters[2..3]);
+        assert!(!selection.selects_any());
+        assert_eq!(selection.selectors, vec![RowSelector::skip(4)]);
+    }
+
+    #[test]
+    fn test_split_off() {
+        let mut selection = RowSelection::from(vec![
+            RowSelector::skip(34),
+            RowSelector::select(12),
+            RowSelector::skip(3),
+            RowSelector::select(35),
+        ]);
+
+        let split = selection.split_off(34);
+        assert_eq!(split.selectors, vec![RowSelector::skip(34)]);
+        assert_eq!(
+            selection.selectors,
+            vec![
+                RowSelector::select(12),
+                RowSelector::skip(3),
+                RowSelector::select(35)
+            ]
+        );
+
+        let split = selection.split_off(5);
+        assert_eq!(split.selectors, vec![RowSelector::select(5)]);
+        assert_eq!(
+            selection.selectors,
+            vec![
+                RowSelector::select(7),
+                RowSelector::skip(3),
+                RowSelector::select(35)
+            ]
+        );
+
+        let split = selection.split_off(8);
+        assert_eq!(
+            split.selectors,
+            vec![RowSelector::select(7), RowSelector::skip(1)]
+        );
+        assert_eq!(
+            selection.selectors,
+            vec![RowSelector::skip(2), RowSelector::select(35)]
+        );
+
+        let split = selection.split_off(200);
+        assert_eq!(
+            split.selectors,
+            vec![RowSelector::skip(2), RowSelector::select(35)]
+        );
+        assert!(selection.selectors.is_empty());
+    }
+
+    #[test]
+    fn test_and() {
+        let mut a = RowSelection::from(vec![
+            RowSelector::skip(12),
+            RowSelector::select(23),
+            RowSelector::skip(3),
+            RowSelector::select(5),
+        ]);
+
+        let b = RowSelection::from(vec![
+            RowSelector::select(5),
+            RowSelector::skip(4),
+            RowSelector::select(15),
+            RowSelector::skip(4),
+        ]);
+
+        let mut expected = RowSelection::from(vec![
+            RowSelector::skip(12),
+            RowSelector::select(5),
+            RowSelector::skip(4),
+            RowSelector::select(14),
+            RowSelector::skip(3),
+            RowSelector::select(1),
+            RowSelector::skip(4),
+        ]);
+
+        assert_eq!(a.and_then(&b), expected);
+
+        a.split_off(7);
+        expected.split_off(7);
+        assert_eq!(a.and_then(&b), expected);
+
+        let a = RowSelection::from(vec![RowSelector::select(5), RowSelector::skip(3)]);
+
+        let b = RowSelection::from(vec![
+            RowSelector::select(2),
+            RowSelector::skip(1),
+            RowSelector::select(1),
+            RowSelector::skip(1),
+        ]);
+
+        assert_eq!(
+            a.and_then(&b).selectors,
+            vec![
+                RowSelector::select(2),
+                RowSelector::skip(1),
+                RowSelector::select(1),
+                RowSelector::skip(4)
+            ]
+        );
+    }
+
+    #[test]
+    fn test_and_fuzz() {
+        let mut rand = thread_rng();
+        for _ in 0..100 {
+            let a_len = rand.gen_range(10..100);
+            let a_bools: Vec<_> = (0..a_len).map(|x| rand.gen_bool(0.2)).collect();
+            let a = RowSelection::from_filters(&[BooleanArray::from(a_bools.clone())]);
+
+            let b_len: usize = a_bools.iter().map(|x| *x as usize).sum();
+            let b_bools: Vec<_> = (0..b_len).map(|x| rand.gen_bool(0.8)).collect();
+            let b = RowSelection::from_filters(&[BooleanArray::from(b_bools.clone())]);
+
+            let mut expected_bools = vec![false; a_len];
+
+            let mut iter_b = b_bools.iter();
+            for (idx, b) in a_bools.iter().enumerate() {
+                if *b && *iter_b.next().unwrap() {
+                    expected_bools[idx] = true;
+                }
+            }
+
+            let expected =
+                RowSelection::from_filters(&[BooleanArray::from(expected_bools)]);
+
+            let total_rows: usize = expected.selectors.iter().map(|s| s.row_count).sum();
+            assert_eq!(a_len, total_rows);
+
+            assert_eq!(a.and_then(&b), expected);
+        }
+    }
+}
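For orientation, a minimal sketch of the API introduced in selection.rs above, using only the types and methods shown there (import paths assumed, since the re-export location is not visible in this hunk):

    use arrow::array::BooleanArray;

    // Build a selection from a filter result: rows 2..4 are kept.
    let filter = BooleanArray::from(vec![false, false, true, true, false]);
    let mut selection = RowSelection::from_filters(&[filter]);
    assert!(selection.selects_any());
    // selectors: [skip(2), select(2), skip(1)]

    // Carve off the first row group's worth of rows (3 here):
    let first = selection.split_off(3);
    // first:     [skip(2), select(1)]
    // selection: [select(1), skip(1)]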
diff --git a/parquet/src/arrow/arrow_writer/byte_array.rs b/parquet/src/arrow/arrow_writer/byte_array.rs
new file mode 100644
index 000000000000..a7b6ccc3fc85
--- /dev/null
+++ b/parquet/src/arrow/arrow_writer/byte_array.rs
@@ -0,0 +1,587 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::arrow::arrow_writer::levels::LevelInfo;
+use crate::basic::Encoding;
+use crate::column::page::PageWriter;
+use crate::column::writer::encoder::{
+    ColumnValueEncoder, DataPageValues, DictionaryPage,
+};
+use crate::column::writer::GenericColumnWriter;
+use crate::data_type::{AsBytes, ByteArray, Int32Type};
+use crate::encodings::encoding::{DeltaBitPackEncoder, Encoder};
+use crate::encodings::rle::RleEncoder;
+use crate::errors::{ParquetError, Result};
+use crate::file::properties::{WriterProperties, WriterPropertiesPtr, WriterVersion};
+use crate::file::writer::OnCloseColumnChunk;
+use crate::schema::types::ColumnDescPtr;
+use crate::util::bit_util::num_required_bits;
+use crate::util::interner::{Interner, Storage};
+use arrow::array::{
+    Array, ArrayAccessor, ArrayRef, BinaryArray, DictionaryArray, LargeBinaryArray,
+    LargeStringArray, StringArray,
+};
+use arrow::datatypes::DataType;
+
+macro_rules! downcast_dict_impl {
+    ($array:ident, $key:ident, $val:ident, $op:expr $(, $arg:expr)*) => {{
+        $op($array
+            .as_any()
+            .downcast_ref::<DictionaryArray<arrow::datatypes::$key>>()
+            .unwrap()
+            .downcast_dict::<$val>()
+            .unwrap()$(, $arg)*)
+    }};
+}
+
+macro_rules! downcast_dict_op {
+    ($key_type:expr, $val:ident, $array:ident, $op:expr $(, $arg:expr)*) => {
+        match $key_type.as_ref() {
+            DataType::UInt8 => downcast_dict_impl!($array, UInt8Type, $val, $op$(, $arg)*),
+            DataType::UInt16 => downcast_dict_impl!($array, UInt16Type, $val, $op$(, $arg)*),
+            DataType::UInt32 => downcast_dict_impl!($array, UInt32Type, $val, $op$(, $arg)*),
+            DataType::UInt64 => downcast_dict_impl!($array, UInt64Type, $val, $op$(, $arg)*),
+            DataType::Int8 => downcast_dict_impl!($array, Int8Type, $val, $op$(, $arg)*),
+            DataType::Int16 => downcast_dict_impl!($array, Int16Type, $val, $op$(, $arg)*),
+            DataType::Int32 => downcast_dict_impl!($array, Int32Type, $val, $op$(, $arg)*),
+            DataType::Int64 => downcast_dict_impl!($array, Int64Type, $val, $op$(, $arg)*),
+            _ => unreachable!(),
+        }
+    };
+}
+
+macro_rules! downcast_op {
+    ($data_type:expr, $array:ident, $op:expr $(, $arg:expr)*) => {
+        match $data_type {
+            DataType::Utf8 => $op($array.as_any().downcast_ref::<StringArray>().unwrap()$(, $arg)*),
+            DataType::LargeUtf8 => {
+                $op($array.as_any().downcast_ref::<LargeStringArray>().unwrap()$(, $arg)*)
+            }
+            DataType::Binary => {
+                $op($array.as_any().downcast_ref::<BinaryArray>().unwrap()$(, $arg)*)
+            }
+            DataType::LargeBinary => {
+                $op($array.as_any().downcast_ref::<LargeBinaryArray>().unwrap()$(, $arg)*)
+            }
+            DataType::Dictionary(key, value) => match value.as_ref() {
+                DataType::Utf8 => downcast_dict_op!(key, StringArray, $array, $op$(, $arg)*),
+                DataType::LargeUtf8 => {
+                    downcast_dict_op!(key, LargeStringArray, $array, $op$(, $arg)*)
+                }
+                DataType::Binary => downcast_dict_op!(key, BinaryArray, $array, $op$(, $arg)*),
+                DataType::LargeBinary => {
+                    downcast_dict_op!(key, LargeBinaryArray, $array, $op$(, $arg)*)
+                }
+                d => unreachable!("cannot downcast {} dictionary value to byte array", d),
+            },
+            d => unreachable!("cannot downcast {} to byte array", d),
+        }
+    };
+}
+
+/// A writer for byte array types
+pub(super) struct ByteArrayWriter<'a> {
+    writer: GenericColumnWriter<'a, ByteArrayEncoder>,
+    on_close: Option<OnCloseColumnChunk<'a>>,
+}
+
+impl<'a> ByteArrayWriter<'a> {
+    /// Returns a new [`ByteArrayWriter`]
+    pub fn new(
+        descr: ColumnDescPtr,
+        props: &'a WriterPropertiesPtr,
+        page_writer: Box<dyn PageWriter>,
+        on_close: OnCloseColumnChunk<'a>,
+    ) -> Result<Self> {
+        Ok(Self {
+            writer: GenericColumnWriter::new(descr, props.clone(), page_writer),
+            on_close: Some(on_close),
+        })
+    }
+
+    pub fn write(&mut self, array: &ArrayRef, levels: LevelInfo) -> Result<()> {
+        self.writer.write_batch_internal(
+            array,
+            Some(levels.non_null_indices()),
+            levels.def_levels(),
+            levels.rep_levels(),
+            None,
+            None,
+            None,
+        )?;
+        Ok(())
+    }
+
+    pub fn close(self) -> Result<()> {
+        let (bytes_written, rows_written, metadata, column_index, offset_index) =
+            self.writer.close()?;
+
+        if let Some(on_close) = self.on_close {
+            on_close(
+                bytes_written,
+                rows_written,
+                metadata,
+                column_index,
+                offset_index,
+            )?;
+        }
+        Ok(())
+    }
+}
+
+/// A fallback encoder, i.e. non-dictionary, for [`ByteArray`]
+struct FallbackEncoder {
+    encoder: FallbackEncoderImpl,
+    num_values: usize,
+}
+
+/// The fallback encoder in use
+///
+/// Note: DeltaBitPackEncoder is boxed as it is rather large
+enum FallbackEncoderImpl {
+    Plain {
+        buffer: Vec<u8>,
+    },
+    DeltaLength {
+        buffer: Vec<u8>,
+        lengths: Box<DeltaBitPackEncoder<Int32Type>>,
+    },
+    Delta {
+        buffer: Vec<u8>,
+        last_value: Vec<u8>,
+        prefix_lengths: Box<DeltaBitPackEncoder<Int32Type>>,
+        suffix_lengths: Box<DeltaBitPackEncoder<Int32Type>>,
+    },
+}
+
+impl FallbackEncoder {
+    /// Create the fallback encoder for the given [`ColumnDescPtr`] and [`WriterProperties`]
+    fn new(descr: &ColumnDescPtr, props: &WriterProperties) -> Result<Self> {
+        // Set either main encoder or fallback encoder.
+        let encoding = props.encoding(descr.path()).unwrap_or_else(|| {
+            match props.writer_version() {
+                WriterVersion::PARQUET_1_0 => Encoding::PLAIN,
+                WriterVersion::PARQUET_2_0 => Encoding::DELTA_BYTE_ARRAY,
+            }
+        });
+
+        let encoder = match encoding {
+            Encoding::PLAIN => FallbackEncoderImpl::Plain { buffer: vec![] },
+            Encoding::DELTA_LENGTH_BYTE_ARRAY => FallbackEncoderImpl::DeltaLength {
+                buffer: vec![],
+                lengths: Box::new(DeltaBitPackEncoder::new()),
+            },
+            Encoding::DELTA_BYTE_ARRAY => FallbackEncoderImpl::Delta {
+                buffer: vec![],
+                last_value: vec![],
+                prefix_lengths: Box::new(DeltaBitPackEncoder::new()),
+                suffix_lengths: Box::new(DeltaBitPackEncoder::new()),
+            },
+            _ => {
+                return Err(general_err!(
+                    "unsupported encoding {} for byte array",
+                    encoding
+                ))
+            }
+        };
+
+        Ok(Self {
+            encoder,
+            num_values: 0,
+        })
+    }
+
+    /// Encode `values` to the in-progress page
+    fn encode<T>(&mut self, values: T, indices: &[usize])
+    where
+        T: ArrayAccessor + Copy,
+        T::Item: AsRef<[u8]>,
+    {
+        self.num_values += indices.len();
+        match &mut self.encoder {
+            FallbackEncoderImpl::Plain { buffer } => {
+                for idx in indices {
+                    let value = values.value(*idx);
+                    let value = value.as_ref();
+                    buffer.extend_from_slice((value.len() as u32).as_bytes());
+                    buffer.extend_from_slice(value)
+                }
+            }
+            FallbackEncoderImpl::DeltaLength { buffer, lengths } => {
+                for idx in indices {
+                    let value = values.value(*idx);
+                    let value = value.as_ref();
+                    lengths.put(&[value.len() as i32]).unwrap();
+                    buffer.extend_from_slice(value);
+                }
+            }
+            FallbackEncoderImpl::Delta {
+                buffer,
+                last_value,
+                prefix_lengths,
+                suffix_lengths,
+            } => {
+                for idx in indices {
+                    let value = values.value(*idx);
+                    let value = value.as_ref();
+                    let mut prefix_length = 0;
+
+                    while prefix_length < last_value.len()
+                        && prefix_length < value.len()
+                        && last_value[prefix_length] == value[prefix_length]
+                    {
+                        prefix_length += 1;
+                    }
+
+                    let suffix_length = value.len() - prefix_length;
+
+                    last_value.clear();
+                    last_value.extend_from_slice(value);
+
+                    buffer.extend_from_slice(&value[prefix_length..]);
+                    prefix_lengths.put(&[prefix_length as i32]).unwrap();
+                    suffix_lengths.put(&[suffix_length as i32]).unwrap();
+                }
+            }
+        }
+    }
+
+    fn estimated_data_page_size(&self) -> usize {
+        match &self.encoder {
+            FallbackEncoderImpl::Plain { buffer, .. } => buffer.len(),
+            FallbackEncoderImpl::DeltaLength { buffer, lengths } => {
+                buffer.len() + lengths.estimated_data_encoded_size()
+            }
+            FallbackEncoderImpl::Delta {
+                buffer,
+                prefix_lengths,
+                suffix_lengths,
+                ..
+            } => {
+                buffer.len()
+                    + prefix_lengths.estimated_data_encoded_size()
+                    + suffix_lengths.estimated_data_encoded_size()
+            }
+        }
+    }
+
+    fn flush_data_page(
+        &mut self,
+        min_value: Option<ByteArray>,
+        max_value: Option<ByteArray>,
+    ) -> Result<DataPageValues<ByteArray>> {
+        let (buf, encoding) = match &mut self.encoder {
+            FallbackEncoderImpl::Plain { buffer } => {
+                (std::mem::take(buffer), Encoding::PLAIN)
+            }
+            FallbackEncoderImpl::DeltaLength { buffer, lengths } => {
+                let lengths = lengths.flush_buffer()?;
+
+                let mut out = Vec::with_capacity(lengths.len() + buffer.len());
+                out.extend_from_slice(lengths.data());
+                out.extend_from_slice(buffer);
+                (out, Encoding::DELTA_LENGTH_BYTE_ARRAY)
+            }
+            FallbackEncoderImpl::Delta {
+                buffer,
+                prefix_lengths,
+                suffix_lengths,
+                ..
+            } => {
+                let prefix_lengths = prefix_lengths.flush_buffer()?;
+                let suffix_lengths = suffix_lengths.flush_buffer()?;
+
+                let mut out = Vec::with_capacity(
+                    prefix_lengths.len() + suffix_lengths.len() + buffer.len(),
+                );
+                out.extend_from_slice(prefix_lengths.data());
+                out.extend_from_slice(suffix_lengths.data());
+                out.extend_from_slice(buffer);
+                (out, Encoding::DELTA_BYTE_ARRAY)
+            }
+        };
+
+        Ok(DataPageValues {
+            buf: buf.into(),
+            num_values: std::mem::take(&mut self.num_values),
+            encoding,
+            min_value,
+            max_value,
+        })
+    }
+}
+
+/// [`Storage`] for the [`Interner`] used by [`DictEncoder`]
+#[derive(Debug, Default)]
+struct ByteArrayStorage {
+    /// Encoded dictionary data
+    page: Vec<u8>,
+
+    values: Vec<std::ops::Range<usize>>,
+}
+
+impl Storage for ByteArrayStorage {
+    type Key = u64;
+    type Value = [u8];
+
+    fn get(&self, idx: Self::Key) -> &Self::Value {
+        &self.page[self.values[idx as usize].clone()]
+    }
+
+    fn push(&mut self, value: &Self::Value) -> Self::Key {
+        let key = self.values.len();
+
+        self.page.reserve(4 + value.len());
+        self.page.extend_from_slice((value.len() as u32).as_bytes());
+
+        let start = self.page.len();
+        self.page.extend_from_slice(value);
+        self.values.push(start..self.page.len());
+
+        key as u64
+    }
+}
+
+/// A dictionary encoder for byte array data
+#[derive(Debug, Default)]
+struct DictEncoder {
+    interner: Interner<ByteArrayStorage>,
+    indices: Vec<u64>,
+}
+
+impl DictEncoder {
+    /// Encode `values` to the in-progress page
+    fn encode<T>(&mut self, values: T, indices: &[usize])
+    where
+        T: ArrayAccessor + Copy,
+        T::Item: AsRef<[u8]>,
+    {
+        self.indices.reserve(indices.len());
+
+        for idx in indices {
+            let value = values.value(*idx);
+            let interned = self.interner.intern(value.as_ref());
+            self.indices.push(interned);
+        }
+    }
+
+    fn bit_width(&self) -> u8 {
+        let length = self.interner.storage().values.len();
+        num_required_bits(length.saturating_sub(1) as u64)
+    }
+
+    fn estimated_data_page_size(&self) -> usize {
+        let bit_width = self.bit_width();
+        1 + RleEncoder::min_buffer_size(bit_width)
+            + RleEncoder::max_buffer_size(bit_width, self.indices.len())
+    }
+
+    fn estimated_dict_page_size(&self) -> usize {
+        self.interner.storage().page.len()
+    }
+
+    fn flush_dict_page(self) -> DictionaryPage {
+        let storage = self.interner.into_inner();
+
+        DictionaryPage {
+            buf: storage.page.into(),
+            num_values: storage.values.len(),
+            is_sorted: false,
+        }
+    }
+
+    fn flush_data_page(
+        &mut self,
+        min_value: Option<ByteArray>,
+        max_value: Option<ByteArray>,
+    ) -> DataPageValues<ByteArray> {
+        let num_values = self.indices.len();
+        let buffer_len = self.estimated_data_page_size();
+        let mut buffer = Vec::with_capacity(buffer_len);
+        buffer.push(self.bit_width() as u8);
+
+        let mut encoder = RleEncoder::new_from_buf(self.bit_width(), buffer);
+        for index in &self.indices {
+            encoder.put(*index as u64)
+        }
+
+        self.indices.clear();
+
+        DataPageValues {
+            buf: encoder.consume().into(),
+            num_values,
+            encoding: Encoding::RLE_DICTIONARY,
+            min_value,
+            max_value,
+        }
+    }
+}
+
+struct ByteArrayEncoder {
+    fallback: FallbackEncoder,
+    dict_encoder: Option<DictEncoder>,
+    num_values: usize,
+    min_value: Option<ByteArray>,
+    max_value: Option<ByteArray>,
+}
+
+impl ColumnValueEncoder for ByteArrayEncoder {
+    type T = ByteArray;
+    type Values = ArrayRef;
+
+    fn min_max(
+        &self,
+        values: &ArrayRef,
+        value_indices: Option<&[usize]>,
+    ) -> Option<(Self::T, Self::T)> {
+        match value_indices {
+            Some(indices) => {
+                let iter = indices.iter().cloned();
+                downcast_op!(values.data_type(), values, compute_min_max, iter)
+            }
+            None => {
+                let len = Array::len(values);
+                downcast_op!(values.data_type(), values, compute_min_max, 0..len)
+            }
+        }
+    }
+
+    fn try_new(descr: &ColumnDescPtr, props: &WriterProperties) -> Result<Self>
+    where
+        Self: Sized,
+    {
+        let dictionary = props
+            .dictionary_enabled(descr.path())
+            .then(DictEncoder::default);
+
+        let fallback = FallbackEncoder::new(descr, props)?;
+
+        Ok(Self {
+            fallback,
+            dict_encoder: dictionary,
+            num_values: 0,
+            min_value: None,
+            max_value: None,
+        })
+    }
+
+    fn write(
+        &mut self,
+        _values: &Self::Values,
+        _offset: usize,
+        _len: usize,
+    ) -> Result<()> {
+        unreachable!("should call write_gather instead")
+    }
+
+    fn write_gather(&mut self, values: &Self::Values, indices: &[usize]) -> Result<()> {
+        downcast_op!(values.data_type(), values, encode, indices, self);
+        Ok(())
+    }
+
+    fn num_values(&self) -> usize {
+        self.num_values
+    }
+
+    fn has_dictionary(&self) -> bool {
+        self.dict_encoder.is_some()
+    }
+
+    fn estimated_dict_page_size(&self) -> Option<usize> {
+        Some(self.dict_encoder.as_ref()?.estimated_dict_page_size())
+    }
+
+    fn estimated_data_page_size(&self) -> usize {
+        match &self.dict_encoder {
+            Some(encoder) => encoder.estimated_data_page_size(),
+            None => self.fallback.estimated_data_page_size(),
+        }
+    }
+
+    fn flush_dict_page(&mut self) -> Result<Option<DictionaryPage>> {
+        match self.dict_encoder.take() {
+            Some(encoder) => {
+                if self.num_values != 0 {
+                    return Err(general_err!(
+                        "Must flush data pages before flushing dictionary"
+                    ));
+                }
+
+                Ok(Some(encoder.flush_dict_page()))
+            }
+            _ => Ok(None),
+        }
+    }
+
+    fn flush_data_page(&mut self) -> Result<DataPageValues<ByteArray>> {
+        let min_value = self.min_value.take();
+        let max_value = self.max_value.take();
+
+        match &mut self.dict_encoder {
+            Some(encoder) => Ok(encoder.flush_data_page(min_value, max_value)),
+            _ => self.fallback.flush_data_page(min_value, max_value),
+        }
+    }
+}
+
+/// Encodes the provided `values` and `indices` to `encoder`
+///
+/// This is a free function so it can be used with `downcast_op!`
+fn encode<T>(values: T, indices: &[usize], encoder: &mut ByteArrayEncoder)
+where
+    T: ArrayAccessor + Copy,
+    T::Item: Copy + Ord + AsRef<[u8]>,
+{
+    if let Some((min, max)) = compute_min_max(values, indices.iter().cloned()) {
+        if encoder.min_value.as_ref().map_or(true, |m| m > &min) {
+            encoder.min_value = Some(min);
+        }
+
+        if encoder.max_value.as_ref().map_or(true, |m| m < &max) {
+            encoder.max_value = Some(max);
+        }
+    }
+
+    match &mut encoder.dict_encoder {
+        Some(dict_encoder) => dict_encoder.encode(values, indices),
+        None => encoder.fallback.encode(values, indices),
+    }
+}
+
+/// Computes the min and max for the provided array and indices
+///
+/// This is a free function so it can be used with `downcast_op!`
+fn compute_min_max<T>(
+    array: T,
+    mut valid: impl Iterator<Item = usize>,
+) -> Option<(ByteArray, ByteArray)>
+where
+    T: ArrayAccessor,
+    T::Item: Copy + Ord + AsRef<[u8]>,
+{
+    let first_idx = valid.next()?;
+
+    let first_val = array.value(first_idx);
+    let mut min = first_val;
+    let mut max = first_val;
+    for idx in valid {
+        let val = array.value(idx);
+        min = min.min(val);
+        max = max.max(val);
+    }
+    Some((min.as_ref().to_vec().into(), max.as_ref().to_vec().into()))
+}
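Two details of the encoder above lend themselves to a worked example: the DELTA_BYTE_ARRAY fallback stores each value as a shared-prefix length plus suffix, and the dictionary path writes ceil(log2(n)) bits per RLE index. A standalone restatement (not the crate's internal API):

    // Mirrors the prefix computation in FallbackEncoder::encode's Delta branch.
    fn shared_prefix(last: &[u8], value: &[u8]) -> usize {
        last.iter().zip(value).take_while(|(a, b)| a == b).count()
    }

    fn main() {
        // "hello" followed by "help" encodes as prefix_length 3 + suffix "p"
        assert_eq!(shared_prefix(b"hello", b"help"), 3);

        // A dictionary of 5 distinct values needs 3 bits per index,
        // matching bit_width()'s num_required_bits(5 - 1):
        assert_eq!(64 - 4u64.leading_zeros(), 3);
    }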
diff --git a/parquet/src/arrow/arrow_writer/levels.rs b/parquet/src/arrow/arrow_writer/levels.rs
index f88d986ea9e9..9a90d40d5a85 100644
--- a/parquet/src/arrow/arrow_writer/levels.rs
+++ b/parquet/src/arrow/arrow_writer/levels.rs
@@ -88,7 +88,7 @@ fn is_leaf(data_type: &DataType) -> bool {
         | DataType::Interval(_)
         | DataType::Binary
         | DataType::LargeBinary
-        | DataType::Decimal(_, _)
+        | DataType::Decimal128(_, _)
         | DataType::FixedSizeBinary(_)
     )
 }
diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs
index 8a79a116f548..08f37c395658 100644
--- a/parquet/src/arrow/arrow_writer/mod.rs
+++ b/parquet/src/arrow/arrow_writer/mod.rs
@@ -23,7 +23,6 @@ use std::sync::Arc;
 
 use arrow::array as arrow_array;
 use arrow::array::ArrayRef;
-use arrow::array::BasicDecimalArray;
 use arrow::datatypes::{DataType as ArrowDataType, IntervalUnit, SchemaRef};
 use arrow::record_batch::RecordBatch;
 use arrow_array::Array;
@@ -33,54 +32,18 @@ use super::schema::{
     decimal_length_from_precision,
 };
 
-use crate::column::writer::{get_column_writer, ColumnWriter};
+use crate::arrow::arrow_writer::byte_array::ByteArrayWriter;
+use crate::column::writer::{ColumnWriter, ColumnWriterImpl};
 use crate::errors::{ParquetError, Result};
 use crate::file::metadata::RowGroupMetaDataPtr;
 use crate::file::properties::WriterProperties;
-use crate::file::writer::{SerializedColumnWriter, SerializedRowGroupWriter};
+use crate::file::writer::SerializedRowGroupWriter;
 use crate::{data_type::*, file::writer::SerializedFileWriter};
 use levels::{calculate_array_levels, LevelInfo};
 
+mod byte_array;
 mod levels;
 
-/// An object-safe API for writing an [`ArrayRef`]
-trait ArrayWriter {
-    fn write(&mut self, array: &ArrayRef, levels: LevelInfo) -> Result<()>;
-
-    fn close(&mut self) -> Result<()>;
-}
-
-/// Fallback implementation for writing an [`ArrayRef`] that uses [`SerializedColumnWriter`]
-struct ColumnArrayWriter<'a>(Option<SerializedColumnWriter<'a>>);
-
-impl<'a> ArrayWriter for ColumnArrayWriter<'a> {
-    fn write(&mut self, array: &ArrayRef, levels: LevelInfo) -> Result<()> {
-        write_leaf(self.0.as_mut().unwrap().untyped(), array, levels)?;
-        Ok(())
-    }
-
-    fn close(&mut self) -> Result<()> {
-        self.0.take().unwrap().close()
-    }
-}
-
-fn get_writer<'a, W: Write>(
-    row_group_writer: &'a mut SerializedRowGroupWriter<'_, W>,
-) -> Result<Box<dyn ArrayWriter + 'a>> {
-    let array_writer = row_group_writer
-        .next_column_with_factory(|descr, props, page_writer, on_close| {
-            // TODO: Special case array readers (#1764)
-
-            let column_writer = get_column_writer(descr, props.clone(), page_writer);
-            let serialized_writer =
-                SerializedColumnWriter::new(column_writer, Some(on_close));
-
-            Ok(Box::new(ColumnArrayWriter(Some(serialized_writer))))
-        })?
-        .expect("Unable to get column writer");
-    Ok(array_writer)
-}
-
 /// Arrow writer
 ///
 /// Writes Arrow `RecordBatch`es to a Parquet writer, buffering up `RecordBatch` in order
@@ -298,22 +261,24 @@ fn write_leaves(
         | ArrowDataType::Time64(_)
         | ArrowDataType::Duration(_)
         | ArrowDataType::Interval(_)
-        | ArrowDataType::LargeBinary
-        | ArrowDataType::Binary
-        | ArrowDataType::Utf8
-        | ArrowDataType::LargeUtf8
-        | ArrowDataType::Decimal(_, _)
+        | ArrowDataType::Decimal128(_, _)
         | ArrowDataType::Decimal256(_, _)
         | ArrowDataType::FixedSizeBinary(_) => {
-            let mut writer = get_writer(row_group_writer)?;
+            let mut col_writer = row_group_writer.next_column()?.unwrap();
             for (array, levels) in arrays.iter().zip(levels.iter_mut()) {
-                writer.write(
-                    array,
-                    levels.pop().expect("Levels exhausted"),
-                )?;
+                write_leaf(col_writer.untyped(), array, levels.pop().expect("Levels exhausted"))?;
             }
-            writer.close()?;
-            Ok(())
+            col_writer.close()
+        }
+        ArrowDataType::LargeBinary
+        | ArrowDataType::Binary
+        | ArrowDataType::Utf8
+        | ArrowDataType::LargeUtf8 => {
+            let mut col_writer = row_group_writer.next_column_with_factory(ByteArrayWriter::new)?.unwrap();
+            for (array, levels) in arrays.iter().zip(levels.iter_mut()) {
+                col_writer.write(array, levels.pop().expect("Levels exhausted"))?;
+            }
+            col_writer.close()
         }
         ArrowDataType::List(_) | ArrowDataType::LargeList(_) => {
             let arrays: Vec<_> = arrays.iter().map(|array|{
@@ -364,18 +329,21 @@ fn write_leaves(
             write_leaves(row_group_writer, &values, levels)?;
             Ok(())
         }
-        ArrowDataType::Dictionary(_, value_type) => {
-            let mut writer = get_writer(row_group_writer)?;
-            for (array, levels) in arrays.iter().zip(levels.iter_mut()) {
-                // cast dictionary to a primitive
-                let array = arrow::compute::cast(array, value_type)?;
-                writer.write(
-                    &array,
-                    levels.pop().expect("Levels exhausted"),
-                )?;
+        ArrowDataType::Dictionary(_, value_type) => match value_type.as_ref() {
+            ArrowDataType::Utf8 | ArrowDataType::LargeUtf8 | ArrowDataType::Binary | ArrowDataType::LargeBinary => {
+                let mut col_writer = row_group_writer.next_column_with_factory(ByteArrayWriter::new)?.unwrap();
+                for (array, levels) in arrays.iter().zip(levels.iter_mut()) {
+                    col_writer.write(array, levels.pop().expect("Levels exhausted"))?;
+                }
+                col_writer.close()
+            }
+            _ => {
+                let mut col_writer = row_group_writer.next_column()?.unwrap();
+                for (array, levels) in arrays.iter().zip(levels.iter_mut()) {
+                    write_leaf(col_writer.untyped(), array, levels.pop().expect("Levels exhausted"))?;
+                }
+                col_writer.close()
             }
-            writer.close()?;
-            Ok(())
         }
         ArrowDataType::Float16 => Err(ParquetError::ArrowError(
             "Float16 arrays not supported".to_string(),
@@ -399,33 +367,25 @@ fn write_leaf(
     let indices = levels.non_null_indices();
     let written = match writer {
         ColumnWriter::Int32ColumnWriter(ref mut typed) => {
-            let values = match column.data_type() {
+            match column.data_type() {
                 ArrowDataType::Date64 => {
                     // If the column is a Date64, we cast it to a Date32, and then interpret that as Int32
-                    let array = if let ArrowDataType::Date64 = column.data_type() {
-                        let array = arrow::compute::cast(column, &ArrowDataType::Date32)?;
-                        arrow::compute::cast(&array, &ArrowDataType::Int32)?
-                    } else {
-                        arrow::compute::cast(column, &ArrowDataType::Int32)?
-                    };
+                    let array = arrow::compute::cast(column, &ArrowDataType::Date32)?;
+                    let array = arrow::compute::cast(&array, &ArrowDataType::Int32)?;
+
                     let array = array
                         .as_any()
                         .downcast_ref::<arrow_array::Int32Array>()
                         .expect("Unable to get int32 array");
-                    get_numeric_array_slice::<Int32Type, _>(array, indices)
+                    write_primitive(typed, array.values(), levels)?
                 }
                 ArrowDataType::UInt32 => {
+                    let data = column.data();
+                    let offset = data.offset();
                     // follow C++ implementation and use overflow/reinterpret cast from u32 to i32 which will map
                     // `(i32::MAX as u32)..u32::MAX` to `i32::MIN..0`
-                    let array = column
-                        .as_any()
-                        .downcast_ref::<arrow_array::UInt32Array>()
-                        .expect("Unable to get u32 array");
-                    let array = arrow::compute::unary::<_, _, arrow::datatypes::Int32Type>(
-                        array,
-                        |x| x as i32,
-                    );
-                    get_numeric_array_slice::<Int32Type, _>(&array, indices)
+                    let array: &[i32] = data.buffers()[0].typed_data();
+                    write_primitive(typed, &array[offset..offset + data.len()], levels)?
                 }
                 _ => {
                     let array = arrow::compute::cast(column, &ArrowDataType::Int32)?;
@@ -433,14 +393,9 @@ fn write_leaf(
                         .as_any()
                         .downcast_ref::<arrow_array::Int32Array>()
                         .expect("Unable to get i32 array");
-                    get_numeric_array_slice::<Int32Type, _>(array, indices)
+                    write_primitive(typed, array.values(), levels)?
                 }
-            };
-            typed.write_batch(
-                values.as_slice(),
-                levels.def_levels(),
-                levels.rep_levels(),
-            )?
+            }
         }
         ColumnWriter::BoolColumnWriter(ref mut typed) => {
             let array = column
@@ -454,26 +409,21 @@ fn write_leaf(
             )?
         }
         ColumnWriter::Int64ColumnWriter(ref mut typed) => {
-            let values = match column.data_type() {
+            match column.data_type() {
                 ArrowDataType::Int64 => {
                     let array = column
                         .as_any()
                         .downcast_ref::<arrow_array::Int64Array>()
                         .expect("Unable to get i64 array");
-                    get_numeric_array_slice::<Int64Type, _>(array, indices)
+                    write_primitive(typed, array.values(), levels)?
                 }
                 ArrowDataType::UInt64 => {
                     // follow C++ implementation and use overflow/reinterpret cast from u64 to i64 which will map
                     // `(i64::MAX as u64)..u64::MAX` to `i64::MIN..0`
-                    let array = column
-                        .as_any()
-                        .downcast_ref::<arrow_array::UInt64Array>()
-                        .expect("Unable to get u64 array");
-                    let array = arrow::compute::unary::<_, _, arrow::datatypes::Int64Type>(
-                        array,
-                        |x| x as i64,
-                    );
-                    get_numeric_array_slice::<Int64Type, _>(&array, indices)
+                    let data = column.data();
+                    let offset = data.offset();
+                    let array: &[i64] = data.buffers()[0].typed_data();
+                    write_primitive(typed, &array[offset..offset + data.len()], levels)?
                 }
                 _ => {
                     let array = arrow::compute::cast(column, &ArrowDataType::Int64)?;
@@ -481,14 +431,9 @@ fn write_leaf(
                        .as_any()
                        .downcast_ref::<arrow_array::Int64Array>()
                        .expect("Unable to get i64 array");
-                    get_numeric_array_slice::<Int64Type, _>(array, indices)
+                    write_primitive(typed, array.values(), levels)?
                 }
-            };
-            typed.write_batch(
-                values.as_slice(),
-                levels.def_levels(),
-                levels.rep_levels(),
-            )?
+            }
         }
         ColumnWriter::Int96ColumnWriter(ref mut _typed) => {
            unreachable!("Currently unreachable because data type not supported")
@@ -498,70 +443,18 @@ fn write_leaf(
                 .as_any()
                 .downcast_ref::<arrow_array::Float32Array>()
                 .expect("Unable to get Float32 array");
-            typed.write_batch(
-                get_numeric_array_slice::<FloatType, _>(array, indices).as_slice(),
-                levels.def_levels(),
-                levels.rep_levels(),
-            )?
+            write_primitive(typed, array.values(), levels)?
         }
         ColumnWriter::DoubleColumnWriter(ref mut typed) => {
             let array = column
                 .as_any()
                 .downcast_ref::<arrow_array::Float64Array>()
                 .expect("Unable to get Float64 array");
-            typed.write_batch(
-                get_numeric_array_slice::<DoubleType, _>(array, indices).as_slice(),
-                levels.def_levels(),
-                levels.rep_levels(),
-            )?
+            write_primitive(typed, array.values(), levels)?
+        }
+        ColumnWriter::ByteArrayColumnWriter(_) => {
+            unreachable!("should use ByteArrayWriter")
         }
-        ColumnWriter::ByteArrayColumnWriter(ref mut typed) => match column.data_type() {
-            ArrowDataType::Binary => {
-                let array = column
-                    .as_any()
-                    .downcast_ref::<arrow_array::BinaryArray>()
-                    .expect("Unable to get BinaryArray array");
-                typed.write_batch(
-                    get_binary_array(array).as_slice(),
-                    levels.def_levels(),
-                    levels.rep_levels(),
-                )?
-            }
-            ArrowDataType::Utf8 => {
-                let array = column
-                    .as_any()
-                    .downcast_ref::<arrow_array::StringArray>()
-                    .expect("Unable to get LargeBinaryArray array");
-                typed.write_batch(
-                    get_string_array(array).as_slice(),
-                    levels.def_levels(),
-                    levels.rep_levels(),
-                )?
-            }
-            ArrowDataType::LargeBinary => {
-                let array = column
-                    .as_any()
-                    .downcast_ref::<arrow_array::LargeBinaryArray>()
-                    .expect("Unable to get LargeBinaryArray array");
-                typed.write_batch(
-                    get_large_binary_array(array).as_slice(),
-                    levels.def_levels(),
-                    levels.rep_levels(),
-                )?
-            }
-            ArrowDataType::LargeUtf8 => {
-                let array = column
-                    .as_any()
-                    .downcast_ref::<arrow_array::LargeStringArray>()
-                    .expect("Unable to get LargeUtf8 array");
-                typed.write_batch(
-                    get_large_string_array(array).as_slice(),
-                    levels.def_levels(),
-                    levels.rep_levels(),
-                )?
-            }
-            _ => unreachable!("Currently unreachable because data type not supported"),
-        },
         ColumnWriter::FixedLenByteArrayColumnWriter(ref mut typed) => {
             let bytes = match column.data_type() {
                 ArrowDataType::Interval(interval_unit) => match interval_unit {
@@ -595,7 +488,7 @@ fn write_leaf(
                         .unwrap();
                     get_fsb_array_slice(array, indices)
                 }
-                ArrowDataType::Decimal(_, _) => {
+                ArrowDataType::Decimal128(_, _) => {
                     let array = column
                         .as_any()
                        .downcast_ref::<arrow_array::Decimal128Array>()
@@ -619,55 +512,20 @@ fn write_leaf(
     Ok(written as i64)
 }
 
-macro_rules! def_get_binary_array_fn {
-    ($name:ident, $ty:ty) => {
-        fn $name(array: &$ty) -> Vec<ByteArray> {
-            let mut byte_array = ByteArray::new();
-            let ptr = crate::util::memory::ByteBufferPtr::new(
-                array.value_data().as_slice().to_vec(),
-            );
-            byte_array.set_data(ptr);
-            array
-                .value_offsets()
-                .windows(2)
-                .enumerate()
-                .filter_map(|(i, offsets)| {
-                    if array.is_valid(i) {
-                        let start = offsets[0] as usize;
-                        let len = offsets[1] as usize - start;
-                        Some(byte_array.slice(start, len))
-                    } else {
-                        None
-                    }
-                })
-                .collect()
-        }
-    };
-}
-
-// TODO: These methods don't handle non null indices correctly (#1753)
-def_get_binary_array_fn!(get_binary_array, arrow_array::BinaryArray);
-def_get_binary_array_fn!(get_string_array, arrow_array::StringArray);
-def_get_binary_array_fn!(get_large_binary_array, arrow_array::LargeBinaryArray);
-def_get_binary_array_fn!(get_large_string_array, arrow_array::LargeStringArray);
-
-/// Get the underlying numeric array slice, skipping any null values.
-/// If there are no null values, it might be quicker to get the slice directly instead of
-/// calling this function.
-fn get_numeric_array_slice<T, A>(
-    array: &arrow_array::PrimitiveArray<A>,
-    indices: &[usize],
-) -> Vec<T::T>
-where
-    T: DataType,
-    A: arrow::datatypes::ArrowNumericType,
-    T::T: From<A::Native>,
-{
-    let mut values = Vec::with_capacity(indices.len());
-    for i in indices {
-        values.push(array.value(*i).into())
-    }
-    values
+fn write_primitive<'a, T: DataType>(
+    writer: &mut ColumnWriterImpl<'a, T>,
+    values: &[T::T],
+    levels: LevelInfo,
+) -> Result<usize> {
+    writer.write_batch_internal(
+        values,
+        Some(levels.non_null_indices()),
+        levels.def_levels(),
+        levels.rep_levels(),
+        None,
+        None,
+        None,
+    )
 }
 
 fn get_bool_array_slice(
@@ -756,7 +614,9 @@ mod tests {
     use arrow::{array::*, buffer::Buffer};
 
     use crate::arrow::{ArrowReader, ParquetFileArrowReader};
+    use crate::basic::Encoding;
     use crate::file::metadata::ParquetMetaData;
+    use crate::file::properties::WriterVersion;
     use crate::file::{
         reader::{FileReader, SerializedFileReader},
         statistics::Statistics,
@@ -952,7 +812,7 @@ mod tests {
 
     #[test]
     fn arrow_writer_decimal() {
-        let decimal_field = Field::new("a", DataType::Decimal(5, 2), false);
+        let decimal_field = Field::new("a", DataType::Decimal128(5, 2), false);
         let schema = Schema::new(vec![decimal_field]);
 
         let decimal_values = vec![10_000, 50_000, 0, -100]
@@ -1226,20 +1086,34 @@ mod tests {
 
     const SMALL_SIZE: usize = 7;
 
-    fn roundtrip(expected_batch: RecordBatch, max_row_group_size: Option<usize>) -> File {
+    fn roundtrip(
+        expected_batch: RecordBatch,
+        max_row_group_size: Option<usize>,
+    ) -> Vec<File> {
+        let mut files = vec![];
+        for version in [WriterVersion::PARQUET_1_0, WriterVersion::PARQUET_2_0] {
+            let mut props = WriterProperties::builder().set_writer_version(version);
+
+            if let Some(size) = max_row_group_size {
+                props = props.set_max_row_group_size(size)
+            }
+
+            let props = props.build();
+            files.push(roundtrip_opts(&expected_batch, props))
+        }
+        files
+    }
+
+    fn roundtrip_opts(expected_batch: &RecordBatch, props: WriterProperties) -> File {
         let file = tempfile::tempfile().unwrap();
 
         let mut writer = ArrowWriter::try_new(
             file.try_clone().unwrap(),
             expected_batch.schema(),
-            max_row_group_size.map(|size| {
-                WriterProperties::builder()
-                    .set_max_row_group_size(size)
-                    .build()
-            }),
+            Some(props),
         )
         .expect("Unable to write file");
-        writer.write(&expected_batch).unwrap();
+        writer.write(expected_batch).unwrap();
         writer.close().unwrap();
 
         let mut arrow_reader =
@@ -1264,20 +1138,59 @@ mod tests {
         file
     }
 
-    fn one_column_roundtrip(
-        values: ArrayRef,
-        nullable: bool,
-        max_row_group_size: Option<usize>,
-    ) -> File {
-        let schema = Schema::new(vec![Field::new(
-            "col",
-            values.data_type().clone(),
-            nullable,
-        )]);
-        let expected_batch =
-            RecordBatch::try_new(Arc::new(schema), vec![values]).unwrap();
+    fn one_column_roundtrip(values: ArrayRef, nullable: bool) -> Vec<File> {
+        let data_type = values.data_type().clone();
+        let schema = Schema::new(vec![Field::new("col", data_type, nullable)]);
+        one_column_roundtrip_with_schema(values, Arc::new(schema))
+    }
 
-        roundtrip(expected_batch, max_row_group_size)
+    fn one_column_roundtrip_with_schema(
+        values: ArrayRef,
+        schema: SchemaRef,
+    ) -> Vec<File> {
+        let encodings = match values.data_type() {
+            DataType::Utf8
+            | DataType::LargeUtf8
+            | DataType::Binary
+            | DataType::LargeBinary => vec![
+                Encoding::PLAIN,
+                Encoding::DELTA_BYTE_ARRAY,
+                Encoding::DELTA_LENGTH_BYTE_ARRAY,
+            ],
+            DataType::Int64
+            | DataType::Int32
+            | DataType::Int16
+            | DataType::Int8
+            | DataType::UInt64
+            | DataType::UInt32
+            | DataType::UInt16
+            | DataType::UInt8 => vec![Encoding::PLAIN, Encoding::DELTA_BINARY_PACKED],
+            _ => vec![Encoding::PLAIN],
+        };
+
+        let expected_batch = RecordBatch::try_new(schema, vec![values]).unwrap();
+
+        let row_group_sizes = [1024, SMALL_SIZE, SMALL_SIZE / 2, SMALL_SIZE / 2 + 1, 10];
+
+        let mut files = vec![];
+        for dictionary_size in [0, 1, 1024] {
+            for encoding in &encodings {
+                for version in [WriterVersion::PARQUET_1_0, WriterVersion::PARQUET_2_0] {
+                    for row_group_size in row_group_sizes {
+                        let props = WriterProperties::builder()
+                            .set_writer_version(version)
+                            .set_max_row_group_size(row_group_size)
+                            .set_dictionary_enabled(dictionary_size != 0)
+                            .set_dictionary_pagesize_limit(dictionary_size.max(1))
+                            .set_encoding(*encoding)
+                            .build();
+
+                        files.push(roundtrip_opts(&expected_batch, props))
+                    }
+                }
+            }
+        }
+        files
     }
 
     fn values_required<A, I>(iter: I)
     where
@@ -1287,7 +1200,7 @@ mod tests {
    {
         let raw_values: Vec<_> = iter.into_iter().collect();
         let values = Arc::new(A::from(raw_values));
-        one_column_roundtrip(values, false, Some(SMALL_SIZE / 2));
+        one_column_roundtrip(values, false);
     }
 
     fn values_optional<A, I>(iter: I)
     where
@@ -1301,7 +1214,7 @@ mod tests {
             .map(|(i, v)| if i % 2 == 0 { None } else { Some(v) })
             .collect();
         let optional_values = Arc::new(A::from(optional_raw_values));
-        one_column_roundtrip(optional_values, true, Some(SMALL_SIZE / 2));
+        one_column_roundtrip(optional_values, true);
    }
 
     fn required_and_optional<A, I>(iter: I)
@@ -1316,12 +1229,12 @@ mod tests {
     #[test]
     fn all_null_primitive_single_column() {
         let values = Arc::new(Int32Array::from(vec![None; SMALL_SIZE]));
-        one_column_roundtrip(values, true, Some(SMALL_SIZE / 2));
+        one_column_roundtrip(values, true);
     }
 
     #[test]
     fn null_single_column() {
         let values = Arc::new(NullArray::new(SMALL_SIZE));
-        one_column_roundtrip(values, true, Some(SMALL_SIZE / 2));
+        one_column_roundtrip(values, true);
 
         // null arrays are always nullable, a test with non-nullable nulls fails
     }
@@ -1417,7 +1330,7 @@ mod tests {
         let raw_values: Vec<_> = (0..SMALL_SIZE as i64).collect();
         let values = Arc::new(TimestampSecondArray::from_vec(raw_values, None));
 
-        one_column_roundtrip(values, false, Some(3));
+        one_column_roundtrip(values, false);
     }
 
     #[test]
@@ -1425,7 +1338,7 @@ mod tests {
         let raw_values: Vec<_> = (0..SMALL_SIZE as i64).collect();
         let values = Arc::new(TimestampMillisecondArray::from_vec(raw_values, None));
 
-        one_column_roundtrip(values, false, Some(SMALL_SIZE / 2 + 1));
+        one_column_roundtrip(values, false);
     }
 
     #[test]
@@ -1433,7 +1346,7 @@ mod tests {
         let raw_values: Vec<_> = (0..SMALL_SIZE as i64).collect();
         let values = Arc::new(TimestampMicrosecondArray::from_vec(raw_values, None));
 
-        one_column_roundtrip(values, false, Some(SMALL_SIZE / 2 + 2));
+        one_column_roundtrip(values, false);
     }
 
     #[test]
@@ -1441,7 +1354,7 @@ mod tests {
         let raw_values: Vec<_> = (0..SMALL_SIZE as i64).collect();
         let values = Arc::new(TimestampNanosecondArray::from_vec(raw_values, None));
 
-        one_column_roundtrip(values, false, Some(SMALL_SIZE / 2));
+        one_column_roundtrip(values, false);
     }
 
    #[test]
@@ -1548,7 +1461,7 @@ mod tests {
         builder.append_value(b"1112").unwrap();
         let array = Arc::new(builder.finish());
-        one_column_roundtrip(array, true, Some(SMALL_SIZE / 2));
+        one_column_roundtrip(array, true);
     }
 
     #[test]
@@ -1626,7 +1539,7 @@ mod tests {
         let a = ListArray::from(a_list_data);
         let values = Arc::new(a);
-        one_column_roundtrip(values, true, Some(SMALL_SIZE / 2));
+        one_column_roundtrip(values, true);
     }
 
     #[test]
@@ -1652,7 +1565,7 @@ mod tests {
         let a = LargeListArray::from(a_list_data);
         let values = Arc::new(a);
-        one_column_roundtrip(values, true, Some(SMALL_SIZE / 2));
+        one_column_roundtrip(values, true);
     }
 
     #[test]
@@ -1668,10 +1581,10 @@ mod tests {
         ];
 
         let list = ListArray::from_iter_primitive::<Int32Type, _, _>(data.clone());
-        one_column_roundtrip(Arc::new(list), true, Some(SMALL_SIZE / 2));
+        one_column_roundtrip(Arc::new(list), true);
 
         let list = LargeListArray::from_iter_primitive::<Int32Type, _, _>(data);
-        one_column_roundtrip(Arc::new(list), true, Some(SMALL_SIZE / 2));
+        one_column_roundtrip(Arc::new(list), true);
     }
 
     #[test]
@@ -1681,7 +1594,7 @@ mod tests {
         let s = StructArray::from(vec![(struct_field_a, Arc::new(a_values) as ArrayRef)]);
 
         let values = Arc::new(s);
-        one_column_roundtrip(values, false, Some(SMALL_SIZE / 2));
+        one_column_roundtrip(values, false);
     }
 
     #[test]
@@ -1702,9 +1615,7 @@ mod tests {
             .collect();
 
         // build a record batch
-        let expected_batch = RecordBatch::try_new(schema, vec![Arc::new(d)]).unwrap();
-
-        roundtrip(expected_batch, Some(SMALL_SIZE / 2));
+        one_column_roundtrip_with_schema(Arc::new(d), schema);
     }
 
     #[test]
@@ -1728,10 +1639,7 @@ mod tests {
         builder.append(12345678).unwrap();
         let d = builder.finish();
 
-        // build a record batch
-        let expected_batch = RecordBatch::try_new(schema, vec![Arc::new(d)]).unwrap();
-
-        roundtrip(expected_batch, Some(SMALL_SIZE / 2));
+        one_column_roundtrip_with_schema(Arc::new(d), schema);
     }
 
     #[test]
@@ -1751,16 +1659,13 @@ mod tests {
             .copied()
             .collect();
 
-        // build a record batch
-        let expected_batch = RecordBatch::try_new(schema, vec![Arc::new(d)]).unwrap();
-
-        roundtrip(expected_batch, Some(SMALL_SIZE / 2));
+        one_column_roundtrip_with_schema(Arc::new(d), schema);
     }
 
     #[test]
     fn u32_min_max() {
         // check values roundtrip through parquet
-        let values = Arc::new(UInt32Array::from_iter_values(vec![
+        let src = vec![
             u32::MIN,
             u32::MIN + 1,
             (i32::MAX as u32) - 1,
@@ -1768,30 +1673,40 @@ mod tests {
             (i32::MAX as u32) + 1,
             u32::MAX - 1,
             u32::MAX,
-        ]));
-        let file = one_column_roundtrip(values, false, None);
-
-        // check statistics are valid
-        let reader = SerializedFileReader::new(file).unwrap();
-        let metadata = reader.metadata();
-        assert_eq!(metadata.num_row_groups(), 1);
-        let row_group = metadata.row_group(0);
-        assert_eq!(row_group.num_columns(), 1);
-        let column = row_group.column(0);
-        let stats = column.statistics().unwrap();
-        assert!(stats.has_min_max_set());
-        if let Statistics::Int32(stats) = stats {
-            assert_eq!(*stats.min() as u32, u32::MIN);
-            assert_eq!(*stats.max() as u32, u32::MAX);
-        } else {
-            panic!("Statistics::Int32 missing")
+        ];
+        let values = Arc::new(UInt32Array::from_iter_values(src.iter().cloned()));
+        let files = one_column_roundtrip(values, false);
+
+        for file in files {
+            // check statistics are valid
+            let reader = SerializedFileReader::new(file).unwrap();
+            let metadata = reader.metadata();
+
+            let mut row_offset = 0;
+            for row_group in metadata.row_groups() {
+                assert_eq!(row_group.num_columns(), 1);
+                let column = row_group.column(0);
+
+                let num_values = column.num_values() as usize;
+                let src_slice = &src[row_offset..row_offset + num_values];
+                row_offset += column.num_values() as usize;
+
+                let stats = column.statistics().unwrap();
+                assert!(stats.has_min_max_set());
+                if let Statistics::Int32(stats) = stats {
+                    assert_eq!(*stats.min() as u32, *src_slice.iter().min().unwrap());
+                    assert_eq!(*stats.max() as u32, *src_slice.iter().max().unwrap());
+                } else {
+                    panic!("Statistics::Int32 missing")
+                }
+            }
         }
     }
 
     #[test]
     fn u64_min_max() {
         // check values roundtrip through parquet
-        let values = Arc::new(UInt64Array::from_iter_values(vec![
+        let src = vec![
             u64::MIN,
             u64::MIN + 1,
             (i64::MAX as u64) - 1,
@@ -1799,23 +1714,33 @@ mod tests {
             (i64::MAX as u64) + 1,
             u64::MAX - 1,
             u64::MAX,
-        ]));
-        let file = one_column_roundtrip(values, false, None);
-
-        // check statistics are valid
-        let reader = SerializedFileReader::new(file).unwrap();
-        let metadata = reader.metadata();
-        assert_eq!(metadata.num_row_groups(), 1);
-        let row_group = metadata.row_group(0);
-        assert_eq!(row_group.num_columns(), 1);
-        let column = row_group.column(0);
-        let stats = column.statistics().unwrap();
-        assert!(stats.has_min_max_set());
-        if let Statistics::Int64(stats) = stats {
-            assert_eq!(*stats.min() as u64, u64::MIN);
-            assert_eq!(*stats.max() as u64, u64::MAX);
-        } else {
-            panic!("Statistics::Int64 missing")
+        ];
+        let values = Arc::new(UInt64Array::from_iter_values(src.iter().cloned()));
+        let files = one_column_roundtrip(values, false);
+
+        for file in files {
+            // check statistics are valid
+            let reader = SerializedFileReader::new(file).unwrap();
+            let metadata = reader.metadata();
+
+            let mut row_offset = 0;
+            for row_group in metadata.row_groups() {
+                assert_eq!(row_group.num_columns(), 1);
+                let column = row_group.column(0);
+
+                let num_values = column.num_values() as usize;
+                let src_slice = &src[row_offset..row_offset + num_values];
+                row_offset += column.num_values() as usize;
+
+                let stats = column.statistics().unwrap();
+                assert!(stats.has_min_max_set());
+                if let Statistics::Int64(stats) = stats {
+                    assert_eq!(*stats.min() as u64, *src_slice.iter().min().unwrap());
+                    assert_eq!(*stats.max() as u64, *src_slice.iter().max().unwrap());
+                } else {
+                    panic!("Statistics::Int64 missing")
+                }
+            }
         }
     }
 
@@ -1823,17 +1748,19 @@ mod tests {
     fn statistics_null_counts_only_nulls() {
         // check that null-count statistics for "only NULL"-columns are correct
         let values = Arc::new(UInt64Array::from(vec![None, None]));
-        let file = one_column_roundtrip(values, true, None);
-
-        // check statistics are valid
-        let reader = SerializedFileReader::new(file).unwrap();
-        let metadata = reader.metadata();
-        assert_eq!(metadata.num_row_groups(), 1);
-        let row_group = metadata.row_group(0);
-        assert_eq!(row_group.num_columns(), 1);
-        let column = row_group.column(0);
-        let stats = column.statistics().unwrap();
-        assert_eq!(stats.null_count(), 2);
+        let files = one_column_roundtrip(values, true);
+
+        for file in files {
+            // check statistics are valid
+            let reader = SerializedFileReader::new(file).unwrap();
+            let metadata = reader.metadata();
+            assert_eq!(metadata.num_row_groups(), 1);
+            let row_group = metadata.row_group(0);
+            assert_eq!(row_group.num_columns(), 1);
+            let column = row_group.column(0);
+            let stats = column.statistics().unwrap();
+            assert_eq!(stats.null_count(), 2);
+        }
     }
 
     #[test]
@@ -1923,7 +1850,7 @@ mod tests {
 
         let array = Arc::new(list_builder.finish());
 
-        one_column_roundtrip(array, true, Some(10));
+        one_column_roundtrip(array, true);
     }
 
     fn row_group_sizes(metadata: &ParquetMetaData) -> Vec<i64> {
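One behavior worth pinning down from write_leaf's UInt32/UInt64 branches above: the overflow/reinterpret cast is lossless, it just relocates the upper half of the unsigned range below zero. A self-contained check of the mapping described in the comments:

    fn main() {
        let vals: [u32; 3] = [0, (i32::MAX as u32) + 1, u32::MAX];
        let as_i32: Vec<i32> = vals.iter().map(|&x| x as i32).collect();
        // `(i32::MAX as u32)..u32::MAX` maps to `i32::MIN..0`, as the comment says
        assert_eq!(as_i32, [0, i32::MIN, -1]);

        // Casting back restores the original unsigned values
        let back: Vec<u32> = as_i32.iter().map(|&x| x as u32).collect();
        assert_eq!(back, vals);
    }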
arrow::datatypes::SchemaRef; use arrow::record_batch::RecordBatch; use crate::arrow::array_reader::{build_array_reader, RowGroupCollection}; -use crate::arrow::arrow_reader::ParquetRecordBatchReader; +use crate::arrow::arrow_reader::{ + evaluate_predicate, ParquetRecordBatchReader, RowFilter, RowSelection, +}; use crate::arrow::schema::parquet_to_arrow_schema; use crate::arrow::ProjectionMask; use crate::basic::Compression; @@ -102,13 +105,13 @@ use crate::column::page::{Page, PageIterator, PageMetadata, PageReader}; use crate::compression::{create_codec, Codec}; use crate::errors::{ParquetError, Result}; use crate::file::footer::{decode_footer, decode_metadata}; -use crate::file::metadata::ParquetMetaData; +use crate::file::metadata::{ParquetMetaData, RowGroupMetaData}; use crate::file::serialized_reader::{decode_page, read_page_header}; use crate::file::FOOTER_SIZE; use crate::schema::types::{ColumnDescPtr, SchemaDescPtr, SchemaDescriptor}; /// The asynchronous interface used by [`ParquetRecordBatchStream`] to read parquet files -pub trait AsyncFileReader { +pub trait AsyncFileReader: Send { /// Retrieve the bytes in `range` fn get_bytes(&mut self, range: Range) -> BoxFuture<'_, Result>; @@ -116,10 +119,7 @@ pub trait AsyncFileReader { fn get_byte_ranges( &mut self, ranges: Vec>, - ) -> BoxFuture<'_, Result>> - where - Self: Send, - { + ) -> BoxFuture<'_, Result>> { async move { let mut result = Vec::with_capacity(ranges.len()); @@ -139,6 +139,23 @@ pub trait AsyncFileReader { fn get_metadata(&mut self) -> BoxFuture<'_, Result>>; } +impl AsyncFileReader for Box { + fn get_bytes(&mut self, range: Range) -> BoxFuture<'_, Result> { + self.as_mut().get_bytes(range) + } + + fn get_byte_ranges( + &mut self, + ranges: Vec>, + ) -> BoxFuture<'_, Result>> { + self.as_mut().get_byte_ranges(ranges) + } + + fn get_metadata(&mut self) -> BoxFuture<'_, Result>> { + self.as_mut().get_metadata() + } +} + impl AsyncFileReader for T { fn get_bytes(&mut self, range: Range) -> BoxFuture<'_, Result> { async move { @@ -195,9 +212,13 @@ pub struct ParquetRecordBatchStreamBuilder { row_groups: Option>, projection: ProjectionMask, + + filter: Option, + + selection: Option, } -impl ParquetRecordBatchStreamBuilder { +impl ParquetRecordBatchStreamBuilder { /// Create a new [`ParquetRecordBatchStreamBuilder`] with the provided parquet file pub async fn new(mut input: T) -> Result { let metadata = input.get_metadata().await?; @@ -214,6 +235,8 @@ impl ParquetRecordBatchStreamBuilder { batch_size: 1024, row_groups: None, projection: ProjectionMask::all(), + filter: None, + selection: None, }) } @@ -253,6 +276,32 @@ impl ParquetRecordBatchStreamBuilder { } } + /// Provide a [`RowSelection] to filter out rows, and avoid fetching their + /// data into memory + /// + /// Row group filtering is applied prior to this, and rows from skipped + /// row groups should not be included in the [`RowSelection`] + /// + /// TODO: Make public once stable (#1792) + #[allow(unused)] + pub(crate) fn with_row_selection(self, selection: RowSelection) -> Self { + Self { + selection: Some(selection), + ..self + } + } + + /// Provide a [`RowFilter`] to skip decoding rows + /// + /// TODO: Make public once stable (#1792) + #[allow(unused)] + pub(crate) fn with_row_filter(self, filter: RowFilter) -> Self { + Self { + filter: Some(filter), + ..self + } + } + /// Build a new [`ParquetRecordBatchStream`] pub fn build(self) -> Result> { let num_row_groups = self.metadata.row_groups().len(); @@ -271,25 +320,122 @@ impl ParquetRecordBatchStreamBuilder 
@@ -271,25 +320,122 @@ impl<T: AsyncFileReader> ParquetRecordBatchStreamBuilder<T> {
             None => (0..self.metadata.row_groups().len()).collect(),
         };
 
+        let reader = ReaderFactory {
+            input: self.input,
+            filter: self.filter,
+            metadata: self.metadata.clone(),
+            schema: self.schema.clone(),
+        };
+
         Ok(ParquetRecordBatchStream {
+            metadata: self.metadata,
+            batch_size: self.batch_size,
             row_groups,
             projection: self.projection,
-            batch_size: self.batch_size,
-            metadata: self.metadata,
+            selection: self.selection,
             schema: self.schema,
-            input: Some(self.input),
+            reader: Some(reader),
             state: StreamState::Init,
         })
     }
 }
 
+type ReadResult<T> = Result<(ReaderFactory<T>, Option<ParquetRecordBatchReader>)>;
+
+/// [`ReaderFactory`] is used by [`ParquetRecordBatchStream`] to create
+/// [`ParquetRecordBatchReader`]
+struct ReaderFactory<T> {
+    metadata: Arc<ParquetMetaData>,
+
+    schema: SchemaRef,
+
+    input: T,
+
+    filter: Option<RowFilter>,
+}
+
+impl<T> ReaderFactory<T>
+where
+    T: AsyncFileReader + Send,
+{
+    /// Reads the next row group with the provided `selection`, `projection` and `batch_size`
+    ///
+    /// Note: this captures self so that the resulting future has a static lifetime
+    async fn read_row_group(
+        mut self,
+        row_group_idx: usize,
+        mut selection: Option<RowSelection>,
+        projection: ProjectionMask,
+        batch_size: usize,
+    ) -> ReadResult<T> {
+        // TODO: calling build_array_reader multiple times is wasteful
+        let selects_any = |selection: Option<&RowSelection>| {
+            selection.map(|x| x.selects_any()).unwrap_or(true)
+        };
+
+        let meta = self.metadata.row_group(row_group_idx);
+        let mut row_group = InMemoryRowGroup {
+            schema: meta.schema_descr_ptr(),
+            row_count: meta.num_rows() as usize,
+            column_chunks: vec![None; meta.columns().len()],
+        };
+
+        if let Some(filter) = self.filter.as_mut() {
+            for predicate in filter.predicates.iter_mut() {
+                if !selects_any(selection.as_ref()) {
+                    return Ok((self, None));
+                }
+
+                let predicate_projection = predicate.projection().clone();
+                row_group
+                    .fetch(
+                        &mut self.input,
+                        meta,
+                        &predicate_projection,
+                        selection.as_ref(),
+                    )
+                    .await?;
+
+                let array_reader = build_array_reader(
+                    self.schema.clone(),
+                    predicate_projection,
+                    &row_group,
+                )?;
+
+                selection = Some(evaluate_predicate(
+                    batch_size,
+                    array_reader,
+                    selection,
+                    predicate.as_mut(),
+                )?);
+            }
+        }
+
+        if !selects_any(selection.as_ref()) {
+            return Ok((self, None));
+        }
+
+        row_group
+            .fetch(&mut self.input, meta, &projection, selection.as_ref())
+            .await?;
+
+        let reader = ParquetRecordBatchReader::new(
+            batch_size,
+            build_array_reader(self.schema.clone(), projection, &row_group)?,
+            selection,
+        );
+
+        Ok((self, Some(reader)))
+    }
+}
+
 enum StreamState<T> {
     /// At the start of a new row group, or the end of the parquet stream
     Init,
     /// Decoding a batch
     Decoding(ParquetRecordBatchReader),
     /// Reading data from input
-    Reading(BoxFuture<'static, Result<(T, InMemoryRowGroup)>>),
+    Reading(BoxFuture<'static, ReadResult<T>>),
     /// Error
     Error,
 }
 
@@ -305,20 +451,23 @@ impl<T> std::fmt::Debug for StreamState<T> {
     }
 }
 
-/// An asynchronous [`Stream`] of [`RecordBatch`] for a parquet file
+/// An asynchronous [`Stream`] of [`RecordBatch`] for a parquet file that can be
+/// constructed using [`ParquetRecordBatchStreamBuilder`]
 pub struct ParquetRecordBatchStream<T> {
     metadata: Arc<ParquetMetaData>,
 
     schema: SchemaRef,
 
-    batch_size: usize,
+    row_groups: VecDeque<usize>,
 
     projection: ProjectionMask,
 
-    row_groups: VecDeque<usize>,
+    batch_size: usize,
+
+    selection: Option<RowSelection>,
 
     /// This is an option so it can be moved into a future
-    input: Option<T>,
+    reader: Option<ReaderFactory<T>>,
 
     state: StreamState<T>,
 }
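The net effect on consumers: `ParquetRecordBatchStream` still implements `futures::Stream` yielding `ArrowResult<RecordBatch>`, so the move to an internal `ReaderFactory` is invisible at the call site. A minimal consumption sketch, assuming the `async` feature and a hypothetical file name:

```rust
use futures::TryStreamExt;
use parquet::arrow::async_reader::ParquetRecordBatchStreamBuilder;

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let file = tokio::fs::File::open("data.parquet").await?;

    let stream = ParquetRecordBatchStreamBuilder::new(file)
        .await?
        .with_batch_size(8192)
        .build()?;

    // Row groups are fetched and decoded lazily as the stream is polled.
    let batches = stream.try_collect::<Vec<_>>().await?;
    println!("read {} batches", batches.len());
    Ok(())
}
```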
@@ -370,101 +519,40 @@ where
                     None => return Poll::Ready(None),
                 };
 
-                let metadata = self.metadata.clone();
-                let mut input = match self.input.take() {
-                    Some(input) => input,
-                    None => {
-                        self.state = StreamState::Error;
-                        return Poll::Ready(Some(Err(general_err!(
-                            "input stream lost"
-                        ))));
-                    }
-                };
-
-                let projection = self.projection.clone();
-                self.state = StreamState::Reading(
-                    async move {
-                        let row_group_metadata = metadata.row_group(row_group_idx);
-                        let mut column_chunks =
-                            vec![None; row_group_metadata.columns().len()];
-
-                        // TODO: Combine consecutive ranges
-                        let fetch_ranges = (0..column_chunks.len())
-                            .into_iter()
-                            .filter_map(|idx| {
-                                if !projection.leaf_included(idx) {
-                                    None
-                                } else {
-                                    let column = row_group_metadata.column(idx);
-                                    let (start, length) = column.byte_range();
-
-                                    Some(start as usize..(start + length) as usize)
-                                }
-                            })
-                            .collect();
-
-                        let mut chunk_data =
-                            input.get_byte_ranges(fetch_ranges).await?.into_iter();
-
-                        for (idx, chunk) in column_chunks.iter_mut().enumerate() {
-                            if !projection.leaf_included(idx) {
-                                continue;
-                            }
-
-                            let column = row_group_metadata.column(idx);
-
-                            if let Some(data) = chunk_data.next() {
-                                *chunk = Some(InMemoryColumnChunk {
-                                    num_values: column.num_values(),
-                                    compression: column.compression(),
-                                    physical_type: column.column_type(),
-                                    data,
-                                });
-                            }
-                        }
-
-                        Ok((
-                            input,
-                            InMemoryRowGroup {
-                                schema: metadata.file_metadata().schema_descr_ptr(),
-                                row_count: row_group_metadata.num_rows() as usize,
-                                column_chunks,
-                            },
-                        ))
-                    }
-                    .boxed(),
-                )
-            }
-            StreamState::Reading(f) => {
-                let result = futures::ready!(f.poll_unpin(cx));
-                self.state = StreamState::Init;
-
-                let row_group: Box<dyn RowGroupCollection> = match result {
-                    Ok((input, row_group)) => {
-                        self.input = Some(input);
-                        Box::new(row_group)
-                    }
-                    Err(e) => {
-                        self.state = StreamState::Error;
-                        return Poll::Ready(Some(Err(e)));
-                    }
-                };
+                let reader = self.reader.take().expect("lost reader");
 
-                let parquet_schema = self.metadata.file_metadata().schema_descr_ptr();
+                let row_count =
+                    self.metadata.row_group(row_group_idx).num_rows() as usize;
 
-                let array_reader = build_array_reader(
-                    parquet_schema,
-                    self.schema.clone(),
-                    self.projection.clone(),
-                    row_group,
-                )?;
+                let selection =
+                    self.selection.as_mut().map(|s| s.split_off(row_count));
 
-                let batch_reader =
-                    ParquetRecordBatchReader::try_new(self.batch_size, array_reader)
-                        .expect("reader");
+                let fut = reader
+                    .read_row_group(
+                        row_group_idx,
+                        selection,
+                        self.projection.clone(),
+                        self.batch_size,
+                    )
+                    .boxed();
 
-                self.state = StreamState::Decoding(batch_reader)
+                self.state = StreamState::Reading(fut)
             }
+            StreamState::Reading(f) => match ready!(f.poll_unpin(cx)) {
+                Ok((reader_factory, maybe_reader)) => {
+                    self.reader = Some(reader_factory);
+                    match maybe_reader {
+                        // Read records from [`ParquetRecordBatchReader`]
+                        Some(reader) => self.state = StreamState::Decoding(reader),
+                        // All rows skipped, read next row group
+                        None => self.state = StreamState::Init,
+                    }
+                }
+                Err(e) => {
+                    self.state = StreamState::Error;
+                    return Poll::Ready(Some(Err(e)));
+                }
+            },
             StreamState::Error => return Poll::Pending,
         }
    }
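One design note on the rewritten state machine: `read_row_group` consumes the `ReaderFactory` and returns it inside `ReadResult`, which is what lets the boxed future above be `'static` (nothing borrows from the stream). The pattern in isolation, with hypothetical names, not code from this diff:

```rust
use futures::future::{BoxFuture, FutureExt};

struct Factory {
    calls: usize,
}

impl Factory {
    // Taking `self` by value means the returned future owns everything it
    // needs, so it can be boxed as BoxFuture<'static, _>.
    async fn step(mut self, input: usize) -> (Self, usize) {
        self.calls += 1;
        let output = input + self.calls;
        (self, output)
    }
}

fn demo(slot: &mut Option<Factory>) -> BoxFuture<'static, (Factory, usize)> {
    // Mirrors `self.reader.take().expect(...)` in poll_next: move the factory
    // out of the stream, into the future, and put it back on completion.
    slot.take().expect("lost factory").step(1).boxed()
}
```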
@@ -478,9 +566,56 @@ struct InMemoryRowGroup {
     row_count: usize,
 }
 
+impl InMemoryRowGroup {
+    /// Fetches the necessary column data into memory
+    async fn fetch<T: AsyncFileReader + Send>(
+        &mut self,
+        input: &mut T,
+        metadata: &RowGroupMetaData,
+        projection: &ProjectionMask,
+        _selection: Option<&RowSelection>,
+    ) -> Result<()> {
+        // TODO: Use OffsetIndex and selection to prune pages
+
+        let fetch_ranges = self
+            .column_chunks
+            .iter()
+            .enumerate()
+            .into_iter()
+            .filter_map(|(idx, chunk)| {
+                (chunk.is_none() && projection.leaf_included(idx)).then(|| {
+                    let column = metadata.column(idx);
+                    let (start, length) = column.byte_range();
+                    start as usize..(start + length) as usize
+                })
+            })
+            .collect();
+
+        let mut chunk_data = input.get_byte_ranges(fetch_ranges).await?.into_iter();
+
+        for (idx, chunk) in self.column_chunks.iter_mut().enumerate() {
+            if chunk.is_some() || !projection.leaf_included(idx) {
+                continue;
+            }
+
+            let column = metadata.column(idx);
+
+            if let Some(data) = chunk_data.next() {
+                *chunk = Some(InMemoryColumnChunk {
+                    num_values: column.num_values(),
+                    compression: column.compression(),
+                    physical_type: column.column_type(),
+                    data,
+                });
+            }
+        }
+        Ok(())
+    }
+}
+
 impl RowGroupCollection for InMemoryRowGroup {
-    fn schema(&self) -> Result<SchemaDescPtr> {
-        Ok(self.schema.clone())
+    fn schema(&self) -> SchemaDescPtr {
+        self.schema.clone()
     }
 
     fn num_rows(&self) -> usize {
@@ -671,7 +806,10 @@ impl PageIterator for ColumnChunkIterator {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::arrow::{ArrowReader, ParquetFileArrowReader};
+    use crate::arrow::arrow_reader::ArrowPredicateFn;
+    use crate::arrow::{ArrowReader, ArrowWriter, ParquetFileArrowReader};
+    use crate::file::footer::parse_metadata;
+    use arrow::array::{Array, ArrayRef, Int32Array, StringArray};
     use arrow::error::Result as ArrowResult;
     use futures::TryStreamExt;
     use std::sync::Mutex;
@@ -844,4 +982,73 @@ mod tests {
         assert_eq!(second_page.page_type(), crate::basic::PageType::DATA_PAGE);
         assert_eq!(second_page.num_values(), 8);
     }
+
+    #[tokio::test]
+    async fn test_row_filter() {
+        let a = StringArray::from_iter_values(["a", "b", "b", "b", "c", "c"]);
+        let b = StringArray::from_iter_values(["1", "2", "3", "4", "5", "6"]);
+        let c = Int32Array::from_iter(0..6);
+        let data = RecordBatch::try_from_iter([
+            ("a", Arc::new(a) as ArrayRef),
+            ("b", Arc::new(b) as ArrayRef),
+            ("c", Arc::new(c) as ArrayRef),
+        ])
+        .unwrap();
+
+        let mut buf = Vec::with_capacity(1024);
+        let mut writer = ArrowWriter::try_new(&mut buf, data.schema(), None).unwrap();
+        writer.write(&data).unwrap();
+        writer.close().unwrap();
+
+        let data: Bytes = buf.into();
+        let metadata = parse_metadata(&data).unwrap();
+        let parquet_schema = metadata.file_metadata().schema_descr_ptr();
+
+        let test = TestReader {
+            data,
+            metadata: Arc::new(metadata),
+            requests: Default::default(),
+        };
+        let requests = test.requests.clone();
+
+        let a_filter = ArrowPredicateFn::new(
+            ProjectionMask::leaves(&parquet_schema, vec![0]),
+            |batch| arrow::compute::eq_dyn_utf8_scalar(batch.column(0), "b"),
+        );
+
+        let b_filter = ArrowPredicateFn::new(
+            ProjectionMask::leaves(&parquet_schema, vec![1]),
+            |batch| arrow::compute::eq_dyn_utf8_scalar(batch.column(0), "4"),
+        );
+
+        let filter = RowFilter::new(vec![Box::new(a_filter), Box::new(b_filter)]);
+
+        let mask = ProjectionMask::leaves(&parquet_schema, vec![0, 2]);
+        let stream = ParquetRecordBatchStreamBuilder::new(test)
+            .await
+            .unwrap()
+            .with_projection(mask.clone())
+            .with_batch_size(1024)
+            .with_row_filter(filter)
+            .build()
+            .unwrap();
+
+        let batches: Vec<_> = stream.try_collect().await.unwrap();
+        assert_eq!(batches.len(), 1);
+
+        let batch = &batches[0];
+        assert_eq!(batch.num_rows(), 1);
+        assert_eq!(batch.num_columns(), 2);
+
+        let col = batch.column(0);
+        let val = col.as_any().downcast_ref::<StringArray>().unwrap().value(0);
+        assert_eq!(val, "b");
+
+        let col = batch.column(1);
+        let val = col.as_any().downcast_ref::<Int32Array>().unwrap().value(0);
+        assert_eq!(val, 3);
+
+        // Should only have made 3 requests
+        assert_eq!(requests.lock().unwrap().len(), 3);
+    }
 }
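Why the test above can assert exactly three byte-range requests: the `a` predicate fetches leaf 0, the `b` predicate fetches leaf 1, and the final projection `[0, 2]` fetches only leaf 2, because `InMemoryRowGroup::fetch` skips chunks that are already populated. The guard condensed into a standalone sketch (generic stand-in types, not the crate's own):

```rust
// Condensed form of the filter_map condition in InMemoryRowGroup::fetch:
// a column chunk is requested only if it is both absent and projected.
fn needs_fetch<T>(chunk: &Option<T>, leaf_included: bool) -> bool {
    chunk.is_none() && leaf_included
}

fn main() {
    assert!(needs_fetch(&None::<Vec<u8>>, true)); // missing + projected => fetch
    assert!(!needs_fetch(&Some(vec![0u8]), true)); // already cached => skip
    assert!(!needs_fetch(&None::<Vec<u8>>, false)); // not projected => skip
}
```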
diff --git a/parquet/src/arrow/buffer/converter.rs b/parquet/src/arrow/buffer/converter.rs
index 93609308d2ba..aeca548bde72 100644
--- a/parquet/src/arrow/buffer/converter.rs
+++ b/parquet/src/arrow/buffer/converter.rs
@@ -17,18 +17,18 @@ use crate::data_type::{ByteArray, FixedLenByteArray, Int96};
 use arrow::array::{
-    Array, ArrayRef, BinaryArray, BinaryBuilder, Decimal128Array, FixedSizeBinaryArray,
-    FixedSizeBinaryBuilder, IntervalDayTimeArray, IntervalDayTimeBuilder,
-    IntervalYearMonthArray, IntervalYearMonthBuilder, LargeBinaryArray,
-    LargeBinaryBuilder, LargeStringArray, LargeStringBuilder, StringArray, StringBuilder,
-    TimestampNanosecondArray,
+    Array, ArrayRef, Decimal128Array, FixedSizeBinaryArray, FixedSizeBinaryBuilder,
+    IntervalDayTimeArray, IntervalDayTimeBuilder, IntervalYearMonthArray,
+    IntervalYearMonthBuilder, TimestampNanosecondArray,
 };
-use std::convert::{From, TryInto};
 use std::sync::Arc;
 
 use crate::errors::Result;
 use std::marker::PhantomData;
 
+#[cfg(test)]
+use arrow::array::{StringArray, StringBuilder};
+
 /// A converter is used to consume record reader's content and convert it to arrow
 /// primitive array.
 pub trait Converter<S, T> {
@@ -185,8 +185,10 @@ impl Converter<Vec<Option<Int96>>, TimestampNanosecondArray> for Int96ArrayConve
     }
 }
 
+#[cfg(test)]
 pub struct Utf8ArrayConverter {}
 
+#[cfg(test)]
 impl Converter<Vec<Option<ByteArray>>, StringArray> for Utf8ArrayConverter {
     fn convert(&self, source: Vec<Option<ByteArray>>) -> Result<StringArray> {
         let data_size = source
             .iter()
             .map(|x| x.as_ref().map(|b| b.len()).unwrap_or(0))
             .sum();
@@ -206,70 +208,9 @@ impl Converter<Vec<Option<ByteArray>>, StringArray> for Utf8ArrayConverter {
     }
 }
 
-pub struct LargeUtf8ArrayConverter {}
-
-impl Converter<Vec<Option<ByteArray>>, LargeStringArray> for LargeUtf8ArrayConverter {
-    fn convert(&self, source: Vec<Option<ByteArray>>) -> Result<LargeStringArray> {
-        let data_size = source
-            .iter()
-            .map(|x| x.as_ref().map(|b| b.len()).unwrap_or(0))
-            .sum();
-
-        let mut builder = LargeStringBuilder::with_capacity(source.len(), data_size);
-        for v in source {
-            match v {
-                Some(array) => builder.append_value(array.as_utf8()?),
-                None => builder.append_null(),
-            }
-        }
-
-        Ok(builder.finish())
-    }
-}
-
-pub struct BinaryArrayConverter {}
-
-impl Converter<Vec<Option<ByteArray>>, BinaryArray> for BinaryArrayConverter {
-    fn convert(&self, source: Vec<Option<ByteArray>>) -> Result<BinaryArray> {
-        let mut builder = BinaryBuilder::new(source.len());
-        for v in source {
-            match v {
-                Some(array) => builder.append_value(array.data()),
-                None => builder.append_null(),
-            }
-        }
-
-        Ok(builder.finish())
-    }
-}
-
-pub struct LargeBinaryArrayConverter {}
-
-impl Converter<Vec<Option<ByteArray>>, LargeBinaryArray> for LargeBinaryArrayConverter {
-    fn convert(&self, source: Vec<Option<ByteArray>>) -> Result<LargeBinaryArray> {
-        let mut builder = LargeBinaryBuilder::new(source.len());
-        for v in source {
-            match v {
-                Some(array) => builder.append_value(array.data()),
-                None => builder.append_null(),
-            }
-        }
-
-        Ok(builder.finish())
-    }
-}
-
+#[cfg(test)]
 pub type Utf8Converter =
     ArrayRefConverter<Vec<Option<ByteArray>>, StringArray, Utf8ArrayConverter>;
-pub type LargeUtf8Converter =
-    ArrayRefConverter<Vec<Option<ByteArray>>, LargeStringArray, LargeUtf8ArrayConverter>;
-pub type BinaryConverter =
-    ArrayRefConverter<Vec<Option<ByteArray>>, BinaryArray, BinaryArrayConverter>;
-pub type LargeBinaryConverter = ArrayRefConverter<
-    Vec<Option<ByteArray>>,
-    LargeBinaryArray,
-    LargeBinaryArrayConverter,
->;
 
 pub type Int96Converter =
     ArrayRefConverter<Vec<Option<Int96>>, TimestampNanosecondArray, Int96ArrayConverter>;
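For orientation, the `Converter` contract that survives this cleanup, exercised through the now test-only `Utf8ArrayConverter`. A sketch as it might appear inside converter.rs's own test module (the converter types are crate-internal, so this is not callable from outside the crate):

```rust
#[cfg(test)]
mod converter_usage {
    use super::*;

    // Converter<Vec<Option<ByteArray>>, StringArray> maps Some(bytes) to a
    // UTF-8 value and None to a null slot.
    #[test]
    fn utf8_array_converter_example() -> Result<()> {
        let source = vec![Some(ByteArray::from("hello")), None];
        let array = Utf8ArrayConverter {}.convert(source)?;
        assert_eq!(array.value(0), "hello");
        assert!(array.is_null(1));
        Ok(())
    }
}
```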
@@ -299,32 +240,6 @@ pub type DecimalFixedLengthByteArrayConverter = ArrayRefConverter<
 pub type DecimalByteArrayConvert =
     ArrayRefConverter<Vec<Option<ByteArray>>, Decimal128Array, DecimalArrayConverter>;
 
-pub struct FromConverter<S, T> {
-    _source: PhantomData<S>,
-    _dest: PhantomData<T>,
-}
-
-impl<S, T> FromConverter<S, T>
-where
-    T: From<S>,
-{
-    pub fn new() -> Self {
-        Self {
-            _source: PhantomData,
-            _dest: PhantomData,
-        }
-    }
-}
-
-impl<S, T> Converter<S, T> for FromConverter<S, T>
-where
-    T: From<S>,
-{
-    fn convert(&self, source: S) -> Result<T> {
-        Ok(T::from(source))
-    }
-}
-
 pub struct ArrayRefConverter<S, A, C> {
     _source: PhantomData<S>,
     _array: PhantomData<A>,
diff --git a/parquet/src/arrow/buffer/dictionary_buffer.rs b/parquet/src/arrow/buffer/dictionary_buffer.rs
index b64b2946b91a..ae9e3590de3f 100644
--- a/parquet/src/arrow/buffer/dictionary_buffer.rs
+++ b/parquet/src/arrow/buffer/dictionary_buffer.rs
@@ -49,6 +49,7 @@ impl<K: ScalarValue, V: ScalarValue> Default for DictionaryBuffer<K, V> {
 
 impl<K: ScalarValue, V: ScalarValue> DictionaryBuffer<K, V> {
+    #[allow(unused)]
     pub fn len(&self) -> usize {
         match self {
             Self::Dict { keys, .. } => keys.len(),
diff --git a/parquet/src/arrow/record_reader/definition_levels.rs b/parquet/src/arrow/record_reader/definition_levels.rs
index 53eeab9a514c..2d65db77fa69 100644
--- a/parquet/src/arrow/record_reader/definition_levels.rs
+++ b/parquet/src/arrow/record_reader/definition_levels.rs
@@ -408,12 +408,12 @@ mod tests {
         let mut encoder = RleEncoder::new(1, 1024);
         for _ in 0..len {
             let bool = rng.gen_bool(0.8);
-            assert!(encoder.put(bool as u64).unwrap());
+            encoder.put(bool as u64);
             expected.append(bool);
         }
         assert_eq!(expected.len(), len);
 
-        let encoded = encoder.consume().unwrap();
+        let encoded = encoder.consume();
         let mut decoder = PackedDecoder::new();
         decoder.set_data(Encoding::RLE, ByteBufferPtr::new(encoded));
@@ -444,7 +444,7 @@ mod tests {
         let mut total_value = 0;
         for _ in 0..len {
             let bool = rng.gen_bool(0.8);
-            assert!(encoder.put(bool as u64).unwrap());
+            encoder.put(bool as u64);
             expected.append(bool);
             if bool {
                 total_value += 1;
@@ -452,7 +452,7 @@ mod tests {
             }
         }
         assert_eq!(expected.len(), len);
 
-        let encoded = encoder.consume().unwrap();
+        let encoded = encoder.consume();
         let mut decoder = PackedDecoder::new();
         decoder.set_data(Encoding::RLE, ByteBufferPtr::new(encoded));
diff --git a/parquet/src/arrow/record_reader/mod.rs b/parquet/src/arrow/record_reader/mod.rs
index b68f59d514f2..18b4c9e07026 100644
--- a/parquet/src/arrow/record_reader/mod.rs
+++ b/parquet/src/arrow/record_reader/mod.rs
@@ -198,12 +198,6 @@ where
         self.num_records += buffered_records;
         self.num_values += buffered_values;
 
-        self.consume_def_levels();
-        self.consume_rep_levels();
-        self.consume_record_data();
-        self.consume_bitmap();
-        self.reset();
-
         let remaining = num_records - buffered_records;
 
         if remaining == 0 {
@@ -220,6 +214,7 @@ where
     }
 
     /// Returns number of records stored in buffer.
+    #[allow(unused)]
     pub fn num_records(&self) -> usize {
         self.num_records
     }
@@ -279,11 +274,6 @@ where
             .map(|levels| levels.split_bitmask(self.num_values))
     }
 
-    /// Returns column reader.
-    pub(crate) fn column_reader(&self) -> Option<&ColumnReader> {
-        self.column_reader.as_ref()
-    }
-
     /// Try to read one batch of data.
fn read_one_batch(&mut self, batch_size: usize) -> Result { let rep_levels = self @@ -796,4 +786,186 @@ mod tests { assert_eq!(record_reader.num_records(), 8); assert_eq!(record_reader.num_values(), 14); } + + #[test] + fn test_skip_required_records() { + // Construct column schema + let message_type = " + message test_schema { + REQUIRED INT32 leaf; + } + "; + let desc = parse_message_type(message_type) + .map(|t| SchemaDescriptor::new(Arc::new(t))) + .map(|s| s.column(0)) + .unwrap(); + + // Construct record reader + let mut record_reader = RecordReader::::new(desc.clone()); + + // First page + + // Records data: + // test_schema + // leaf: 4 + // test_schema + // leaf: 7 + // test_schema + // leaf: 6 + // test_schema + // left: 3 + // test_schema + // left: 2 + { + let values = [4, 7, 6, 3, 2]; + let mut pb = DataPageBuilderImpl::new(desc.clone(), 5, true); + pb.add_values::(Encoding::PLAIN, &values); + let page = pb.consume(); + + let page_reader = Box::new(InMemoryPageReader::new(vec![page])); + record_reader.set_page_reader(page_reader).unwrap(); + assert_eq!(2, record_reader.skip_records(2).unwrap()); + assert_eq!(0, record_reader.num_records()); + assert_eq!(0, record_reader.num_values()); + assert_eq!(3, record_reader.read_records(3).unwrap()); + assert_eq!(3, record_reader.num_records()); + assert_eq!(3, record_reader.num_values()); + } + + // Second page + + // Records data: + // test_schema + // leaf: 8 + // test_schema + // leaf: 9 + { + let values = [8, 9]; + let mut pb = DataPageBuilderImpl::new(desc, 2, true); + pb.add_values::(Encoding::PLAIN, &values); + let page = pb.consume(); + + let page_reader = Box::new(InMemoryPageReader::new(vec![page])); + record_reader.set_page_reader(page_reader).unwrap(); + assert_eq!(2, record_reader.skip_records(10).unwrap()); + assert_eq!(3, record_reader.num_records()); + assert_eq!(3, record_reader.num_values()); + assert_eq!(0, record_reader.read_records(10).unwrap()); + } + + let mut bb = Int32BufferBuilder::new(3); + bb.append_slice(&[6, 3, 2]); + let expected_buffer = bb.finish(); + assert_eq!(expected_buffer, record_reader.consume_record_data()); + assert_eq!(None, record_reader.consume_def_levels()); + assert_eq!(None, record_reader.consume_bitmap()); + } + + #[test] + fn test_skip_optional_records() { + // Construct column schema + let message_type = " + message test_schema { + OPTIONAL Group test_struct { + OPTIONAL INT32 leaf; + } + } + "; + + let desc = parse_message_type(message_type) + .map(|t| SchemaDescriptor::new(Arc::new(t))) + .map(|s| s.column(0)) + .unwrap(); + + // Construct record reader + let mut record_reader = RecordReader::::new(desc.clone()); + + // First page + + // Records data: + // test_schema + // test_struct + // test_schema + // test_struct + // leaf: 7 + // test_schema + // test_schema + // test_struct + // leaf: 6 + // test_schema + // test_struct + // leaf: 6 + { + let values = [7, 6, 3]; + //empty, non-empty, empty, non-empty, non-empty + let def_levels = [1i16, 2i16, 0i16, 2i16, 2i16]; + let mut pb = DataPageBuilderImpl::new(desc.clone(), 5, true); + pb.add_def_levels(2, &def_levels); + pb.add_values::(Encoding::PLAIN, &values); + let page = pb.consume(); + + let page_reader = Box::new(InMemoryPageReader::new(vec![page])); + record_reader.set_page_reader(page_reader).unwrap(); + assert_eq!(2, record_reader.skip_records(2).unwrap()); + assert_eq!(0, record_reader.num_records()); + assert_eq!(0, record_reader.num_values()); + assert_eq!(3, record_reader.read_records(3).unwrap()); + assert_eq!(3, 
record_reader.num_records()); + assert_eq!(3, record_reader.num_values()); + } + + // Second page + + // Records data: + // test_schema + // test_schema + // test_struct + // left: 8 + { + let values = [8]; + //empty, non-empty + let def_levels = [0i16, 2i16]; + let mut pb = DataPageBuilderImpl::new(desc, 2, true); + pb.add_def_levels(2, &def_levels); + pb.add_values::(Encoding::PLAIN, &values); + let page = pb.consume(); + + let page_reader = Box::new(InMemoryPageReader::new(vec![page])); + record_reader.set_page_reader(page_reader).unwrap(); + assert_eq!(2, record_reader.skip_records(10).unwrap()); + assert_eq!(3, record_reader.num_records()); + assert_eq!(3, record_reader.num_values()); + assert_eq!(0, record_reader.read_records(10).unwrap()); + } + + // Verify result def levels + let mut bb = Int16BufferBuilder::new(7); + bb.append_slice(&[0i16, 2i16, 2i16]); + let expected_def_levels = bb.finish(); + assert_eq!( + Some(expected_def_levels), + record_reader.consume_def_levels() + ); + + // Verify bitmap + let expected_valid = &[false, true, true]; + let expected_buffer = Buffer::from_iter(expected_valid.iter().cloned()); + let expected_bitmap = Bitmap::from(expected_buffer); + assert_eq!(Some(expected_bitmap), record_reader.consume_bitmap()); + + // Verify result record data + let actual = record_reader.consume_record_data(); + let actual_values = actual.typed_data::(); + + let expected = &[0, 6, 3]; + assert_eq!(actual_values.len(), expected.len()); + + // Only validate valid values are equal + let iter = expected_valid.iter().zip(actual_values).zip(expected); + for ((valid, actual), expected) in iter { + if *valid { + assert_eq!(actual, expected) + } + } + } } diff --git a/parquet/src/arrow/schema.rs b/parquet/src/arrow/schema.rs index 53d67d380412..01aefcd48e1d 100644 --- a/parquet/src/arrow/schema.rs +++ b/parquet/src/arrow/schema.rs @@ -73,7 +73,7 @@ pub fn parquet_to_arrow_schema_by_columns( // Add the Arrow metadata to the Parquet metadata skipping keys that collide if let Some(arrow_schema) = &maybe_schema { arrow_schema.metadata().iter().for_each(|(k, v)| { - metadata.entry(k.clone()).or_insert(v.clone()); + metadata.entry(k.clone()).or_insert_with(|| v.clone()); }); } @@ -100,7 +100,7 @@ fn get_arrow_schema_from_metadata(encoded_meta: &str) -> Result { Ok(message) => message .header_as_schema() .map(arrow::ipc::convert::fb_to_schema) - .ok_or(arrow_err!("the message is not Arrow Schema")), + .ok_or_else(|| arrow_err!("the message is not Arrow Schema")), Err(err) => { // The flatbuffers implementation returns an error on verification error. Err(arrow_err!( @@ -380,7 +380,8 @@ fn arrow_to_parquet_type(field: &Field) -> Result { .with_length(*length) .build() } - DataType::Decimal(precision, scale) | DataType::Decimal256(precision, scale) => { + DataType::Decimal128(precision, scale) + | DataType::Decimal256(precision, scale) => { // Decimal precision determines the Parquet physical type to use. 
// TODO(ARROW-12018): Enable the below after ARROW-10818 Decimal support // @@ -549,10 +550,10 @@ mod tests { parquet_to_arrow_schema(&parquet_schema, None).unwrap(); let arrow_fields = vec![ - Field::new("decimal1", DataType::Decimal(4,2), false), - Field::new("decimal2", DataType::Decimal(12,2), false), - Field::new("decimal3", DataType::Decimal(30,2), false), - Field::new("decimal4", DataType::Decimal(33,2), false), + Field::new("decimal1", DataType::Decimal128(4, 2), false), + Field::new("decimal2", DataType::Decimal128(12, 2), false), + Field::new("decimal3", DataType::Decimal128(30, 2), false), + Field::new("decimal4", DataType::Decimal128(33, 2), false), ]; assert_eq!(&arrow_fields, converted_arrow_schema.fields()); } @@ -1575,9 +1576,9 @@ mod tests { // true, // ), Field::new("c35", DataType::Null, true), - Field::new("c36", DataType::Decimal(2, 1), false), - Field::new("c37", DataType::Decimal(50, 20), false), - Field::new("c38", DataType::Decimal(18, 12), true), + Field::new("c36", DataType::Decimal128(2, 1), false), + Field::new("c37", DataType::Decimal128(50, 20), false), + Field::new("c38", DataType::Decimal128(18, 12), true), Field::new( "c39", DataType::Map( diff --git a/parquet/src/arrow/schema/primitive.rs b/parquet/src/arrow/schema/primitive.rs index 4bf6876d09da..c05a13565b12 100644 --- a/parquet/src/arrow/schema/primitive.rs +++ b/parquet/src/arrow/schema/primitive.rs @@ -112,7 +112,7 @@ fn decimal_type(scale: i32, precision: i32) -> Result { .try_into() .map_err(|_| arrow_err!("precision cannot be negative: {}", precision))?; - Ok(DataType::Decimal(precision, scale)) + Ok(DataType::Decimal128(precision, scale)) } fn from_int32(info: &BasicTypeInfo, scale: i32, precision: i32) -> Result { @@ -224,7 +224,7 @@ fn from_int64(info: &BasicTypeInfo, scale: i32, precision: i32) -> Result Result { +fn from_byte_array(info: &BasicTypeInfo, precision: i32, scale: i32) -> Result { match (info.logical_type(), info.converted_type()) { (Some(LogicalType::String), _) => Ok(DataType::Utf8), (Some(LogicalType::Json), _) => Ok(DataType::Binary), @@ -235,8 +235,12 @@ fn from_byte_array(info: &BasicTypeInfo, precision: i32, scale: i32 ) -> Result< (None, ConvertedType::BSON) => Ok(DataType::Binary), (None, ConvertedType::ENUM) => Ok(DataType::Binary), (None, ConvertedType::UTF8) => Ok(DataType::Utf8), - (Some(LogicalType::Decimal {precision, scale}), _) => Ok(DataType::Decimal(precision as usize, scale as usize)), - (None, ConvertedType::DECIMAL) => Ok(DataType::Decimal(precision as usize, scale as usize)), + (Some(LogicalType::Decimal { precision, scale }), _) => { + Ok(DataType::Decimal128(precision as usize, scale as usize)) + } + (None, ConvertedType::DECIMAL) => { + Ok(DataType::Decimal128(precision as usize, scale as usize)) + } (logical, converted) => Err(arrow_err!( "Unable to convert parquet BYTE_ARRAY logical type {:?} or converted type {}", logical, diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index 0cf1d5121b7e..7adbc8c1b6d0 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -18,7 +18,7 @@ //! Contains Rust mappings for Thrift definition. //! Refer to `parquet.thrift` file to see raw definitions. -use std::{convert, fmt, result, str}; +use std::{fmt, result, str}; use parquet_format as parquet; @@ -42,6 +42,7 @@ pub use parquet_format::{ /// For example INT16 is not included as a type since a good encoding of INT32 /// would handle this. 
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[allow(non_camel_case_types)] pub enum Type { BOOLEAN, INT32, @@ -62,7 +63,8 @@ pub enum Type { /// /// This struct was renamed from `LogicalType` in version 4.0.0. /// If targeting Parquet format 2.4.0 or above, please use [LogicalType] instead. -#[derive(Debug, Clone, Copy, PartialEq)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[allow(non_camel_case_types)] pub enum ConvertedType { NONE, /// A BYTE_ARRAY actually contains UTF8 encoded chars. @@ -163,7 +165,7 @@ pub enum ConvertedType { /// This is an *entirely new* struct as of version /// 4.0.0. The struct previously named `LogicalType` was renamed to /// [`ConvertedType`]. Please see the README.md for more details. -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq, Eq)] pub enum LogicalType { String, Map, @@ -196,7 +198,8 @@ pub enum LogicalType { // Mirrors `parquet::FieldRepetitionType` /// Representation of field types in schema. -#[derive(Debug, Clone, Copy, PartialEq)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[allow(non_camel_case_types)] pub enum Repetition { /// Field is required (can not be null) and each record has exactly 1 value. REQUIRED, @@ -213,6 +216,7 @@ pub enum Repetition { /// Not all encodings are valid for all types. These enums are also used to specify the /// encoding of definition and repetition levels. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd)] +#[allow(non_camel_case_types)] pub enum Encoding { /// Default byte encoding. /// - BOOLEAN - 1 bit per value, 0 is false; 1 is true. @@ -277,7 +281,7 @@ pub enum Encoding { // Mirrors `parquet::CompressionCodec` /// Supported compression algorithms. -#[derive(Debug, Clone, Copy, PartialEq)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum Compression { UNCOMPRESSED, SNAPPY, @@ -293,7 +297,8 @@ pub enum Compression { /// Available data pages for Parquet file format. /// Note that some of the page types may not be supported. -#[derive(Debug, Clone, Copy, PartialEq)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[allow(non_camel_case_types)] pub enum PageType { DATA_PAGE, INDEX_PAGE, @@ -312,7 +317,8 @@ pub enum PageType { /// /// See reference in /// -#[derive(Debug, Clone, Copy, PartialEq)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[allow(non_camel_case_types)] pub enum SortOrder { /// Signed (either value or legacy byte-wise) comparison. SIGNED, @@ -327,7 +333,8 @@ pub enum SortOrder { /// /// If column order is undefined, then it is the legacy behaviour and all values should /// be compared as signed values/bytes. -#[derive(Debug, Clone, Copy, PartialEq)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[allow(non_camel_case_types)] pub enum ColumnOrder { /// Column uses the order defined by its logical or physical type /// (if there is no logical type), parquet-format 2.4.0+. 
@@ -489,7 +496,7 @@ impl fmt::Display for ColumnOrder { // ---------------------------------------------------------------------- // parquet::Type <=> Type conversion -impl convert::From for Type { +impl From for Type { fn from(value: parquet::Type) -> Self { match value { parquet::Type::Boolean => Type::BOOLEAN, @@ -504,7 +511,7 @@ impl convert::From for Type { } } -impl convert::From for parquet::Type { +impl From for parquet::Type { fn from(value: Type) -> Self { match value { Type::BOOLEAN => parquet::Type::Boolean, @@ -522,7 +529,7 @@ impl convert::From for parquet::Type { // ---------------------------------------------------------------------- // parquet::ConvertedType <=> ConvertedType conversion -impl convert::From> for ConvertedType { +impl From> for ConvertedType { fn from(option: Option) -> Self { match option { None => ConvertedType::NONE, @@ -558,7 +565,7 @@ impl convert::From> for ConvertedType { } } -impl convert::From for Option { +impl From for Option { fn from(value: ConvertedType) -> Self { match value { ConvertedType::NONE => None, @@ -595,7 +602,7 @@ impl convert::From for Option { // ---------------------------------------------------------------------- // parquet::LogicalType <=> LogicalType conversion -impl convert::From for LogicalType { +impl From for LogicalType { fn from(value: parquet::LogicalType) -> Self { match value { parquet::LogicalType::STRING(_) => LogicalType::String, @@ -627,7 +634,7 @@ impl convert::From for LogicalType { } } -impl convert::From for parquet::LogicalType { +impl From for parquet::LogicalType { fn from(value: LogicalType) -> Self { match value { LogicalType::String => parquet::LogicalType::STRING(Default::default()), @@ -723,7 +730,7 @@ impl From> for ConvertedType { // ---------------------------------------------------------------------- // parquet::FieldRepetitionType <=> Repetition conversion -impl convert::From for Repetition { +impl From for Repetition { fn from(value: parquet::FieldRepetitionType) -> Self { match value { parquet::FieldRepetitionType::Required => Repetition::REQUIRED, @@ -733,7 +740,7 @@ impl convert::From for Repetition { } } -impl convert::From for parquet::FieldRepetitionType { +impl From for parquet::FieldRepetitionType { fn from(value: Repetition) -> Self { match value { Repetition::REQUIRED => parquet::FieldRepetitionType::Required, @@ -746,7 +753,7 @@ impl convert::From for parquet::FieldRepetitionType { // ---------------------------------------------------------------------- // parquet::Encoding <=> Encoding conversion -impl convert::From for Encoding { +impl From for Encoding { fn from(value: parquet::Encoding) -> Self { match value { parquet::Encoding::Plain => Encoding::PLAIN, @@ -762,7 +769,7 @@ impl convert::From for Encoding { } } -impl convert::From for parquet::Encoding { +impl From for parquet::Encoding { fn from(value: Encoding) -> Self { match value { Encoding::PLAIN => parquet::Encoding::Plain, @@ -781,7 +788,7 @@ impl convert::From for parquet::Encoding { // ---------------------------------------------------------------------- // parquet::CompressionCodec <=> Compression conversion -impl convert::From for Compression { +impl From for Compression { fn from(value: parquet::CompressionCodec) -> Self { match value { parquet::CompressionCodec::Uncompressed => Compression::UNCOMPRESSED, @@ -795,7 +802,7 @@ impl convert::From for Compression { } } -impl convert::From for parquet::CompressionCodec { +impl From for parquet::CompressionCodec { fn from(value: Compression) -> Self { match value { 
Compression::UNCOMPRESSED => parquet::CompressionCodec::Uncompressed, @@ -812,7 +819,7 @@ impl convert::From for parquet::CompressionCodec { // ---------------------------------------------------------------------- // parquet::PageType <=> PageType conversion -impl convert::From for PageType { +impl From for PageType { fn from(value: parquet::PageType) -> Self { match value { parquet::PageType::DataPage => PageType::DATA_PAGE, @@ -823,7 +830,7 @@ impl convert::From for PageType { } } -impl convert::From for parquet::PageType { +impl From for parquet::PageType { fn from(value: PageType) -> Self { match value { PageType::DATA_PAGE => parquet::PageType::DataPage, diff --git a/parquet/src/bin/parquet-fromcsv.rs b/parquet/src/bin/parquet-fromcsv.rs index aa1d50563cd9..827aa7311f58 100644 --- a/parquet/src/bin/parquet-fromcsv.rs +++ b/parquet/src/bin/parquet-fromcsv.rs @@ -439,7 +439,7 @@ mod tests { // test default values assert_eq!(args.input_format, CsvDialect::Csv); assert_eq!(args.batch_size, 1000); - assert_eq!(args.has_header, false); + assert!(!args.has_header); assert_eq!(args.delimiter, None); assert_eq!(args.get_delimiter(), b','); assert_eq!(args.record_terminator, None); @@ -553,7 +553,7 @@ mod tests { Field::new("field5", DataType::Utf8, false), ])); - let reader_builder = configure_reader_builder(&args, arrow_schema.clone()); + let reader_builder = configure_reader_builder(&args, arrow_schema); let builder_debug = format!("{:?}", reader_builder); assert_debug_text(&builder_debug, "has_header", "false"); assert_debug_text(&builder_debug, "delimiter", "Some(44)"); @@ -585,7 +585,7 @@ mod tests { Field::new("field4", DataType::Utf8, false), Field::new("field5", DataType::Utf8, false), ])); - let reader_builder = configure_reader_builder(&args, arrow_schema.clone()); + let reader_builder = configure_reader_builder(&args, arrow_schema); let builder_debug = format!("{:?}", reader_builder); assert_debug_text(&builder_debug, "has_header", "true"); assert_debug_text(&builder_debug, "delimiter", "Some(9)"); diff --git a/parquet/src/bin/parquet-read.rs b/parquet/src/bin/parquet-read.rs index 0530afaa786a..927d96f8cde7 100644 --- a/parquet/src/bin/parquet-read.rs +++ b/parquet/src/bin/parquet-read.rs @@ -93,6 +93,6 @@ fn print_row(row: &Row, json: bool) { if json { println!("{}", row.to_json_value()) } else { - println!("{}", row.to_string()); + println!("{}", row); } } diff --git a/parquet/src/bin/parquet-schema.rs b/parquet/src/bin/parquet-schema.rs index b875b0e7102b..68c52def7c44 100644 --- a/parquet/src/bin/parquet-schema.rs +++ b/parquet/src/bin/parquet-schema.rs @@ -67,9 +67,9 @@ fn main() { println!("Metadata for file: {}", &filename); println!(); if verbose { - print_parquet_metadata(&mut std::io::stdout(), &metadata); + print_parquet_metadata(&mut std::io::stdout(), metadata); } else { - print_file_metadata(&mut std::io::stdout(), &metadata.file_metadata()); + print_file_metadata(&mut std::io::stdout(), metadata.file_metadata()); } } } diff --git a/parquet/src/column/page.rs b/parquet/src/column/page.rs index c61e9c0b343e..1658797cee7d 100644 --- a/parquet/src/column/page.rs +++ b/parquet/src/column/page.rs @@ -174,6 +174,12 @@ pub struct PageWriteSpec { pub bytes_written: u64, } +impl Default for PageWriteSpec { + fn default() -> Self { + Self::new() + } +} + impl PageWriteSpec { /// Creates new spec with default page write metrics. 
pub fn new() -> Self { diff --git a/parquet/src/column/reader.rs b/parquet/src/column/reader.rs index 8e0fa5a4d5aa..1432c72b53f1 100644 --- a/parquet/src/column/reader.rs +++ b/parquet/src/column/reader.rs @@ -28,7 +28,7 @@ use crate::column::reader::decoder::{ use crate::data_type::*; use crate::errors::{ParquetError, Result}; use crate::schema::types::ColumnDescPtr; -use crate::util::bit_util::{ceil, num_required_bits}; +use crate::util::bit_util::{ceil, num_required_bits, read_num_bytes}; use crate::util::memory::ByteBufferPtr; pub(crate) mod decoder; @@ -520,7 +520,7 @@ fn parse_v1_level( match encoding { Encoding::RLE => { let i32_size = std::mem::size_of::(); - let data_size = read_num_bytes!(i32, i32_size, buf.as_ref()) as usize; + let data_size = read_num_bytes::(i32_size, buf.as_ref()) as usize; Ok((i32_size + data_size, buf.range(i32_size, data_size))) } Encoding::BIT_PACKED => { @@ -544,8 +544,8 @@ mod tests { use crate::basic::Type as PhysicalType; use crate::schema::types::{ColumnDescriptor, ColumnPath, Type as SchemaType}; - use crate::util::test_common::make_pages; use crate::util::test_common::page_util::InMemoryPageReader; + use crate::util::test_common::rand_gen::make_pages; const NUM_LEVELS: usize = 128; const NUM_PAGES: usize = 2; @@ -1231,6 +1231,7 @@ mod tests { // Helper function for the general case of `read_batch()` where `values`, // `def_levels` and `rep_levels` are always provided with enough space. + #[allow(clippy::too_many_arguments)] fn test_read_batch_general( &mut self, desc: ColumnDescPtr, @@ -1262,6 +1263,7 @@ mod tests { // Helper function to test `read_batch()` method with custom buffers for values, // definition and repetition levels. + #[allow(clippy::too_many_arguments)] fn test_read_batch( &mut self, desc: ColumnDescPtr, diff --git a/parquet/src/column/writer/encoder.rs b/parquet/src/column/writer/encoder.rs index 54003732a06a..4fb4f210e146 100644 --- a/parquet/src/column/writer/encoder.rs +++ b/parquet/src/column/writer/encoder.rs @@ -30,16 +30,21 @@ use crate::util::memory::ByteBufferPtr; /// A collection of [`ParquetValueType`] encoded by a [`ColumnValueEncoder`] pub trait ColumnValues { - /// The underlying value type - type T: ParquetValueType; - /// The number of values in this collection fn len(&self) -> usize; +} - /// Returns the min and max values in this collection, skipping any NaN values - /// - /// Returns `None` if no values found - fn min_max(&self, descr: &ColumnDescriptor) -> Option<(&Self::T, &Self::T)>; +#[cfg(any(feature = "arrow", test))] +impl ColumnValues for T { + fn len(&self) -> usize { + arrow::array::Array::len(self) + } +} + +impl ColumnValues for [T] { + fn len(&self) -> usize { + self.len() + } } /// The encoded data for a dictionary page @@ -67,7 +72,16 @@ pub trait ColumnValueEncoder { type T: ParquetValueType; /// The values encoded by this encoder - type Values: ColumnValues + ?Sized; + type Values: ColumnValues + ?Sized; + + /// Returns the min and max values in this collection, skipping any NaN values + /// + /// Returns `None` if no values found + fn min_max( + &self, + values: &Self::Values, + value_indices: Option<&[usize]>, + ) -> Option<(Self::T, Self::T)>; /// Create a new [`ColumnValueEncoder`] fn try_new(descr: &ColumnDescPtr, props: &WriterProperties) -> Result @@ -77,6 +91,9 @@ pub trait ColumnValueEncoder { /// Write the corresponding values to this [`ColumnValueEncoder`] fn write(&mut self, values: &Self::Values, offset: usize, len: usize) -> Result<()>; + /// Write the values at the indexes in 
`indices` to this [`ColumnValueEncoder`] + fn write_gather(&mut self, values: &Self::Values, indices: &[usize]) -> Result<()>; + /// Returns the number of buffered values fn num_values(&self) -> usize; @@ -110,11 +127,40 @@ pub struct ColumnValueEncoderImpl { max_value: Option, } +impl ColumnValueEncoderImpl { + fn write_slice(&mut self, slice: &[T::T]) -> Result<()> { + if self.statistics_enabled == EnabledStatistics::Page { + if let Some((min, max)) = self.min_max(slice, None) { + update_min(&self.descr, &min, &mut self.min_value); + update_max(&self.descr, &max, &mut self.max_value); + } + } + + match &mut self.dict_encoder { + Some(encoder) => encoder.put(slice), + _ => self.encoder.put(slice), + } + } +} + impl ColumnValueEncoder for ColumnValueEncoderImpl { type T = T::T; type Values = [T::T]; + fn min_max( + &self, + values: &Self::Values, + value_indices: Option<&[usize]>, + ) -> Option<(Self::T, Self::T)> { + match value_indices { + Some(indices) => { + get_min_max(&self.descr, indices.iter().map(|x| &values[*x])) + } + None => get_min_max(&self.descr, values.iter()), + } + } + fn try_new(descr: &ColumnDescPtr, props: &WriterProperties) -> Result { let dict_supported = props.dictionary_enabled(descr.path()) && has_dictionary_support(T::get_physical_type(), props); @@ -122,7 +168,6 @@ impl ColumnValueEncoder for ColumnValueEncoderImpl { // Set either main encoder or fallback encoder. let encoder = get_encoder( - descr.clone(), props .encoding(descr.path()) .unwrap_or_else(|| fallback_encoding(T::get_physical_type(), props)), @@ -152,17 +197,12 @@ impl ColumnValueEncoder for ColumnValueEncoderImpl { ) })?; - if self.statistics_enabled == EnabledStatistics::Page { - if let Some((min, max)) = slice.min_max(&self.descr) { - update_min(&self.descr, min, &mut self.min_value); - update_max(&self.descr, max, &mut self.max_value); - } - } + self.write_slice(slice) + } - match &mut self.dict_encoder { - Some(encoder) => encoder.put(slice), - _ => self.encoder.put(slice), - } + fn write_gather(&mut self, values: &Self::Values, indices: &[usize]) -> Result<()> { + let slice: Vec<_> = indices.iter().map(|idx| values[*idx].clone()).collect(); + self.write_slice(&slice) } fn num_values(&self) -> usize { @@ -221,36 +261,30 @@ impl ColumnValueEncoder for ColumnValueEncoderImpl { } } -impl ColumnValues for [T] { - type T = T; - - fn len(&self) -> usize { - self.len() - } - - fn min_max(&self, descr: &ColumnDescriptor) -> Option<(&T, &T)> { - let mut iter = self.iter(); - - let first = loop { - let next = iter.next()?; - if !is_nan(next) { - break next; - } - }; +fn get_min_max<'a, T, I>(descr: &ColumnDescriptor, mut iter: I) -> Option<(T, T)> +where + T: ParquetValueType + 'a, + I: Iterator, +{ + let first = loop { + let next = iter.next()?; + if !is_nan(next) { + break next; + } + }; - let mut min = first; - let mut max = first; - for val in iter { - if is_nan(val) { - continue; - } - if compare_greater(descr, min, val) { - min = val; - } - if compare_greater(descr, val, max) { - max = val; - } + let mut min = first; + let mut max = first; + for val in iter { + if is_nan(val) { + continue; + } + if compare_greater(descr, min, val) { + min = val; + } + if compare_greater(descr, val, max) { + max = val; } - Some((min, max)) } + Some((min.clone(), max.clone())) } diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs index 9a371bc27106..669cacee6460 100644 --- a/parquet/src/column/writer/mod.rs +++ b/parquet/src/column/writer/mod.rs @@ -27,7 +27,7 @@ use 
crate::column::writer::encoder::{ use crate::compression::{create_codec, Codec}; use crate::data_type::private::ParquetValueType; use crate::data_type::*; -use crate::encodings::levels::{max_buffer_size, LevelEncoder}; +use crate::encodings::levels::LevelEncoder; use crate::errors::{ParquetError, Result}; use crate::file::metadata::{ColumnIndexBuilder, OffsetIndexBuilder}; use crate::file::properties::EnabledStatistics; @@ -153,6 +153,29 @@ type ColumnCloseResult = ( Option, ); +// Metrics per page +#[derive(Default)] +struct PageMetrics { + num_buffered_values: u32, + num_buffered_rows: u32, + num_page_nulls: u64, +} + +// Metrics per column writer +struct ColumnMetrics { + total_bytes_written: u64, + total_rows_written: u64, + total_uncompressed_size: u64, + total_compressed_size: u64, + total_num_values: u64, + dictionary_page_offset: Option, + data_page_offset: Option, + min_column_value: Option, + max_column_value: Option, + num_column_nulls: u64, + column_distinct_count: Option, +} + /// Typed column writer for a primitive column. pub type ColumnWriterImpl<'a, T> = GenericColumnWriter<'a, ColumnValueEncoderImpl>; @@ -167,31 +190,13 @@ pub struct GenericColumnWriter<'a, E: ColumnValueEncoder> { compressor: Option>, encoder: E, - // Metrics per page - /// The number of values including nulls in the in-progress data page - num_buffered_values: u32, - /// The number of rows in the in-progress data page - num_buffered_rows: u32, - /// The number of nulls in the in-progress data page - num_page_nulls: u64, - + page_metrics: PageMetrics, // Metrics per column writer - total_bytes_written: u64, - total_rows_written: u64, - total_uncompressed_size: u64, - total_compressed_size: u64, - total_num_values: u64, - dictionary_page_offset: Option, - data_page_offset: Option, - min_column_value: Option, - max_column_value: Option, - num_column_nulls: u64, - column_distinct_count: Option, + column_metrics: ColumnMetrics, /// The order of encodings within the generated metadata does not impact its meaning, /// but we use a BTreeSet so that the output is deterministic encodings: BTreeSet, - // Reused buffers def_levels_sink: Vec, rep_levels_sink: Vec, @@ -226,32 +231,38 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { codec, compressor, encoder, - num_buffered_values: 0, - num_buffered_rows: 0, - num_page_nulls: 0, - total_bytes_written: 0, - total_rows_written: 0, - total_uncompressed_size: 0, - total_compressed_size: 0, - total_num_values: 0, - dictionary_page_offset: None, - data_page_offset: None, def_levels_sink: vec![], rep_levels_sink: vec![], data_pages: VecDeque::new(), - min_column_value: None, - max_column_value: None, - num_column_nulls: 0, - column_distinct_count: None, + page_metrics: PageMetrics { + num_buffered_values: 0, + num_buffered_rows: 0, + num_page_nulls: 0, + }, + column_metrics: ColumnMetrics { + total_bytes_written: 0, + total_rows_written: 0, + total_uncompressed_size: 0, + total_compressed_size: 0, + total_num_values: 0, + dictionary_page_offset: None, + data_page_offset: None, + min_column_value: None, + max_column_value: None, + num_column_nulls: 0, + column_distinct_count: None, + }, column_index_builder: ColumnIndexBuilder::new(), offset_index_builder: OffsetIndexBuilder::new(), encodings, } } - fn write_batch_internal( + #[allow(clippy::too_many_arguments)] + pub(crate) fn write_batch_internal( &mut self, values: &E::Values, + value_indices: Option<&[usize]>, def_levels: Option<&[i16]>, rep_levels: Option<&[i16]>, min: Option<&E::T>, @@ -283,16 +294,33 @@ 
impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { if self.statistics_enabled == EnabledStatistics::Chunk { match (min, max) { (Some(min), Some(max)) => { - update_min(&self.descr, min, &mut self.min_column_value); - update_max(&self.descr, max, &mut self.max_column_value); + update_min( + &self.descr, + min, + &mut self.column_metrics.min_column_value, + ); + update_max( + &self.descr, + max, + &mut self.column_metrics.max_column_value, + ); } (None, Some(_)) | (Some(_), None) => { panic!("min/max should be both set or both None") } (None, None) => { - if let Some((min, max)) = values.min_max(&self.descr) { - update_min(&self.descr, min, &mut self.min_column_value); - update_max(&self.descr, max, &mut self.max_column_value); + if let Some((min, max)) = self.encoder.min_max(values, value_indices) + { + update_min( + &self.descr, + &min, + &mut self.column_metrics.min_column_value, + ); + update_max( + &self.descr, + &max, + &mut self.column_metrics.max_column_value, + ); } } }; @@ -300,9 +328,9 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { // We can only set the distinct count if there are no other writes if self.encoder.num_values() == 0 { - self.column_distinct_count = distinct_count; + self.column_metrics.column_distinct_count = distinct_count; } else { - self.column_distinct_count = None; + self.column_metrics.column_distinct_count = None; } let mut values_offset = 0; @@ -311,6 +339,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { values_offset += self.write_mini_batch( values, values_offset, + value_indices, write_batch_size, def_levels.map(|lv| &lv[levels_offset..levels_offset + write_batch_size]), rep_levels.map(|lv| &lv[levels_offset..levels_offset + write_batch_size]), @@ -321,6 +350,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { values_offset += self.write_mini_batch( values, values_offset, + value_indices, num_levels - levels_offset, def_levels.map(|lv| &lv[levels_offset..]), rep_levels.map(|lv| &lv[levels_offset..]), @@ -348,7 +378,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { def_levels: Option<&[i16]>, rep_levels: Option<&[i16]>, ) -> Result { - self.write_batch_internal(values, def_levels, rep_levels, None, None, None) + self.write_batch_internal(values, None, def_levels, rep_levels, None, None, None) } /// Writer may optionally provide pre-calculated statistics for use when computing @@ -369,6 +399,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { ) -> Result { self.write_batch_internal( values, + None, def_levels, rep_levels, min, @@ -380,19 +411,19 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { /// Returns total number of bytes written by this column writer so far. /// This value is also returned when column writer is closed. pub fn get_total_bytes_written(&self) -> u64 { - self.total_bytes_written + self.column_metrics.total_bytes_written } /// Returns total number of rows written by this column writer so far. /// This value is also returned when column writer is closed. pub fn get_total_rows_written(&self) -> u64 { - self.total_rows_written + self.column_metrics.total_rows_written } /// Finalises writes and closes the column writer. /// Returns total bytes written, total rows written and column chunk metadata. 
pub fn close(mut self) -> Result { - if self.num_buffered_values > 0 { + if self.page_metrics.num_buffered_values > 0 { self.add_data_page()?; } if self.encoder.has_dictionary() { @@ -412,8 +443,8 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { }; Ok(( - self.total_bytes_written, - self.total_rows_written, + self.column_metrics.total_bytes_written, + self.column_metrics.total_rows_written, metadata, column_index, offset_index, @@ -427,6 +458,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { &mut self, values: &E::Values, values_offset: usize, + value_indices: Option<&[usize]>, num_levels: usize, def_levels: Option<&[i16]>, rep_levels: Option<&[i16]>, @@ -458,7 +490,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { values_to_write += 1; } else { // We must always compute this as it is used to populate v2 pages - self.num_page_nulls += 1 + self.page_metrics.num_page_nulls += 1 } } @@ -480,18 +512,25 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { // Count the occasions where we start a new row for &level in levels { - self.num_buffered_rows += (level == 0) as u32 + self.page_metrics.num_buffered_rows += (level == 0) as u32 } self.rep_levels_sink.extend_from_slice(levels); } else { // Each value is exactly one row. // Equals to the number of values, we count nulls as well. - self.num_buffered_rows += num_levels as u32; + self.page_metrics.num_buffered_rows += num_levels as u32; + } + + match value_indices { + Some(indices) => { + let indices = &indices[values_offset..values_offset + values_to_write]; + self.encoder.write_gather(values, indices)?; + } + None => self.encoder.write(values, values_offset, values_to_write)?, } - self.encoder.write(values, values_offset, values_to_write)?; - self.num_buffered_values += num_levels as u32; + self.page_metrics.num_buffered_values += num_levels as u32; if self.should_add_data_page() { self.add_data_page()?; @@ -534,7 +573,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { /// Prepares and writes dictionary and all data pages into page writer. fn dict_fallback(&mut self) -> Result<()> { // At this point we know that we need to fall back. 
- if self.num_buffered_values > 0 { + if self.page_metrics.num_buffered_values > 0 { self.add_data_page()?; } self.write_dictionary_page()?; @@ -545,7 +584,8 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { /// Update the column index and offset index when adding the data page fn update_column_offset_index(&mut self, page_statistics: &Option) { // update the column index - let null_page = (self.num_buffered_rows as u64) == self.num_page_nulls; + let null_page = (self.page_metrics.num_buffered_rows as u64) + == self.page_metrics.num_page_nulls; // a page contains only null values, // and writers have to set the corresponding entries in min_values and max_values to byte[0] if null_page && self.column_index_builder.valid() { @@ -553,7 +593,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { null_page, &[0; 1], &[0; 1], - self.num_page_nulls as i64, + self.page_metrics.num_page_nulls as i64, ); } else if self.column_index_builder.valid() { // from page statistics @@ -567,7 +607,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { null_page, stat.min_bytes(), stat.max_bytes(), - self.num_page_nulls as i64, + self.page_metrics.num_page_nulls as i64, ); } } @@ -575,7 +615,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { // update the offset index self.offset_index_builder - .append_row_count(self.num_buffered_rows as i64); + .append_row_count(self.page_metrics.num_buffered_rows as i64); } /// Adds data page. @@ -587,17 +627,17 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { let max_def_level = self.descr.max_def_level(); let max_rep_level = self.descr.max_rep_level(); - self.num_column_nulls += self.num_page_nulls; + self.column_metrics.num_column_nulls += self.page_metrics.num_page_nulls; let page_statistics = match (values_data.min_value, values_data.max_value) { (Some(min), Some(max)) => { - update_min(&self.descr, &min, &mut self.min_column_value); - update_max(&self.descr, &max, &mut self.max_column_value); + update_min(&self.descr, &min, &mut self.column_metrics.min_column_value); + update_max(&self.descr, &max, &mut self.column_metrics.max_column_value); Some(Statistics::new( Some(min), Some(max), None, - self.num_page_nulls, + self.page_metrics.num_page_nulls, false, )) } @@ -617,7 +657,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { Encoding::RLE, &self.rep_levels_sink[..], max_rep_level, - )?[..], + )[..], ); } @@ -627,7 +667,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { Encoding::RLE, &self.def_levels_sink[..], max_def_level, - )?[..], + )[..], ); } @@ -642,7 +682,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { let data_page = Page::DataPage { buf: ByteBufferPtr::new(buffer), - num_values: self.num_buffered_values, + num_values: self.page_metrics.num_buffered_values, encoding: values_data.encoding, def_level_encoding: Encoding::RLE, rep_level_encoding: Encoding::RLE, @@ -658,14 +698,14 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { if max_rep_level > 0 { let levels = - self.encode_levels_v2(&self.rep_levels_sink[..], max_rep_level)?; + self.encode_levels_v2(&self.rep_levels_sink[..], max_rep_level); rep_levels_byte_len = levels.len(); buffer.extend_from_slice(&levels[..]); } if max_def_level > 0 { let levels = - self.encode_levels_v2(&self.def_levels_sink[..], max_def_level)?; + self.encode_levels_v2(&self.def_levels_sink[..], max_def_level); def_levels_byte_len = levels.len(); buffer.extend_from_slice(&levels[..]); } @@ -683,10 +723,10 @@ 
impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { let data_page = Page::DataPageV2 { buf: ByteBufferPtr::new(buffer), - num_values: self.num_buffered_values, + num_values: self.page_metrics.num_buffered_values, encoding: values_data.encoding, - num_nulls: self.num_page_nulls as u32, - num_rows: self.num_buffered_rows, + num_nulls: self.page_metrics.num_page_nulls as u32, + num_rows: self.page_metrics.num_buffered_rows, def_levels_byte_len: def_levels_byte_len as u32, rep_levels_byte_len: rep_levels_byte_len as u32, is_compressed: self.compressor.is_some(), @@ -705,14 +745,13 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { } // Update total number of rows. - self.total_rows_written += self.num_buffered_rows as u64; + self.column_metrics.total_rows_written += + self.page_metrics.num_buffered_rows as u64; // Reset state. self.rep_levels_sink.clear(); self.def_levels_sink.clear(); - self.num_buffered_values = 0; - self.num_buffered_rows = 0; - self.num_page_nulls = 0; + self.page_metrics = PageMetrics::default(); Ok(()) } @@ -722,7 +761,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { #[inline] fn flush_data_pages(&mut self) -> Result<()> { // Write all outstanding data to a new page. - if self.num_buffered_values > 0 { + if self.page_metrics.num_buffered_values > 0 { self.add_data_page()?; } @@ -735,12 +774,13 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { /// Assembles and writes column chunk metadata. fn write_column_metadata(&mut self) -> Result { - let total_compressed_size = self.total_compressed_size as i64; - let total_uncompressed_size = self.total_uncompressed_size as i64; - let num_values = self.total_num_values as i64; - let dict_page_offset = self.dictionary_page_offset.map(|v| v as i64); + let total_compressed_size = self.column_metrics.total_compressed_size as i64; + let total_uncompressed_size = self.column_metrics.total_uncompressed_size as i64; + let num_values = self.column_metrics.total_num_values as i64; + let dict_page_offset = + self.column_metrics.dictionary_page_offset.map(|v| v as i64); // If data page offset is not set, then no pages have been written - let data_page_offset = self.data_page_offset.unwrap_or(0) as i64; + let data_page_offset = self.column_metrics.data_page_offset.unwrap_or(0) as i64; let file_offset = match dict_page_offset { Some(dict_offset) => dict_offset + total_compressed_size, @@ -759,10 +799,10 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { if self.statistics_enabled != EnabledStatistics::None { let statistics = Statistics::new( - self.min_column_value.clone(), - self.max_column_value.clone(), - self.column_distinct_count, - self.num_column_nulls, + self.column_metrics.min_column_value.clone(), + self.column_metrics.max_column_value.clone(), + self.column_metrics.column_distinct_count, + self.column_metrics.num_column_nulls, false, ); builder = builder.set_statistics(statistics); @@ -781,20 +821,18 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { encoding: Encoding, levels: &[i16], max_level: i16, - ) -> Result> { - let size = max_buffer_size(encoding, max_level, levels.len()); - let mut encoder = LevelEncoder::v1(encoding, max_level, vec![0; size]); - encoder.put(levels)?; + ) -> Vec { + let mut encoder = LevelEncoder::v1(encoding, max_level, levels.len()); + encoder.put(levels); encoder.consume() } /// Encodes definition or repetition levels for Data Page v2. /// Encoding is always RLE. 
#[inline] - fn encode_levels_v2(&self, levels: &[i16], max_level: i16) -> Result<Vec<u8>> { - let size = max_buffer_size(Encoding::RLE, max_level, levels.len()); - let mut encoder = LevelEncoder::v2(max_level, vec![0; size]); - encoder.put(levels)?; + fn encode_levels_v2(&self, levels: &[i16], max_level: i16) -> Vec<u8> { + let mut encoder = LevelEncoder::v2(max_level, levels.len()); + encoder.put(levels); encoder.consume() } @@ -849,33 +887,27 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> { /// Updates column writer metrics with each page metadata. #[inline] fn update_metrics_for_page(&mut self, page_spec: PageWriteSpec) { - self.total_uncompressed_size += page_spec.uncompressed_size as u64; - self.total_compressed_size += page_spec.compressed_size as u64; - self.total_num_values += page_spec.num_values as u64; - self.total_bytes_written += page_spec.bytes_written; + self.column_metrics.total_uncompressed_size += page_spec.uncompressed_size as u64; + self.column_metrics.total_compressed_size += page_spec.compressed_size as u64; + self.column_metrics.total_num_values += page_spec.num_values as u64; + self.column_metrics.total_bytes_written += page_spec.bytes_written; match page_spec.page_type { PageType::DATA_PAGE | PageType::DATA_PAGE_V2 => { - if self.data_page_offset.is_none() { - self.data_page_offset = Some(page_spec.offset); + if self.column_metrics.data_page_offset.is_none() { + self.column_metrics.data_page_offset = Some(page_spec.offset); } } PageType::DICTIONARY_PAGE => { assert!( - self.dictionary_page_offset.is_none(), + self.column_metrics.dictionary_page_offset.is_none(), "Dictionary offset is already set" ); - self.dictionary_page_offset = Some(page_spec.offset); + self.column_metrics.dictionary_page_offset = Some(page_spec.offset); } _ => {} } } - - /// Returns reference to the underlying page writer. - /// This method is intended to use in tests only. - fn get_page_writer_ref(&self) -> &dyn PageWriter { - self.page_writer.as_ref() - } } fn update_min( @@ -1064,7 +1096,7 @@ mod tests { writer::SerializedPageWriter, }; use crate::schema::types::{ColumnDescriptor, ColumnPath, Type as SchemaType}; - use crate::util::{io::FileSource, test_common::random_numbers_range}; + use crate::util::{io::FileSource, test_common::rand_gen::random_numbers_range}; use super::*; @@ -2371,20 +2403,6 @@ mod tests { get_typed_column_writer::<T>(column_writer) } - /// Returns decimals column reader. - fn get_test_decimals_column_reader<T: DataType>( - page_reader: Box<dyn PageReader>, - max_def_level: i16, - max_rep_level: i16, - ) -> ColumnReaderImpl<T> { - let descr = Arc::new(get_test_decimals_column_descr::<T>( - max_def_level, - max_rep_level, - )); - let column_reader = get_column_reader(descr, page_reader); - get_typed_column_reader::<T>(column_reader) - } - /// Returns descriptor for Decimal type with primitive column.
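With `LevelEncoder::put` and `LevelEncoder::consume` now infallible, `encode_levels` reduces to three calls. A usage sketch of the new API (crate-internal types, signatures as they appear in this patch):

    // Sketch, assuming the LevelEncoder signatures shown in this diff.
    fn encode_rep_levels(levels: &[i16], max_level: i16) -> Vec<u8> {
        // v1 constructor now takes a value-count capacity hint, not a buffer
        let mut encoder = LevelEncoder::v1(Encoding::RLE, max_level, levels.len());
        let num_encoded = encoder.put(levels); // returns a count, not a Result
        debug_assert_eq!(num_encoded, levels.len());
        encoder.consume() // returns Vec<u8> directly
    }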
fn get_test_decimals_column_descr( max_def_level: i16, @@ -2419,20 +2437,6 @@ mod tests { get_typed_column_writer::(column_writer) } - /// Returns column reader for UINT32 Column provided as ConvertedType only - fn get_test_unsigned_int_given_as_converted_column_reader( - page_reader: Box, - max_def_level: i16, - max_rep_level: i16, - ) -> ColumnReaderImpl { - let descr = Arc::new(get_test_converted_type_unsigned_integer_column_descr::( - max_def_level, - max_rep_level, - )); - let column_reader = get_column_reader(descr, page_reader); - get_typed_column_reader::(column_reader) - } - /// Returns column descriptor for UINT32 Column provided as ConvertedType only fn get_test_converted_type_unsigned_integer_column_descr( max_def_level: i16, diff --git a/parquet/src/compression.rs b/parquet/src/compression.rs index a5e49360a28a..ee5141cbe140 100644 --- a/parquet/src/compression.rs +++ b/parquet/src/compression.rs @@ -329,7 +329,7 @@ pub use zstd_codec::*; mod tests { use super::*; - use crate::util::test_common::*; + use crate::util::test_common::rand_gen::random_bytes; fn test_roundtrip(c: CodecType, data: &[u8]) { let mut c1 = create_codec(c).unwrap().unwrap(); diff --git a/parquet/src/data_type.rs b/parquet/src/data_type.rs index 1d0b5b231c6b..7870ca36a6d4 100644 --- a/parquet/src/data_type.rs +++ b/parquet/src/data_type.rs @@ -565,7 +565,7 @@ impl AsBytes for str { pub(crate) mod private { use crate::encodings::decoding::PlainDecoderDetails; - use crate::util::bit_util::{round_upto_power_of_2, BitReader, BitWriter}; + use crate::util::bit_util::{read_num_bytes, BitReader, BitWriter}; use crate::util::memory::ByteBufferPtr; use crate::basic::Type; @@ -574,8 +574,6 @@ pub(crate) mod private { use super::{ParquetError, Result, SliceAsBytes}; - pub type BitIndex = u64; - /// Sealed trait to start to remove specialisation from implementations /// /// This is done to force the associated value type to be unimplementable outside of this @@ -658,20 +656,8 @@ pub(crate) mod private { _: &mut W, bit_writer: &mut BitWriter, ) -> Result<()> { - if bit_writer.bytes_written() + values.len() / 8 >= bit_writer.capacity() { - let bits_available = - (bit_writer.capacity() - bit_writer.bytes_written()) * 8; - let bits_needed = values.len() - bits_available; - let bytes_needed = (bits_needed + 7) / 8; - let bytes_needed = round_upto_power_of_2(bytes_needed, 256); - bit_writer.extend(bytes_needed); - } for value in values { - if !bit_writer.put_value(*value as u64, 1) { - return Err(ParquetError::EOF( - "unable to put boolean value".to_string(), - )); - } + bit_writer.put_value(*value as u64, 1) } Ok(()) } @@ -722,19 +708,6 @@ pub(crate) mod private { } } - /// Hopelessly unsafe function that emulates `num::as_ne_bytes` - /// - /// It is not recommended to use this outside of this private module as, while it - /// _should_ work for primitive values, it is little better than a transmutation - /// and can act as a backdoor into mis-interpreting types as arbitary byte slices - #[inline] - fn as_raw<'a, T>(value: *const T) -> &'a [u8] { - unsafe { - let value = value as *const u8; - std::slice::from_raw_parts(value, std::mem::size_of::()) - } - } - macro_rules! impl_from_raw { ($ty: ty, $physical_ty: expr, $self: ident => $as_i64: block) => { impl ParquetValueType for $ty { @@ -919,21 +892,6 @@ pub(crate) mod private { } } - // TODO - Why does macro importing fail? - /// Reads `$size` of bytes from `$src`, and reinterprets them as type `$ty`, in - /// little-endian order. `$ty` must implement the `Default` trait. 
Otherwise this won't - /// compile. - /// This is copied and modified from byteorder crate. - macro_rules! read_num_bytes { - ($ty:ty, $size:expr, $src:expr) => {{ - assert!($size <= $src.len()); - let mut buffer = - <$ty as $crate::util::bit_util::FromBytes>::Buffer::default(); - buffer.as_mut()[..$size].copy_from_slice(&$src[..$size]); - <$ty>::from_ne_bytes(buffer) - }}; - } - impl ParquetValueType for super::ByteArray { const PHYSICAL_TYPE: Type = Type::BYTE_ARRAY; @@ -973,9 +931,9 @@ pub(crate) mod private { .as_mut() .expect("set_data should have been called"); let num_values = std::cmp::min(buffer.len(), decoder.num_values); - for i in 0..num_values { + for val_array in buffer.iter_mut().take(num_values) { let len: usize = - read_num_bytes!(u32, 4, data.start_from(decoder.start).as_ref()) + read_num_bytes::<u32>(4, data.start_from(decoder.start).as_ref()) as usize; decoder.start += std::mem::size_of::<u32>(); @@ -983,7 +941,7 @@ pub(crate) mod private { return Err(eof_err!("Not enough bytes to decode")); } - let val: &mut Self = buffer[i].as_mut_any().downcast_mut().unwrap(); + let val: &mut Self = val_array.as_mut_any().downcast_mut().unwrap(); val.set_data(data.range(decoder.start, len)); decoder.start += len; @@ -1002,7 +960,7 @@ pub(crate) mod private { for _ in 0..num_values { let len: usize = - read_num_bytes!(u32, 4, data.start_from(decoder.start).as_ref()) + read_num_bytes::<u32>(4, data.start_from(decoder.start).as_ref()) as usize; decoder.start += std::mem::size_of::<u32>() + len; } diff --git a/parquet/src/encodings/decoding.rs b/parquet/src/encodings/decoding.rs index 58aa592d1424..86941ffe0eeb 100644 --- a/parquet/src/encodings/decoding.rs +++ b/parquet/src/encodings/decoding.rs @@ -322,6 +322,12 @@ pub struct DictDecoder<T: DataType> { num_values: usize, } +impl<T: DataType> Default for DictDecoder<T> { + fn default() -> Self { + Self::new() + } +} + impl<T: DataType> DictDecoder<T> { /// Creates new dictionary decoder. pub fn new() -> Self { @@ -394,6 +400,12 @@ pub struct RleValueDecoder<T: DataType> { _phantom: PhantomData<T>, } +impl<T: DataType> Default for RleValueDecoder<T> { + fn default() -> Self { + Self::new() + } +} + impl<T: DataType> RleValueDecoder<T> { pub fn new() -> Self { Self { @@ -412,7 +424,7 @@ impl<T: DataType> Decoder<T> for RleValueDecoder<T> { // We still need to remove prefix of i32 from the stream. const I32_SIZE: usize = mem::size_of::<i32>(); - let data_size = read_num_bytes!(i32, I32_SIZE, data.as_ref()) as usize; + let data_size = bit_util::read_num_bytes::<i32>(I32_SIZE, data.as_ref()) as usize; self.decoder = RleDecoder::new(1); self.decoder.set_data(data.range(I32_SIZE, data_size)); self.values_left = num_values; @@ -485,6 +497,15 @@ pub struct DeltaBitPackDecoder<T: DataType> { last_value: T::T, } +impl<T: DataType> Default for DeltaBitPackDecoder<T> +where + T::T: Default + FromPrimitive + WrappingAdd + Copy, +{ + fn default() -> Self { + Self::new() + } +} + impl<T: DataType> DeltaBitPackDecoder<T> where T::T: Default + FromPrimitive + WrappingAdd + Copy, @@ -706,8 +727,6 @@ where Ok(to_read) } - - fn values_left(&self) -> usize { self.values_left } @@ -717,8 +736,61 @@ where } fn skip(&mut self, num_values: usize) -> Result<usize> { - let mut buffer = vec![T::T::default(); num_values]; - self.get(&mut buffer) + let mut skip = 0; + let to_skip = num_values.min(self.values_left); + if to_skip == 0 { + return Ok(0); + } + + // try to consume first value in header.
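The `read_num_bytes!` macro deleted above is superseded by a generic `read_num_bytes::<T>` function imported from `util::bit_util`. Its definition is not shown in this diff; a plausible sketch that mirrors the old macro body, assuming the same `FromBytes` trait with an associated `Buffer` type:

    // Sketch: reads `size` bytes from `src` and reinterprets them as `T` in
    // native-endian order, exactly as the deleted macro did.
    pub fn read_num_bytes<T: FromBytes>(size: usize, src: &[u8]) -> T {
        assert!(size <= src.len());
        let mut buffer = <T as FromBytes>::Buffer::default();
        buffer.as_mut()[..size].copy_from_slice(&src[..size]);
        T::from_ne_bytes(buffer)
    }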
+ if let Some(value) = self.first_value.take() { + self.last_value = value; + skip += 1; + self.values_left -= 1; + } + + let mini_block_batch_size = match T::T::PHYSICAL_TYPE { + Type::INT32 => 32, + Type::INT64 => 64, + _ => unreachable!(), + }; + + let mut skip_buffer = vec![T::T::default(); mini_block_batch_size]; + while skip < to_skip { + if self.mini_block_remaining == 0 { + self.next_mini_block()?; + } + + let bit_width = self.mini_block_bit_widths[self.mini_block_idx] as usize; + let mini_block_to_skip = self.mini_block_remaining.min(to_skip - skip); + let mini_block_should_skip = mini_block_to_skip; + + let skip_count = self + .bit_reader + .get_batch(&mut skip_buffer[0..mini_block_to_skip], bit_width); + + if skip_count != mini_block_to_skip { + return Err(general_err!( + "Expected to skip {} values from mini block got {}.", + mini_block_batch_size, + skip_count + )); + } + + for v in &mut skip_buffer[0..skip_count] { + *v = v + .wrapping_add(&self.min_delta) + .wrapping_add(&self.last_value); + + self.last_value = *v; + } + + skip += mini_block_should_skip; + self.mini_block_remaining -= mini_block_should_skip; + self.values_left -= mini_block_should_skip; + } + + Ok(to_skip) } } @@ -751,6 +823,12 @@ pub struct DeltaLengthByteArrayDecoder { _phantom: PhantomData, } +impl Default for DeltaLengthByteArrayDecoder { + fn default() -> Self { + Self::new() + } +} + impl DeltaLengthByteArrayDecoder { /// Creates new delta length byte array decoder. pub fn new() -> Self { @@ -829,7 +907,10 @@ impl Decoder for DeltaLengthByteArrayDecoder { Type::BYTE_ARRAY => { let num_values = cmp::min(num_values, self.num_values); - let next_offset: i32 = self.lengths[self.current_idx..self.current_idx + num_values].iter().sum(); + let next_offset: i32 = self.lengths + [self.current_idx..self.current_idx + num_values] + .iter() + .sum(); self.current_idx += num_values; self.offset += next_offset as usize; @@ -837,8 +918,9 @@ impl Decoder for DeltaLengthByteArrayDecoder { self.num_values -= num_values; Ok(num_values) } - other_type => Err(general_err!( - "DeltaLengthByteArrayDecoder not support {}, only support byte array", other_type + other_type => Err(general_err!( + "DeltaLengthByteArrayDecoder not support {}, only support byte array", + other_type )), } } @@ -874,6 +956,12 @@ pub struct DeltaByteArrayDecoder { _phantom: PhantomData, } +impl Default for DeltaByteArrayDecoder { + fn default() -> Self { + Self::new() + } +} + impl DeltaByteArrayDecoder { /// Creates new delta byte array decoder. 
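The new `skip` above avoids materializing a buffer for all `num_values`, but it still walks the deltas so that `last_value` is correct when reading resumes. Each packed entry is reconstructed with the same wrapping arithmetic used by `get`; a standalone sketch of that rule:

    // value[n] = value[n-1] + min_delta + packed_delta[n], all wrapping adds.
    // E.g. last = 10, min_delta = -2, packed = [0, 3, 1] -> [8, 9, 8].
    fn reconstruct(mut last: i64, min_delta: i64, packed: &[i64]) -> Vec<i64> {
        packed
            .iter()
            .map(|d| {
                last = last.wrapping_add(min_delta).wrapping_add(*d);
                last
            })
            .collect()
    }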
pub fn new() -> Self { @@ -990,7 +1078,7 @@ mod tests { use crate::schema::types::{ ColumnDescPtr, ColumnDescriptor, ColumnPath, Type as SchemaType, }; - use crate::util::{bit_util::set_array_bit, test_common::RandGen}; + use crate::util::test_common::rand_gen::RandGen; #[test] fn test_get_decoders() { @@ -1068,13 +1156,7 @@ mod tests { fn test_plain_skip_all_int32() { let data = vec![42, 18, 52]; let data_bytes = Int32Type::to_byte_array(&data[..]); - test_plain_skip::( - ByteBufferPtr::new(data_bytes), - 3, - 5, - -1, - &[], - ); + test_plain_skip::(ByteBufferPtr::new(data_bytes), 3, 5, -1, &[]); } #[test] @@ -1096,7 +1178,6 @@ mod tests { ); } - #[test] fn test_plain_decode_int64() { let data = vec![42, 18, 52]; @@ -1128,16 +1209,9 @@ mod tests { fn test_plain_skip_all_int64() { let data = vec![42, 18, 52]; let data_bytes = Int64Type::to_byte_array(&data[..]); - test_plain_skip::( - ByteBufferPtr::new(data_bytes), - 3, - 3, - -1, - &[], - ); + test_plain_skip::(ByteBufferPtr::new(data_bytes), 3, 3, -1, &[]); } - #[test] fn test_plain_decode_float() { let data = vec![3.14, 2.414, 12.51]; @@ -1169,13 +1243,7 @@ mod tests { fn test_plain_skip_all_float() { let data = vec![3.14, 2.414, 12.51]; let data_bytes = FloatType::to_byte_array(&data[..]); - test_plain_skip::( - ByteBufferPtr::new(data_bytes), - 3, - 4, - -1, - &[], - ); + test_plain_skip::(ByteBufferPtr::new(data_bytes), 3, 4, -1, &[]); } #[test] @@ -1195,13 +1263,7 @@ mod tests { fn test_plain_skip_all_double() { let data = vec![3.14f64, 2.414f64, 12.51f64]; let data_bytes = DoubleType::to_byte_array(&data[..]); - test_plain_skip::( - ByteBufferPtr::new(data_bytes), - 3, - 5, - -1, - &[], - ); + test_plain_skip::(ByteBufferPtr::new(data_bytes), 3, 5, -1, &[]); } #[test] @@ -1261,13 +1323,7 @@ mod tests { data[2].set_data(10, 20, 30); data[3].set_data(40, 50, 60); let data_bytes = Int96Type::to_byte_array(&data[..]); - test_plain_skip::( - ByteBufferPtr::new(data_bytes), - 4, - 8, - -1, - &[], - ); + test_plain_skip::(ByteBufferPtr::new(data_bytes), 4, 8, -1, &[]); } #[test] @@ -1307,16 +1363,9 @@ mod tests { false, true, false, false, true, false, true, true, false, true, ]; let data_bytes = BoolType::to_byte_array(&data[..]); - test_plain_skip::( - ByteBufferPtr::new(data_bytes), - 10, - 20, - -1, - &[], - ); + test_plain_skip::(ByteBufferPtr::new(data_bytes), 10, 20, -1, &[]); } - #[test] fn test_plain_decode_byte_array() { let mut data = vec![ByteArray::new(); 2]; @@ -1354,13 +1403,7 @@ mod tests { data[0].set_data(ByteBufferPtr::new(String::from("hello").into_bytes())); data[1].set_data(ByteBufferPtr::new(String::from("parquet").into_bytes())); let data_bytes = ByteArrayType::to_byte_array(&data[..]); - test_plain_skip::( - ByteBufferPtr::new(data_bytes), - 2, - 2, - -1, - &[], - ); + test_plain_skip::(ByteBufferPtr::new(data_bytes), 2, 2, -1, &[]); } #[test] @@ -1587,7 +1630,6 @@ mod tests { ]; test_skip::(block_data.clone(), Encoding::DELTA_BINARY_PACKED, 5); test_skip::(block_data, Encoding::DELTA_BINARY_PACKED, 100); - } #[test] @@ -1833,8 +1875,7 @@ mod tests { let col_descr = create_test_col_desc_ptr(-1, T::get_physical_type()); // Encode data - let mut encoder = - get_encoder::(col_descr.clone(), encoding).expect("get encoder"); + let mut encoder = get_encoder::(encoding).expect("get encoder"); for v in &data[..] 
{ encoder.put(&v[..]).expect("ok to encode"); @@ -1867,17 +1908,14 @@ mod tests { let col_descr = create_test_col_desc_ptr(-1, T::get_physical_type()); // Encode data - let mut encoder = - get_encoder::(col_descr.clone(), encoding).expect("get encoder"); + let mut encoder = get_encoder::(encoding).expect("get encoder"); encoder.put(&data).expect("ok to encode"); let bytes = encoder.flush_buffer().expect("ok to flush buffer"); let mut decoder = get_decoder::(col_descr, encoding).expect("get decoder"); - decoder - .set_data(bytes, data.len()) - .expect("ok to set data"); + decoder.set_data(bytes, data.len()).expect("ok to set data"); if skip >= data.len() { let skipped = decoder.skip(skip).expect("ok to skip"); @@ -1894,7 +1932,7 @@ mod tests { let expected = &data[skip..]; let mut buffer = vec![T::T::default(); remaining]; let fetched = decoder.get(&mut buffer).expect("ok to decode"); - assert_eq!(remaining,fetched); + assert_eq!(remaining, fetched); assert_eq!(&buffer, expected); } } @@ -1966,7 +2004,7 @@ mod tests { v.push(0); } if *item { - set_array_bit(&mut v[..], i); + v[i / 8] |= 1 << (i % 8); } } v diff --git a/parquet/src/encodings/encoding/dict_encoder.rs b/parquet/src/encodings/encoding/dict_encoder.rs index 7bf983254666..18deba65e687 100644 --- a/parquet/src/encodings/encoding/dict_encoder.rs +++ b/parquet/src/encodings/encoding/dict_encoder.rs @@ -20,15 +20,14 @@ use crate::basic::{Encoding, Type}; use crate::data_type::private::ParquetValueType; -use crate::data_type::{AsBytes, DataType}; +use crate::data_type::DataType; use crate::encodings::encoding::{Encoder, PlainEncoder}; use crate::encodings::rle::RleEncoder; -use crate::errors::{ParquetError, Result}; +use crate::errors::Result; use crate::schema::types::ColumnDescPtr; use crate::util::bit_util::num_required_bits; use crate::util::interner::{Interner, Storage}; use crate::util::memory::ByteBufferPtr; -use std::io::Write; #[derive(Debug)] struct KeyStorage { @@ -74,9 +73,6 @@ impl Storage for KeyStorage { /// (max bit width = 32), followed by the values encoded using RLE/Bit packed described /// above (with the given bit width). pub struct DictEncoder { - /// Descriptor for the column to be encoded. - desc: ColumnDescPtr, - interner: Interner>, /// The buffered indices @@ -93,7 +89,6 @@ impl DictEncoder { }; Self { - desc, interner: Interner::new(storage), indices: vec![], } @@ -118,7 +113,7 @@ impl DictEncoder { /// Writes out the dictionary values with PLAIN encoding in a byte buffer, and return /// the result. pub fn write_dict(&self) -> Result { - let mut plain_encoder = PlainEncoder::::new(self.desc.clone(), vec![]); + let mut plain_encoder = PlainEncoder::::new(); plain_encoder.put(&self.interner.storage().uniques)?; plain_encoder.flush_buffer() } @@ -127,19 +122,16 @@ impl DictEncoder { /// the result. pub fn write_indices(&mut self) -> Result { let buffer_len = self.estimated_data_encoded_size(); - let mut buffer = vec![0; buffer_len]; - buffer[0] = self.bit_width() as u8; + let mut buffer = Vec::with_capacity(buffer_len); + buffer.push(self.bit_width() as u8); // Write bit width in the first byte - buffer.write_all((self.bit_width() as u8).as_bytes())?; - let mut encoder = RleEncoder::new_from_buf(self.bit_width(), buffer, 1); + let mut encoder = RleEncoder::new_from_buf(self.bit_width(), buffer); for index in &self.indices { - if !encoder.put(*index as u64)? 
{ - return Err(general_err!("Encoder doesn't have enough space")); - } + encoder.put(*index as u64) } self.indices.clear(); - Ok(ByteBufferPtr::new(encoder.consume()?)) + Ok(ByteBufferPtr::new(encoder.consume())) } fn put_one(&mut self, value: &T::T) { @@ -148,12 +140,7 @@ impl DictEncoder { #[inline] fn bit_width(&self) -> u8 { - let num_entries = self.num_entries(); - if num_entries <= 1 { - num_entries as u8 - } else { - num_required_bits(num_entries as u64 - 1) - } + num_required_bits(self.num_entries().saturating_sub(1) as u64) } } diff --git a/parquet/src/encodings/encoding/mod.rs b/parquet/src/encodings/encoding/mod.rs index 5cb94b7c0aeb..050f1b9f8a63 100644 --- a/parquet/src/encodings/encoding/mod.rs +++ b/parquet/src/encodings/encoding/mod.rs @@ -24,7 +24,6 @@ use crate::data_type::private::ParquetValueType; use crate::data_type::*; use crate::encodings::rle::RleEncoder; use crate::errors::{ParquetError, Result}; -use crate::schema::types::ColumnDescPtr; use crate::util::{ bit_util::{self, num_required_bits, BitWriter}, memory::ByteBufferPtr, @@ -76,12 +75,9 @@ pub trait Encoder { /// Gets a encoder for the particular data type `T` and encoding `encoding`. Memory usage /// for the encoder instance is tracked by `mem_tracker`. -pub fn get_encoder( - desc: ColumnDescPtr, - encoding: Encoding, -) -> Result>> { +pub fn get_encoder(encoding: Encoding) -> Result>> { let encoder: Box> = match encoding { - Encoding::PLAIN => Box::new(PlainEncoder::new(desc, vec![])), + Encoding::PLAIN => Box::new(PlainEncoder::new()), Encoding::RLE_DICTIONARY | Encoding::PLAIN_DICTIONARY => { return Err(general_err!( "Cannot initialize this encoding through this function" @@ -113,17 +109,21 @@ pub fn get_encoder( pub struct PlainEncoder { buffer: Vec, bit_writer: BitWriter, - desc: ColumnDescPtr, _phantom: PhantomData, } +impl Default for PlainEncoder { + fn default() -> Self { + Self::new() + } +} + impl PlainEncoder { /// Creates new plain encoder. - pub fn new(desc: ColumnDescPtr, buffer: Vec) -> Self { + pub fn new() -> Self { Self { - buffer, + buffer: vec![], bit_writer: BitWriter::new(256), - desc, _phantom: PhantomData, } } @@ -171,6 +171,12 @@ pub struct RleValueEncoder { _phantom: PhantomData, } +impl Default for RleValueEncoder { + fn default() -> Self { + Self::new() + } +} + impl RleValueEncoder { /// Creates new rle value encoder. pub fn new() -> Self { @@ -186,15 +192,16 @@ impl Encoder for RleValueEncoder { fn put(&mut self, values: &[T::T]) -> Result<()> { ensure_phys_ty!(Type::BOOLEAN, "RleValueEncoder only supports BoolType"); - if self.encoder.is_none() { - self.encoder = Some(RleEncoder::new(1, DEFAULT_RLE_BUFFER_LEN)); - } - let rle_encoder = self.encoder.as_mut().unwrap(); + let rle_encoder = self.encoder.get_or_insert_with(|| { + let mut buffer = Vec::with_capacity(DEFAULT_RLE_BUFFER_LEN); + // Reserve space for length + buffer.extend_from_slice(&[0; 4]); + RleEncoder::new_from_buf(1, buffer) + }); + for value in values { let value = value.as_u64()?; - if !rle_encoder.put(value)? 
{ - return Err(general_err!("RLE buffer is full")); - } + rle_encoder.put(value) } Ok(()) } @@ -220,25 +227,18 @@ impl Encoder for RleValueEncoder { ensure_phys_ty!(Type::BOOLEAN, "RleValueEncoder only supports BoolType"); let rle_encoder = self .encoder - .as_mut() + .take() .expect("RLE value encoder is not initialized"); // Flush all encoder buffers and raw values - let encoded_data = { - let buf = rle_encoder.flush_buffer()?; - - // Note that buf does not have any offset, all data is encoded bytes - let len = (buf.len() as i32).to_le(); - let len_bytes = len.as_bytes(); - let mut encoded_data = vec![]; - encoded_data.extend_from_slice(len_bytes); - encoded_data.extend_from_slice(buf); - encoded_data - }; - // Reset rle encoder for the next batch - rle_encoder.clear(); + let mut buf = rle_encoder.consume(); + assert!(buf.len() > 4, "should have had padding inserted"); - Ok(ByteBufferPtr::new(encoded_data)) + // Note that buf does not have any offset, all data is encoded bytes + let len = (buf.len() - 4) as i32; + buf[..4].copy_from_slice(&len.to_le_bytes()); + + Ok(ByteBufferPtr::new(buf)) } } @@ -247,7 +247,6 @@ impl Encoder for RleValueEncoder { const MAX_PAGE_HEADER_WRITER_SIZE: usize = 32; const MAX_BIT_WRITER_SIZE: usize = 10 * 1024 * 1024; -const DEFAULT_BLOCK_SIZE: usize = 128; const DEFAULT_NUM_MINI_BLOCKS: usize = 4; /// Delta bit packed encoder. @@ -287,15 +286,28 @@ pub struct DeltaBitPackEncoder { _phantom: PhantomData, } +impl Default for DeltaBitPackEncoder { + fn default() -> Self { + Self::new() + } +} + impl DeltaBitPackEncoder { /// Creates new delta bit packed encoder. pub fn new() -> Self { - let block_size = DEFAULT_BLOCK_SIZE; - let num_mini_blocks = DEFAULT_NUM_MINI_BLOCKS; - let mini_block_size = block_size / num_mini_blocks; - assert!(mini_block_size % 8 == 0); Self::assert_supported_type(); + // Size miniblocks so that they can be efficiently decoded + let mini_block_size = match T::T::PHYSICAL_TYPE { + Type::INT32 => 32, + Type::INT64 => 64, + _ => unreachable!(), + }; + + let num_mini_blocks = DEFAULT_NUM_MINI_BLOCKS; + let block_size = mini_block_size * num_mini_blocks; + assert_eq!(block_size % 128, 0); + DeltaBitPackEncoder { page_header_writer: BitWriter::new(MAX_PAGE_HEADER_WRITER_SIZE), bit_writer: BitWriter::new(MAX_BIT_WRITER_SIZE), @@ -346,7 +358,7 @@ impl DeltaBitPackEncoder { self.bit_writer.put_zigzag_vlq_int(min_delta); // Slice to store bit width for each mini block - let offset = self.bit_writer.skip(self.num_mini_blocks)?; + let offset = self.bit_writer.skip(self.num_mini_blocks); for i in 0..self.num_mini_blocks { // Find how many values we need to encode - either block size or whatever @@ -364,7 +376,7 @@ impl DeltaBitPackEncoder { } // Compute the max delta in current mini block - let mut max_delta = i64::min_value(); + let mut max_delta = i64::MIN; for j in 0..n { max_delta = cmp::max(max_delta, self.deltas[i * self.mini_block_size + j]); @@ -531,6 +543,12 @@ pub struct DeltaLengthByteArrayEncoder { _phantom: PhantomData, } +impl Default for DeltaLengthByteArrayEncoder { + fn default() -> Self { + Self::new() + } +} + impl DeltaLengthByteArrayEncoder { /// Creates new delta length byte array encoder. pub fn new() -> Self { @@ -610,6 +628,12 @@ pub struct DeltaByteArrayEncoder { _phantom: PhantomData, } +impl Default for DeltaByteArrayEncoder { + fn default() -> Self { + Self::new() + } +} + impl DeltaByteArrayEncoder { /// Creates new delta byte array encoder. 
pub fn new() -> Self { @@ -705,7 +729,7 @@ mod tests { use crate::schema::types::{ ColumnDescPtr, ColumnDescriptor, ColumnPath, Type as SchemaType, }; - use crate::util::test_common::{random_bytes, RandGen}; + use crate::util::test_common::rand_gen::{random_bytes, RandGen}; const TEST_SET_SIZE: usize = 1024; @@ -847,7 +871,7 @@ mod tests { Encoding::PLAIN_DICTIONARY | Encoding::RLE_DICTIONARY => { Box::new(create_test_dict_encoder::(type_length)) } - _ => create_test_encoder::(type_length, encoding), + _ => create_test_encoder::(encoding), }; assert_eq!(encoder.estimated_data_encoded_size(), initial_size); @@ -873,7 +897,7 @@ mod tests { let mut values = vec![]; values.extend_from_slice(&[true; 16]); values.extend_from_slice(&[false; 16]); - run_test::(Encoding::RLE, -1, &values, 0, 2, 0); + run_test::(Encoding::RLE, -1, &values, 0, 6, 0); // DELTA_LENGTH_BYTE_ARRAY run_test::( @@ -900,7 +924,7 @@ mod tests { #[test] fn test_issue_47() { let mut encoder = - create_test_encoder::(0, Encoding::DELTA_BYTE_ARRAY); + create_test_encoder::(Encoding::DELTA_BYTE_ARRAY); let mut decoder = create_test_decoder::(0, Encoding::DELTA_BYTE_ARRAY); @@ -952,7 +976,7 @@ mod tests { impl> EncodingTester for T { fn test_internal(enc: Encoding, total: usize, type_length: i32) -> Result<()> { - let mut encoder = create_test_encoder::(type_length, enc); + let mut encoder = create_test_encoder::(enc); let mut decoder = create_test_decoder::(type_length, enc); let mut values = >::gen_vec(type_length, total); let mut result_data = vec![T::T::default(); total]; @@ -1054,8 +1078,7 @@ mod tests { encoding: Encoding, err: Option, ) { - let descr = create_test_col_desc_ptr(-1, T::get_physical_type()); - let encoder = get_encoder::(descr, encoding); + let encoder = get_encoder::(encoding); match err { Some(parquet_error) => { assert!(encoder.is_err()); @@ -1082,12 +1105,8 @@ mod tests { )) } - fn create_test_encoder( - type_len: i32, - enc: Encoding, - ) -> Box> { - let desc = create_test_col_desc_ptr(type_len, T::get_physical_type()); - get_encoder(desc, enc).unwrap() + fn create_test_encoder(enc: Encoding) -> Box> { + get_encoder(enc).unwrap() } fn create_test_decoder( diff --git a/parquet/src/encodings/levels.rs b/parquet/src/encodings/levels.rs index 28fb63881693..95384926ddba 100644 --- a/parquet/src/encodings/levels.rs +++ b/parquet/src/encodings/levels.rs @@ -21,9 +21,9 @@ use super::rle::{RleDecoder, RleEncoder}; use crate::basic::Encoding; use crate::data_type::AsBytes; -use crate::errors::{ParquetError, Result}; +use crate::errors::Result; use crate::util::{ - bit_util::{ceil, num_required_bits, BitReader, BitWriter}, + bit_util::{ceil, num_required_bits, read_num_bytes, BitReader, BitWriter}, memory::ByteBufferPtr, }; @@ -65,22 +65,21 @@ impl LevelEncoder { /// Used to encode levels for Data Page v1. /// /// Panics, if encoding is not supported. 
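Both level encoders derive their bit width from the maximum level via `num_required_bits` (see the constructors below). For a flat nullable column `max_level` is 1 and a single bit per level suffices. A sketch of the relationship, with one possible implementation of the helper:

    // num_required_bits(x): bits needed to represent x (0 for x == 0).
    fn num_required_bits(x: u64) -> u8 {
        (64 - x.leading_zeros()) as u8
    }

    fn main() {
        assert_eq!(num_required_bits(1), 1); // max_level = 1 -> 1 bit
        assert_eq!(num_required_bits(3), 2); // max_level = 3 -> 2 bits
        assert_eq!(num_required_bits(i16::MAX as u64), 15); // worst case
    }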
- pub fn v1(encoding: Encoding, max_level: i16, byte_buffer: Vec) -> Self { + pub fn v1(encoding: Encoding, max_level: i16, capacity: usize) -> Self { + let capacity_bytes = max_buffer_size(encoding, max_level, capacity); + let mut buffer = Vec::with_capacity(capacity_bytes); let bit_width = num_required_bits(max_level as u64); match encoding { - Encoding::RLE => LevelEncoder::Rle(RleEncoder::new_from_buf( - bit_width, - byte_buffer, - mem::size_of::(), - )), + Encoding::RLE => { + // Reserve space for length header + buffer.extend_from_slice(&[0; 4]); + LevelEncoder::Rle(RleEncoder::new_from_buf(bit_width, buffer)) + } Encoding::BIT_PACKED => { // Here we set full byte buffer without adjusting for num_buffered_values, // because byte buffer will already be allocated with size from // `max_buffer_size()` method. - LevelEncoder::BitPacked( - bit_width, - BitWriter::new_from_buf(byte_buffer, 0), - ) + LevelEncoder::BitPacked(bit_width, BitWriter::new_from_buf(buffer)) } _ => panic!("Unsupported encoding type {}", encoding), } @@ -88,59 +87,54 @@ impl LevelEncoder { /// Creates new level encoder based on RLE encoding. Used to encode Data Page v2 /// repetition and definition levels. - pub fn v2(max_level: i16, byte_buffer: Vec) -> Self { + pub fn v2(max_level: i16, capacity: usize) -> Self { + let capacity_bytes = max_buffer_size(Encoding::RLE, max_level, capacity); + let buffer = Vec::with_capacity(capacity_bytes); let bit_width = num_required_bits(max_level as u64); - LevelEncoder::RleV2(RleEncoder::new_from_buf(bit_width, byte_buffer, 0)) + LevelEncoder::RleV2(RleEncoder::new_from_buf(bit_width, buffer)) } /// Put/encode levels vector into this level encoder. /// Returns number of encoded values that are less than or equal to length of the /// input buffer. - /// - /// RLE and BIT_PACKED level encoders return Err() when internal buffer overflows or - /// flush fails. #[inline] - pub fn put(&mut self, buffer: &[i16]) -> Result { + pub fn put(&mut self, buffer: &[i16]) -> usize { let mut num_encoded = 0; match *self { LevelEncoder::Rle(ref mut encoder) | LevelEncoder::RleV2(ref mut encoder) => { for value in buffer { - if !encoder.put(*value as u64)? { - return Err(general_err!("RLE buffer is full")); - } + encoder.put(*value as u64); num_encoded += 1; } - encoder.flush()?; + encoder.flush(); } LevelEncoder::BitPacked(bit_width, ref mut encoder) => { for value in buffer { - if !encoder.put_value(*value as u64, bit_width as usize) { - return Err(general_err!("Not enough bytes left")); - } + encoder.put_value(*value as u64, bit_width as usize); num_encoded += 1; } encoder.flush(); } } - Ok(num_encoded) + num_encoded } /// Finalizes level encoder, flush all intermediate buffers and return resulting /// encoded buffer. Returned buffer is already truncated to encoded bytes only. 
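The v1 RLE constructor above reserves four bytes up front; `consume` (below) then back-patches them with the encoded length as a little-endian `i32`, which is the framing Data Page v1 expects. The same pattern appears in `RleValueEncoder`'s flush path earlier in this patch. A sketch of the patching step:

    // `encoded` = 4 placeholder bytes followed by the RLE-encoded body.
    fn patch_v1_length_header(mut encoded: Vec<u8>) -> Vec<u8> {
        let body_len = (encoded.len() - 4) as i32;
        encoded[..4].copy_from_slice(&body_len.to_le_bytes());
        encoded
    }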
#[inline] - pub fn consume(self) -> Result> { + pub fn consume(self) -> Vec { match self { LevelEncoder::Rle(encoder) => { - let mut encoded_data = encoder.consume()?; + let mut encoded_data = encoder.consume(); // Account for the buffer offset let encoded_len = encoded_data.len() - mem::size_of::(); let len = (encoded_len as i32).to_le(); let len_bytes = len.as_bytes(); encoded_data[0..len_bytes.len()].copy_from_slice(len_bytes); - Ok(encoded_data) + encoded_data } LevelEncoder::RleV2(encoder) => encoder.consume(), - LevelEncoder::BitPacked(_, encoder) => Ok(encoder.consume()), + LevelEncoder::BitPacked(_, encoder) => encoder.consume(), } } } @@ -148,12 +142,14 @@ impl LevelEncoder { /// Decoder for definition/repetition levels. /// Currently only supports RLE and BIT_PACKED encoding for Data Page v1 and /// RLE for Data Page v2. +#[allow(unused)] pub enum LevelDecoder { Rle(Option, RleDecoder), RleV2(Option, RleDecoder), BitPacked(Option, u8, BitReader), } +#[allow(unused)] impl LevelDecoder { /// Creates new level decoder based on encoding and max definition/repetition level. /// This method only initializes level decoder, `set_data` method must be called @@ -196,7 +192,7 @@ impl LevelDecoder { LevelDecoder::Rle(ref mut num_values, ref mut decoder) => { *num_values = Some(num_buffered_values); let i32_size = mem::size_of::(); - let data_size = read_num_bytes!(i32, i32_size, data.as_ref()) as usize; + let data_size = read_num_bytes::(i32_size, data.as_ref()) as usize; decoder.set_data(data.range(i32_size, data_size)); i32_size + data_size } @@ -280,17 +276,16 @@ impl LevelDecoder { mod tests { use super::*; - use crate::util::test_common::random_numbers_range; + use crate::util::test_common::rand_gen::random_numbers_range; fn test_internal_roundtrip(enc: Encoding, levels: &[i16], max_level: i16, v2: bool) { - let size = max_buffer_size(enc, max_level, levels.len()); let mut encoder = if v2 { - LevelEncoder::v2(max_level, vec![0; size]) + LevelEncoder::v2(max_level, levels.len()) } else { - LevelEncoder::v1(enc, max_level, vec![0; size]) + LevelEncoder::v1(enc, max_level, levels.len()) }; - encoder.put(levels).expect("put() should be OK"); - let encoded_levels = encoder.consume().expect("consume() should be OK"); + encoder.put(levels); + let encoded_levels = encoder.consume(); let byte_buf = ByteBufferPtr::new(encoded_levels); let mut decoder; @@ -315,14 +310,13 @@ mod tests { max_level: i16, v2: bool, ) { - let size = max_buffer_size(enc, max_level, levels.len()); let mut encoder = if v2 { - LevelEncoder::v2(max_level, vec![0; size]) + LevelEncoder::v2(max_level, levels.len()) } else { - LevelEncoder::v1(enc, max_level, vec![0; size]) + LevelEncoder::v1(enc, max_level, levels.len()) }; - encoder.put(levels).expect("put() should be OK"); - let encoded_levels = encoder.consume().expect("consume() should be OK"); + encoder.put(levels); + let encoded_levels = encoder.consume(); let byte_buf = ByteBufferPtr::new(encoded_levels); let mut decoder; @@ -363,15 +357,14 @@ mod tests { max_level: i16, v2: bool, ) { - let size = max_buffer_size(enc, max_level, levels.len()); let mut encoder = if v2 { - LevelEncoder::v2(max_level, vec![0; size]) + LevelEncoder::v2(max_level, levels.len()) } else { - LevelEncoder::v1(enc, max_level, vec![0; size]) + LevelEncoder::v1(enc, max_level, levels.len()) }; // Encode only one value - let num_encoded = encoder.put(&levels[0..1]).expect("put() should be OK"); - let encoded_levels = encoder.consume().expect("consume() should be OK"); + let num_encoded = 
encoder.put(&levels[0..1]); + let encoded_levels = encoder.consume(); assert_eq!(num_encoded, 1); let byte_buf = ByteBufferPtr::new(encoded_levels); @@ -391,33 +384,6 @@ mod tests { assert_eq!(buffer[0..num_decoded], levels[0..num_decoded]); } - // Tests when encoded values are larger than encoder's buffer - fn test_internal_roundtrip_overflow( - enc: Encoding, - levels: &[i16], - max_level: i16, - v2: bool, - ) { - let size = max_buffer_size(enc, max_level, levels.len()); - let mut encoder = if v2 { - LevelEncoder::v2(max_level, vec![0; size]) - } else { - LevelEncoder::v1(enc, max_level, vec![0; size]) - }; - let mut found_err = false; - // Insert a large number of values, so we run out of space - for _ in 0..100 { - if let Err(err) = encoder.put(levels) { - assert!(format!("{}", err).contains("Not enough bytes left")); - found_err = true; - break; - }; - } - if !found_err { - panic!("Failed test: no buffer overflow"); - } - } - #[test] fn test_roundtrip_one() { let levels = vec![0, 1, 1, 1, 1, 0, 0, 0, 0, 1]; @@ -470,6 +436,15 @@ mod tests { test_internal_roundtrip(Encoding::RLE, &levels, max_level, true); } + #[test] + fn test_rountrip_max() { + let levels = vec![0, i16::MAX, i16::MAX, i16::MAX, 0]; + let max_level = i16::MAX; + test_internal_roundtrip(Encoding::RLE, &levels, max_level, false); + test_internal_roundtrip(Encoding::BIT_PACKED, &levels, max_level, false); + test_internal_roundtrip(Encoding::RLE, &levels, max_level, true); + } + #[test] fn test_roundtrip_underflow() { let levels = vec![1, 1, 2, 3, 2, 1, 1, 2, 3, 1]; @@ -484,15 +459,6 @@ mod tests { test_internal_roundtrip_underflow(Encoding::RLE, &levels, max_level, true); } - #[test] - fn test_roundtrip_overflow() { - let levels = vec![1, 1, 2, 3, 2, 1, 1, 2, 3, 1]; - let max_level = 3; - test_internal_roundtrip_overflow(Encoding::RLE, &levels, max_level, false); - test_internal_roundtrip_overflow(Encoding::BIT_PACKED, &levels, max_level, false); - test_internal_roundtrip_overflow(Encoding::RLE, &levels, max_level, true); - } - #[test] fn test_rle_decoder_set_data_range() { // Buffer containing both repetition and definition levels diff --git a/parquet/src/encodings/rle.rs b/parquet/src/encodings/rle.rs index 8a19dd5452a9..39a0aa4d03da 100644 --- a/parquet/src/encodings/rle.rs +++ b/parquet/src/encodings/rle.rs @@ -45,7 +45,6 @@ use crate::util::{ /// Maximum groups per bit-packed run. Current value is 64. const MAX_GROUPS_PER_BIT_PACKED_RUN: usize = 1 << 6; const MAX_VALUES_PER_BIT_PACKED_RUN: usize = MAX_GROUPS_PER_BIT_PACKED_RUN * 8; -const MAX_WRITER_BUF_SIZE: usize = 1 << 10; /// A RLE/Bit-Packing hybrid encoder. // TODO: tracking memory usage @@ -56,9 +55,6 @@ pub struct RleEncoder { // Underlying writer which holds an internal buffer. bit_writer: BitWriter, - // The maximum byte size a single run can take. - max_run_byte_size: usize, - // Buffered values for bit-packed runs. buffered_values: [u64; 8], @@ -82,26 +78,18 @@ pub struct RleEncoder { } impl RleEncoder { + #[allow(unused)] pub fn new(bit_width: u8, buffer_len: usize) -> Self { - let buffer = vec![0; buffer_len]; - RleEncoder::new_from_buf(bit_width, buffer, 0) - } - - /// Initialize the encoder from existing `buffer` and the starting offset `start`. 
- pub fn new_from_buf(bit_width: u8, buffer: Vec<u8>, start: usize) -> Self { - assert!(bit_width <= 64, "bit_width ({}) out of range.", bit_width); - let max_run_byte_size = RleEncoder::min_buffer_size(bit_width); - assert!( - buffer.len() >= max_run_byte_size, - "buffer length {} must be greater than {}", - buffer.len(), - max_run_byte_size - ); - let bit_writer = BitWriter::new_from_buf(buffer, start); + let buffer = Vec::with_capacity(buffer_len); + RleEncoder::new_from_buf(bit_width, buffer) + } + + /// Initialize the encoder from existing `buffer` + pub fn new_from_buf(bit_width: u8, buffer: Vec<u8>) -> Self { + let bit_writer = BitWriter::new_from_buf(buffer); RleEncoder { bit_width, bit_writer, - max_run_byte_size, buffered_values: [0; 8], num_buffered_values: 0, current_value: 0, @@ -139,23 +127,21 @@ impl RleEncoder { } /// Encodes `value`, which must be representable with `bit_width` bits. - /// Returns true if the value fits in buffer, false if it doesn't, or - /// error if something is wrong. #[inline] - pub fn put(&mut self, value: u64) -> Result<bool> { + pub fn put(&mut self, value: u64) { // This function buffers 8 values at a time. After seeing 8 values, it // decides whether the current run should be encoded in bit-packed or RLE. if self.current_value == value { self.repeat_count += 1; if self.repeat_count > 8 { // A continuation of last value. No need to buffer. - return Ok(true); + return; } } else { if self.repeat_count >= 8 { // The current RLE run has ended and we've gathered enough. Flush first. assert_eq!(self.bit_packed_count, 0); - self.flush_rle_run()?; + self.flush_rle_run(); } self.repeat_count = 1; self.current_value = value; @@ -166,13 +152,12 @@ impl RleEncoder { if self.num_buffered_values == 8 { // Buffered values are full. Flush them. assert_eq!(self.bit_packed_count % 8, 0); - self.flush_buffered_values()?; + self.flush_buffered_values(); } - - Ok(true) } #[inline] + #[allow(unused)] pub fn buffer(&self) -> &[u8] { self.bit_writer.buffer() } @@ -182,27 +167,30 @@ impl RleEncoder { self.bit_writer.bytes_written() } + #[allow(unused)] pub fn is_empty(&self) -> bool { self.bit_writer.bytes_written() == 0 } #[inline] - pub fn consume(mut self) -> Result<Vec<u8>> { - self.flush()?; - Ok(self.bit_writer.consume()) + pub fn consume(mut self) -> Vec<u8> { + self.flush(); + self.bit_writer.consume() } /// Borrow equivalent of the `consume` method. /// Call `clear()` after invoking this method. #[inline] - pub fn flush_buffer(&mut self) -> Result<&[u8]> { - self.flush()?; - Ok(self.bit_writer.flush_buffer()) + #[allow(unused)] + pub fn flush_buffer(&mut self) -> &[u8] { + self.flush(); + self.bit_writer.flush_buffer() } /// Clears the internal state so this encoder can be reused (e.g., after becoming /// full). #[inline] + #[allow(unused)] pub fn clear(&mut self) { self.bit_writer.clear(); self.num_buffered_values = 0; @@ -215,7 +203,7 @@ impl RleEncoder { /// Flushes all remaining values and return the final byte buffer maintained by the /// internal writer. #[inline] - pub fn flush(&mut self) -> Result<()> { + pub fn flush(&mut self) { if self.bit_packed_count > 0 || self.repeat_count > 0 || self.num_buffered_values > 0 { let all_repeat = self.bit_packed_count == 0 && (self.repeat_count == self.num_buffered_values || self.num_buffered_values == 0); if self.repeat_count > 0 && all_repeat { - self.flush_rle_run()?; + self.flush_rle_run(); } else { // Buffer the last group of bit-packed values to 8 by padding with 0s.
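Since the encoder now writes into a growable `Vec<u8>`, the whole put/consume cycle is infallible, which is what lets every `?` and `Result` disappear from the call sites above. Usage matching the updated tests later in this patch:

    fn rle_encode(values: &[i16], bit_width: u8) -> Vec<u8> {
        // the length argument is only a capacity hint now
        let mut encoder = RleEncoder::new(bit_width, 1024);
        for v in values {
            encoder.put(*v as u64); // no Result: the buffer grows on demand
        }
        encoder.consume()
    }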
if self.num_buffered_values > 0 { @@ -234,38 +222,32 @@ impl RleEncoder { } } self.bit_packed_count += self.num_buffered_values; - self.flush_bit_packed_run(true)?; + self.flush_bit_packed_run(true); self.repeat_count = 0; } } - Ok(()) } - fn flush_rle_run(&mut self) -> Result<()> { + fn flush_rle_run(&mut self) { assert!(self.repeat_count > 0); let indicator_value = self.repeat_count << 1; - let mut result = self.bit_writer.put_vlq_int(indicator_value as u64); - result &= self.bit_writer.put_aligned( + self.bit_writer.put_vlq_int(indicator_value as u64); + self.bit_writer.put_aligned( self.current_value, bit_util::ceil(self.bit_width as i64, 8) as usize, ); - if !result { - return Err(general_err!("Failed to write RLE run")); - } self.num_buffered_values = 0; self.repeat_count = 0; - Ok(()) } - fn flush_bit_packed_run(&mut self, update_indicator_byte: bool) -> Result<()> { + fn flush_bit_packed_run(&mut self, update_indicator_byte: bool) { if self.indicator_byte_pos < 0 { - self.indicator_byte_pos = self.bit_writer.skip(1)? as i64; + self.indicator_byte_pos = self.bit_writer.skip(1) as i64; } // Write all buffered values as bit-packed literals for i in 0..self.num_buffered_values { - let _ = self - .bit_writer + self.bit_writer .put_value(self.buffered_values[i], self.bit_width as usize); } self.num_buffered_values = 0; @@ -273,30 +255,27 @@ impl RleEncoder { // Write the indicator byte to the reserved position in `bit_writer` let num_groups = self.bit_packed_count / 8; let indicator_byte = ((num_groups << 1) | 1) as u8; - if !self.bit_writer.put_aligned_offset( + self.bit_writer.put_aligned_offset( indicator_byte, 1, self.indicator_byte_pos as usize, - ) { - return Err(general_err!("Not enough space to write indicator byte")); - } + ); self.indicator_byte_pos = -1; self.bit_packed_count = 0; } - Ok(()) } #[inline(never)] - fn flush_buffered_values(&mut self) -> Result<()> { + fn flush_buffered_values(&mut self) { if self.repeat_count >= 8 { self.num_buffered_values = 0; if self.bit_packed_count > 0 { // In this case we choose RLE encoding. Flush the current buffered values // as bit-packed encoding. assert_eq!(self.bit_packed_count % 8, 0); - self.flush_bit_packed_run(true)? + self.flush_bit_packed_run(true) } - return Ok(()); + return; } self.bit_packed_count += self.num_buffered_values; @@ -305,12 +284,11 @@ impl RleEncoder { // We've reached the maximum value that can be hold in a single bit-packed // run. 
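`flush_rle_run` and `flush_bit_packed_run` above emit the two run kinds of the RLE/bit-packing hybrid: a varint header whose low bit selects the kind, with bit-packed runs capped at `MAX_GROUPS_PER_BIT_PACKED_RUN` (64 groups of 8, i.e. 512 values). The header scheme, as a sketch:

    // RLE run:        header = run_length << 1        (low bit 0), then the
    //                 repeated value in ceil(bit_width / 8) bytes.
    // Bit-packed run: header = (num_groups << 1) | 1  (low bit 1), then
    //                 num_groups * 8 values, bit-packed.
    fn rle_header(run_length: u64) -> u64 {
        run_length << 1
    }

    fn bit_packed_header(num_groups: u64) -> u64 {
        (num_groups << 1) | 1
    }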
assert!(self.indicator_byte_pos >= 0); - self.flush_bit_packed_run(true)?; + self.flush_bit_packed_run(true); } else { - self.flush_bit_packed_run(false)?; + self.flush_bit_packed_run(false); } self.repeat_count = 0; - Ok(()) } } @@ -443,7 +421,8 @@ impl RleDecoder { let mut values_skipped = 0; while values_skipped < num_values { if self.rle_left > 0 { - let num_values = cmp::min(num_values - values_skipped, self.rle_left as usize); + let num_values = + cmp::min(num_values - values_skipped, self.rle_left as usize); self.rle_left -= num_values as u32; values_skipped += num_values; } else if self.bit_packed_left > 0 { @@ -452,10 +431,7 @@ impl RleDecoder { let bit_reader = self.bit_reader.as_mut().expect("bit_reader should be set"); - num_values = bit_reader.skip( - num_values, - self.bit_width as usize, - ); + num_values = bit_reader.skip(num_values, self.bit_width as usize); if num_values == 0 { // Handle writers which truncate the final block self.bit_packed_left = 0; @@ -587,7 +563,9 @@ mod tests { assert_eq!(skipped, 2); let mut buffer = vec![0; 6]; - let remaining = decoder.get_batch::(&mut buffer).expect("getting remaining"); + let remaining = decoder + .get_batch::(&mut buffer) + .expect("getting remaining"); assert_eq!(remaining, 6); assert_eq!(buffer, expected); } @@ -598,11 +576,11 @@ mod tests { let mut encoder1 = RleEncoder::new(3, 256); let mut encoder2 = RleEncoder::new(3, 256); for value in data { - encoder1.put(value as u64).unwrap(); - encoder2.put(value as u64).unwrap(); + encoder1.put(value as u64); + encoder2.put(value as u64); } - let res1 = encoder1.flush_buffer().unwrap(); - let res2 = encoder2.consume().unwrap(); + let res1 = encoder1.flush_buffer(); + let res2 = encoder2.consume(); assert_eq!(res1, &res2[..]); } @@ -671,7 +649,9 @@ mod tests { let skipped = decoder.skip(50).expect("skipping first 50"); assert_eq!(skipped, 50); - let remainder = decoder.get_batch::(&mut buffer).expect("getting remaining 50"); + let remainder = decoder + .get_batch::(&mut buffer) + .expect("getting remaining 50"); assert_eq!(remainder, 50); assert_eq!(buffer, expected); @@ -687,7 +667,9 @@ mod tests { } let skipped = decoder.skip(50).expect("skipping first 50"); assert_eq!(skipped, 50); - let remainder = decoder.get_batch::(&mut buffer).expect("getting remaining 50"); + let remainder = decoder + .get_batch::(&mut buffer) + .expect("getting remaining 50"); assert_eq!(remainder, 50); assert_eq!(buffer, expected); } @@ -739,7 +721,9 @@ mod tests { let expected = vec![10, 20, 20, 20, 20, 30, 30, 30, 30, 30]; let skipped = decoder.skip(2).expect("skipping two values"); assert_eq!(skipped, 2); - let remainder = decoder.get_batch_with_dict::(&dict, &mut buffer, 10).expect("getting remainder"); + let remainder = decoder + .get_batch_with_dict::(&dict, &mut buffer, 10) + .expect("getting remainder"); assert_eq!(remainder, 10); assert_eq!(buffer, expected); @@ -751,17 +735,12 @@ mod tests { let mut decoder: RleDecoder = RleDecoder::new(3); decoder.set_data(data); let mut buffer = vec![""; 8]; - let expected = vec![ - "eee", "fff", "ddd", "eee", "fff", "eee", "fff", - "fff", - ]; + let expected = vec!["eee", "fff", "ddd", "eee", "fff", "eee", "fff", "fff"]; let skipped = decoder.skip(4).expect("skipping four values"); assert_eq!(skipped, 4); - let remainder = decoder.get_batch_with_dict::<&str>( - dict.as_slice(), - buffer.as_mut_slice(), - 8, - ).expect("getting remainder"); + let remainder = decoder + .get_batch_with_dict::<&str>(dict.as_slice(), buffer.as_mut_slice(), 8) + .expect("getting 
remainder"); assert_eq!(remainder, 8); assert_eq!(buffer, expected); } @@ -775,10 +754,9 @@ mod tests { let buffer_len = 64 * 1024; let mut encoder = RleEncoder::new(bit_width, buffer_len); for v in values { - let result = encoder.put(*v as u64); - assert!(result.is_ok()); + encoder.put(*v as u64) } - let buffer = ByteBufferPtr::new(encoder.consume().expect("Expect consume() OK")); + let buffer = ByteBufferPtr::new(encoder.consume()); if expected_len != -1 { assert_eq!(buffer.len(), expected_len as usize); } @@ -931,9 +909,9 @@ mod tests { let values: Vec = vec![0, 1, 1, 1, 1, 0, 0, 0, 0, 1]; let mut encoder = RleEncoder::new(bit_width, buffer_len); for v in &values { - assert!(encoder.put(*v as u64).expect("put() should be OK")); + encoder.put(*v as u64) } - let buffer = encoder.consume().expect("consume() should be OK"); + let buffer = encoder.consume(); let mut decoder = RleDecoder::new(bit_width); decoder.set_data(ByteBufferPtr::new(buffer)); let mut actual_values: Vec = vec![0; values.len()]; @@ -947,12 +925,10 @@ mod tests { let buffer_len = 64 * 1024; let mut encoder = RleEncoder::new(bit_width, buffer_len); for v in values { - let result = encoder.put(*v as u64).expect("put() should be OK"); - assert!(result, "put() should not return false"); + encoder.put(*v as u64) } - let buffer = - ByteBufferPtr::new(encoder.consume().expect("consume() should be OK")); + let buffer = ByteBufferPtr::new(encoder.consume()); // Verify read let mut decoder = RleDecoder::new(bit_width); diff --git a/parquet/src/errors.rs b/parquet/src/errors.rs index c2fb5bd66cf9..c4f5faaaacae 100644 --- a/parquet/src/errors.rs +++ b/parquet/src/errors.rs @@ -22,7 +22,7 @@ use std::{cell, io, result, str}; #[cfg(any(feature = "arrow", test))] use arrow::error::ArrowError; -#[derive(Debug, PartialEq, Clone)] +#[derive(Debug, PartialEq, Clone, Eq)] pub enum ParquetError { /// General Parquet error. /// Returned when code violates normal workflow of working with Parquet files. @@ -148,8 +148,8 @@ macro_rules! arrow_err { // Convert parquet error into other errors #[cfg(any(feature = "arrow", test))] -impl Into for ParquetError { - fn into(self) -> ArrowError { - ArrowError::ParquetError(format!("{}", self)) +impl From for ArrowError { + fn from(p: ParquetError) -> Self { + Self::ParquetError(format!("{}", p)) } } diff --git a/parquet/src/file/metadata.rs b/parquet/src/file/metadata.rs index 58eaf7a8c875..018dd95d9f35 100644 --- a/parquet/src/file/metadata.rs +++ b/parquet/src/file/metadata.rs @@ -834,6 +834,12 @@ pub struct ColumnIndexBuilder { valid: bool, } +impl Default for ColumnIndexBuilder { + fn default() -> Self { + Self::new() + } +} + impl ColumnIndexBuilder { pub fn new() -> Self { ColumnIndexBuilder { @@ -887,6 +893,12 @@ pub struct OffsetIndexBuilder { current_first_row_index: i64, } +impl Default for OffsetIndexBuilder { + fn default() -> Self { + Self::new() + } +} + impl OffsetIndexBuilder { pub fn new() -> Self { OffsetIndexBuilder { diff --git a/parquet/src/file/page_encoding_stats.rs b/parquet/src/file/page_encoding_stats.rs index 3180c7820802..e499a094ae00 100644 --- a/parquet/src/file/page_encoding_stats.rs +++ b/parquet/src/file/page_encoding_stats.rs @@ -21,7 +21,7 @@ use parquet_format::{ }; /// PageEncodingStats for a column chunk and data page. -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Debug, PartialEq, Eq)] pub struct PageEncodingStats { /// the page type (data/dic/...) 
pub page_type: PageType, diff --git a/parquet/src/file/page_index/index.rs b/parquet/src/file/page_index/index.rs index 45381234c027..f29b80accae2 100644 --- a/parquet/src/file/page_index/index.rs +++ b/parquet/src/file/page_index/index.rs @@ -47,6 +47,7 @@ impl<T> PageIndex<T> { } #[derive(Debug, Clone, PartialEq)] +#[allow(non_camel_case_types)] pub enum Index { /// Sometimes reading page index from parquet file /// will only return pageLocations without min_max index, diff --git a/parquet/src/file/page_index/mod.rs b/parquet/src/file/page_index/mod.rs index fc87ef20448f..bb7808f16487 100644 --- a/parquet/src/file/page_index/mod.rs +++ b/parquet/src/file/page_index/mod.rs @@ -17,4 +17,6 @@ pub mod index; pub mod index_reader; + +#[cfg(test)] pub(crate) mod range; diff --git a/parquet/src/file/page_index/range.rs b/parquet/src/file/page_index/range.rs index 06c06553ccd5..e9741ec8e7fd 100644 --- a/parquet/src/file/page_index/range.rs +++ b/parquet/src/file/page_index/range.rs @@ -213,6 +213,7 @@ impl RowRanges { result } + #[allow(unused)] pub fn row_count(&self) -> usize { self.ranges.iter().map(|x| x.count()).sum() } diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs index 9ca7c4daa597..57dae323d892 100644 --- a/parquet/src/file/properties.rs +++ b/parquet/src/file/properties.rs @@ -68,7 +68,8 @@ const DEFAULT_CREATED_BY: &str = env!("PARQUET_CREATED_BY"); /// Parquet writer version. /// /// Basic constant, which is not part of the Thrift definition. -#[derive(Debug, Clone, Copy, PartialEq)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[allow(non_camel_case_types)] pub enum WriterVersion { PARQUET_1_0, PARQUET_2_0, @@ -360,7 +361,7 @@ impl WriterPropertiesBuilder { fn get_mut_props(&mut self, col: ColumnPath) -> &mut ColumnProperties { self.column_properties .entry(col) - .or_insert(ColumnProperties::new()) + .or_insert_with(ColumnProperties::new) } /// Sets encoding for a column. diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs index 766813f11aee..0b7451f4bea7 100644 --- a/parquet/src/file/serialized_reader.rs +++ b/parquet/src/file/serialized_reader.rs @@ -39,8 +39,7 @@ use crate::util::{io::TryClone, memory::ByteBufferPtr}; // export `SliceableCursor` and `FileSource` publically so clients can // re-use the logic in their own ParquetFileWriter wrappers -#[allow(deprecated)] -pub use crate::util::{cursor::SliceableCursor, io::FileSource}; +pub use crate::util::io::FileSource; // ---------------------------------------------------------------------- // Implementations of traits facilitating the creation of a new reader @@ -86,22 +85,6 @@ impl ChunkReader for Bytes { } } -#[allow(deprecated)] -impl Length for SliceableCursor { - fn len(&self) -> u64 { - SliceableCursor::len(self) - } -} - -#[allow(deprecated)] -impl ChunkReader for SliceableCursor { - type T = SliceableCursor; - - fn get_read(&self, start: u64, length: usize) -> Result<Self::T> { - self.slice(start, length).map_err(|e| e.into()) - } -} - impl TryFrom<File> for SerializedFileReader<File> { type Error = ParquetError; @@ -155,29 +138,29 @@ pub struct SerializedFileReader<R: ChunkReader> { metadata: ParquetMetaData, } +/// A predicate for filtering row groups, invoked with the metadata and index +/// of each row group in the file. Only row groups for which the predicate +/// evaluates to `true` will be scanned +pub type ReadGroupPredicate = Box<dyn FnMut(&RowGroupMetaData, usize) -> bool>; + /// A builder for [`ReadOptions`].
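Naming the boxed closure type `ReadGroupPredicate` keeps the builder signature below readable. A hedged usage sketch for the builder (the file name is hypothetical, and `new_with_options` is assumed to be the `SerializedFileReader` constructor that accepts `ReadOptions`):

    use std::fs::File;
    use parquet::file::reader::SerializedFileReader;
    use parquet::file::serialized_reader::ReadOptionsBuilder;

    fn main() -> Result<(), Box<dyn std::error::Error>> {
        let file = File::open("data.parquet")?; // hypothetical input
        // Multiple predicates are AND-ed together; this keeps only
        // even-indexed, non-empty row groups.
        let options = ReadOptionsBuilder::new()
            .with_predicate(Box::new(|md, i| i % 2 == 0 && md.num_rows() > 0))
            .build();
        let _reader = SerializedFileReader::new_with_options(file, options)?;
        Ok(())
    }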
/// For the predicates that are added to the builder, /// they will be chained using 'AND' to filter the row groups. +#[derive(Default)] pub struct ReadOptionsBuilder { - predicates: Vec bool>>, + predicates: Vec, enable_page_index: bool, } impl ReadOptionsBuilder { /// New builder pub fn new() -> Self { - ReadOptionsBuilder { - predicates: vec![], - enable_page_index: false, - } + Self::default() } /// Add a predicate on row group metadata to the reading option, /// Filter only row groups that match the predicate criteria - pub fn with_predicate( - mut self, - predicate: Box bool>, - ) -> Self { + pub fn with_predicate(mut self, predicate: ReadGroupPredicate) -> Self { self.predicates.push(predicate); self } @@ -214,7 +197,7 @@ impl ReadOptionsBuilder { /// Currently, only predicates on row group metadata are supported. /// All predicates will be chained using 'AND' to filter the row groups. pub struct ReadOptions { - predicates: Vec bool>>, + predicates: Vec, enable_page_index: bool, } @@ -709,7 +692,7 @@ mod tests { use crate::record::RowAccessor; use crate::schema::parser::parse_message_type; use crate::util::bit_util::from_le_slice; - use crate::util::test_common::{get_test_file, get_test_path}; + use crate::util::test_common::file_util::{get_test_file, get_test_path}; use parquet_format::BoundaryOrder; use std::sync::Arc; @@ -1512,7 +1495,9 @@ mod tests { if i != 351 { assert!((meta.num_rows == 21) || (meta.num_rows == 20)); } else { - assert_eq!(meta.num_rows, 11); + // last page first row index is 7290, total row count is 7300 + // because first row start with zero, last page row count should be 10. + assert_eq!(meta.num_rows, 10); } assert!(!meta.is_dict); vec.push(meta); diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs index 467273aaab9d..863ccf85468d 100644 --- a/parquet/src/file/writer.rs +++ b/parquet/src/file/writer.rs @@ -40,7 +40,6 @@ use crate::file::{ use crate::schema::types::{ self, ColumnDescPtr, SchemaDescPtr, SchemaDescriptor, TypePtr, }; -use crate::util::io::TryClone; /// A wrapper around a [`Write`] that keeps track of the number /// of bytes that have been written @@ -109,11 +108,6 @@ pub type OnCloseRowGroup<'a> = Box< + 'a, >; -#[deprecated = "use std::io::Write"] -pub trait ParquetWriter: Write + std::io::Seek + TryClone {} -#[allow(deprecated)] -impl ParquetWriter for T {} - // ---------------------------------------------------------------------- // Serialized impl for file & row group writers diff --git a/parquet/src/lib.rs b/parquet/src/lib.rs index d4eaaf41686a..90fe399e78d7 100644 --- a/parquet/src/lib.rs +++ b/parquet/src/lib.rs @@ -19,6 +19,9 @@ //! [Apache Parquet](https://parquet.apache.org/), part of //! the [Apache Arrow](https://arrow.apache.org/) project. //! +//! Please see the [parquet crates.io](https://crates.io/crates/parquet) +//! page for feature flags and tips to improve performance. +//! //! # Getting Started //! Start with some examples: //! @@ -30,14 +33,6 @@ //! //! 3. [arrow::async_reader] for `async` reading and writing parquet //! files to Arrow `RecordBatch`es (requires the `async` feature). 
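Switching `errors.rs` from a hand-written `Into` to `From<ParquetError> for ArrowError` (clippy's `from_over_into`) also means `?` now performs the conversion automatically in functions that return arrow errors. A small sketch:

    use arrow::error::ArrowError;
    use parquet::errors::ParquetError;

    fn parquet_op() -> Result<(), ParquetError> {
        Err(ParquetError::General("example failure".to_string()))
    }

    // With the From impl in place, `?` converts ParquetError -> ArrowError.
    fn arrow_op() -> Result<(), ArrowError> {
        parquet_op()?;
        Ok(())
    }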
diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs
index 467273aaab9d..863ccf85468d 100644
--- a/parquet/src/file/writer.rs
+++ b/parquet/src/file/writer.rs
@@ -40,7 +40,6 @@ use crate::file::{
 use crate::schema::types::{
     self, ColumnDescPtr, SchemaDescPtr, SchemaDescriptor, TypePtr,
 };
-use crate::util::io::TryClone;
 
 /// A wrapper around a [`Write`] that keeps track of the number
 /// of bytes that have been written
@@ -109,11 +108,6 @@ pub type OnCloseRowGroup<'a> = Box<
         + 'a,
 >;
 
-#[deprecated = "use std::io::Write"]
-pub trait ParquetWriter: Write + std::io::Seek + TryClone {}
-#[allow(deprecated)]
-impl<T: Write + std::io::Seek + TryClone> ParquetWriter for T {}
-
 // ----------------------------------------------------------------------
 // Serialized impl for file & row group writers
diff --git a/parquet/src/lib.rs b/parquet/src/lib.rs
index d4eaaf41686a..90fe399e78d7 100644
--- a/parquet/src/lib.rs
+++ b/parquet/src/lib.rs
@@ -19,6 +19,9 @@
 //! [Apache Parquet](https://parquet.apache.org/), part of
 //! the [Apache Arrow](https://arrow.apache.org/) project.
 //!
+//! Please see the [parquet crates.io](https://crates.io/crates/parquet)
+//! page for feature flags and tips to improve performance.
+//!
 //! # Getting Started
 //! Start with some examples:
 //!
@@ -30,14 +33,6 @@
 //!
 //! 3. [arrow::async_reader] for `async` reading and writing parquet
 //! files to Arrow `RecordBatch`es (requires the `async` feature).
-#![allow(dead_code)]
-#![allow(non_camel_case_types)]
-#![allow(
-    clippy::from_over_into,
-    clippy::new_without_default,
-    clippy::or_fun_call,
-    clippy::too_many_arguments
-)]
 
 /// Defines an item with an experimental public API
 ///
diff --git a/parquet/src/record/api.rs b/parquet/src/record/api.rs
index 0a360fd29648..7e1c484bf881 100644
--- a/parquet/src/record/api.rs
+++ b/parquet/src/record/api.rs
@@ -27,7 +27,7 @@ use crate::data_type::{ByteArray, Decimal, Int96};
 use crate::errors::{ParquetError, Result};
 use crate::schema::types::ColumnDescPtr;
 
-#[cfg(any(feature = "cli", test))]
+#[cfg(any(feature = "json", test))]
 use serde_json::Value;
 
 /// Macro as a shortcut to generate 'not yet implemented' panic error.
@@ -79,7 +79,7 @@ impl Row {
         }
     }
 
-    #[cfg(any(feature = "cli", test))]
+    #[cfg(any(feature = "json", test))]
     pub fn to_json_value(&self) -> Value {
         Value::Object(
             self.fields
@@ -667,7 +667,7 @@ impl Field {
         }
     }
 
-    #[cfg(any(feature = "cli", test))]
+    #[cfg(any(feature = "json", test))]
     pub fn to_json_value(&self) -> Value {
         match &self {
             Field::Null => Value::Null,
@@ -1685,7 +1685,6 @@ mod tests {
     }
 
     #[test]
-    #[cfg(any(feature = "cli", test))]
     fn test_to_json_value() {
         assert_eq!(Field::Null.to_json_value(), Value::Null);
         assert_eq!(Field::Bool(true).to_json_value(), Value::Bool(true));
diff --git a/parquet/src/record/reader.rs b/parquet/src/record/reader.rs
index 05b63661f09b..0b7e04587354 100644
--- a/parquet/src/record/reader.rs
+++ b/parquet/src/record/reader.rs
@@ -40,6 +40,12 @@ pub struct TreeBuilder {
     batch_size: usize,
 }
 
+impl Default for TreeBuilder {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
 impl TreeBuilder {
     /// Creates new tree builder with default parameters.
     pub fn new() -> Self {
@@ -822,7 +828,7 @@ mod tests {
     use crate::file::reader::{FileReader, SerializedFileReader};
     use crate::record::api::{Field, Row, RowAccessor, RowFormatter};
     use crate::schema::parser::parse_message_type;
-    use crate::util::test_common::{get_test_file, get_test_path};
+    use crate::util::test_common::file_util::{get_test_file, get_test_path};
 
     use std::convert::TryFrom;
 
     // Convenient macros to assemble row, list, map, and group.
diff --git a/parquet/src/record/triplet.rs b/parquet/src/record/triplet.rs
index de566a122e20..5a7e2a0ca74e 100644
--- a/parquet/src/record/triplet.rs
+++ b/parquet/src/record/triplet.rs
@@ -363,7 +363,7 @@ mod tests {
 
     use crate::file::reader::{FileReader, SerializedFileReader};
     use crate::schema::types::ColumnPath;
-    use crate::util::test_common::get_test_file;
+    use crate::util::test_common::file_util::get_test_file;
 
     #[test]
     #[should_panic(expected = "Expected positive batch size, found: 0")]
diff --git a/parquet/src/schema/types.rs b/parquet/src/schema/types.rs
index 8d624fe3d185..823803167ca1 100644
--- a/parquet/src/schema/types.rs
+++ b/parquet/src/schema/types.rs
@@ -593,7 +593,7 @@ impl<'a> GroupTypeBuilder<'a> {
 
 /// Basic type info. This contains information such as the name of the type,
 /// the repetition level, the logical type and the kind of the type (group, primitive).
-#[derive(Clone, Debug, PartialEq)]
+#[derive(Clone, Debug, PartialEq, Eq)]
 pub struct BasicTypeInfo {
     name: String,
     repetition: Option<Repetition>,
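Since the row-to-JSON conversion now sits behind the renamed `json` feature (previously `cli`, see the record/api.rs hunk above), here is a brief sketch of what it enables; the file path is illustrative, and the crate must be built with `--features json`.

```rust
use std::fs::File;

use parquet::file::reader::{FileReader, SerializedFileReader};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let reader = SerializedFileReader::new(File::open("data.parquet")?)?; // hypothetical file

    // Each record::Row converts to a serde_json::Value via to_json_value().
    for row in reader.get_row_iter(None)? {
        println!("{}", row.to_json_value());
    }
    Ok(())
}
```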
diff --git a/parquet/src/util/bit_pack.rs b/parquet/src/util/bit_pack.rs
new file mode 100644
index 000000000000..8cea20de2539
--- /dev/null
+++ b/parquet/src/util/bit_pack.rs
@@ -0,0 +1,137 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Vectorised bit-packing utilities
+
+/// Macro that generates an unpack function taking the number of bits as a const generic
+macro_rules! unpack_impl {
+    ($t:ty, $bytes:literal, $bits:tt) => {
+        pub fn unpack<const NUM_BITS: usize>(input: &[u8], output: &mut [$t; $bits]) {
+            if NUM_BITS == 0 {
+                for out in output {
+                    *out = 0;
+                }
+                return;
+            }
+
+            assert!(NUM_BITS <= $bytes * 8);
+
+            let mask = match NUM_BITS {
+                $bits => <$t>::MAX,
+                _ => ((1 << NUM_BITS) - 1),
+            };
+
+            assert!(input.len() >= NUM_BITS * $bytes);
+
+            let r = |output_idx: usize| {
+                <$t>::from_le_bytes(
+                    input[output_idx * $bytes..output_idx * $bytes + $bytes]
+                        .try_into()
+                        .unwrap(),
+                )
+            };
+
+            seq_macro::seq!(i in 0..$bits {
+                let start_bit = i * NUM_BITS;
+                let end_bit = start_bit + NUM_BITS;
+
+                let start_bit_offset = start_bit % $bits;
+                let end_bit_offset = end_bit % $bits;
+                let start_byte = start_bit / $bits;
+                let end_byte = end_bit / $bits;
+
+                if start_byte != end_byte && end_bit_offset != 0 {
+                    let val = r(start_byte);
+                    let a = val >> start_bit_offset;
+
+                    let val = r(end_byte);
+                    let b = val << (NUM_BITS - end_bit_offset);
+
+                    output[i] = a | (b & mask);
+                } else {
+                    let val = r(start_byte);
+                    output[i] = (val >> start_bit_offset) & mask;
+                }
+            });
+        }
+    };
+}
+
+/// Macro that generates unpack functions that accept num_bits as a parameter
+macro_rules! unpack {
+    ($name:ident, $t:ty, $bytes:literal, $bits:tt) => {
+        mod $name {
+            unpack_impl!($t, $bytes, $bits);
+        }
+
+        /// Unpack packed `input` into `output` with a bit width of `num_bits`
+        pub fn $name(input: &[u8], output: &mut [$t; $bits], num_bits: usize) {
+            // This will get optimised into a jump table
+            seq_macro::seq!(i in 0..=$bits {
+                if i == num_bits {
+                    return $name::unpack::<i>(input, output);
+                }
+            });
+            unreachable!("invalid num_bits {}", num_bits);
+        }
+    };
+}
+
+unpack!(unpack8, u8, 1, 8);
+unpack!(unpack16, u16, 2, 16);
+unpack!(unpack32, u32, 4, 32);
+unpack!(unpack64, u64, 8, 64);
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_basic() {
+        let input = [0xFF; 4096];
+
+        for i in 0..=8 {
+            let mut output = [0; 8];
+            unpack8(&input, &mut output, i);
+            for (idx, out) in output.iter().enumerate() {
+                assert_eq!(out.trailing_ones() as usize, i, "out[{}] = {}", idx, out);
+            }
+        }
+
+        for i in 0..=16 {
+            let mut output = [0; 16];
+            unpack16(&input, &mut output, i);
+            for (idx, out) in output.iter().enumerate() {
+                assert_eq!(out.trailing_ones() as usize, i, "out[{}] = {}", idx, out);
+            }
+        }
+
+        for i in 0..=32 {
+            let mut output = [0; 32];
+            unpack32(&input, &mut output, i);
+            for (idx, out) in output.iter().enumerate() {
+                assert_eq!(out.trailing_ones() as usize, i, "out[{}] = {}", idx, out);
+            }
+        }
+
+        for i in 0..=64 {
+            let mut output = [0; 64];
+            unpack64(&input, &mut output, i);
+            for (idx, out) in output.iter().enumerate() {
+                assert_eq!(out.trailing_ones() as usize, i, "out[{}] = {}", idx, out);
+            }
+        }
+    }
+}
diff --git a/parquet/src/util/bit_packing.rs b/parquet/src/util/bit_packing.rs
deleted file mode 100644
index 758992ab2723..000000000000
--- a/parquet/src/util/bit_packing.rs
+++ /dev/null
@@ -1,3662 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-/// Unpack 32 values with bit width `num_bits` from `in_ptr`, and write to `out_ptr`.
-/// Return the `in_ptr` where the starting offset points to the first byte after all the
-/// bytes that were consumed.
-// TODO: may be better to make these more compact using if-else conditions.
-// However, this may require const generics:
-// https://github.com/rust-lang/rust/issues/44580
-// to eliminate the branching cost.
-// TODO: we should use SIMD instructions to further optimize this. I have explored
-// https://github.com/tantivy-search/bitpacking
-// but the layout it uses for SIMD is different from Parquet.
-// TODO: support packing as well, which is used for encoding.
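The deleted file below hand-unrolled one unsafe, pointer-based function per bit width; the new bit_pack.rs above replaces all of them with a single safe, const-generic routine dispatched through a seq!-generated jump table on the runtime `num_bits`. As a worked example of the layout it decodes (the byte values are hand-computed for illustration, not taken from this PR): packing the values [1, 2, 3, 4, 5, 6, 7, 0] at 3 bits each, least-significant bit first, yields the bytes 0xD1, 0x58, 0x1F, and the generated `unpack8` wrapper recovers them.

```rust
// A test-style sketch against the new module above; `unpack8` is the
// wrapper generated by `unpack!(unpack8, u8, 1, 8)`.
#[test]
fn unpack8_3bit_example() {
    // 8 values * 3 bits = 24 bits = 3 bytes, packed LSB-first:
    // 0xD1 = 0b1101_0001 -> v0 = 001, v1 = 010, low two bits of v2 = 11
    let packed = [0xD1, 0x58, 0x1F];
    let mut output = [0u8; 8];

    // `num_bits` is a runtime argument; the jump table dispatches to unpack::<3>.
    unpack8(&packed, &mut output, 3);
    assert_eq!(output, [1, 2, 3, 4, 5, 6, 7, 0]);
}
```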
-pub unsafe fn unpack32( - mut in_ptr: *const u32, - out_ptr: *mut u32, - num_bits: usize, -) -> *const u32 { - in_ptr = match num_bits { - 0 => nullunpacker32(in_ptr, out_ptr), - 1 => unpack1_32(in_ptr, out_ptr), - 2 => unpack2_32(in_ptr, out_ptr), - 3 => unpack3_32(in_ptr, out_ptr), - 4 => unpack4_32(in_ptr, out_ptr), - 5 => unpack5_32(in_ptr, out_ptr), - 6 => unpack6_32(in_ptr, out_ptr), - 7 => unpack7_32(in_ptr, out_ptr), - 8 => unpack8_32(in_ptr, out_ptr), - 9 => unpack9_32(in_ptr, out_ptr), - 10 => unpack10_32(in_ptr, out_ptr), - 11 => unpack11_32(in_ptr, out_ptr), - 12 => unpack12_32(in_ptr, out_ptr), - 13 => unpack13_32(in_ptr, out_ptr), - 14 => unpack14_32(in_ptr, out_ptr), - 15 => unpack15_32(in_ptr, out_ptr), - 16 => unpack16_32(in_ptr, out_ptr), - 17 => unpack17_32(in_ptr, out_ptr), - 18 => unpack18_32(in_ptr, out_ptr), - 19 => unpack19_32(in_ptr, out_ptr), - 20 => unpack20_32(in_ptr, out_ptr), - 21 => unpack21_32(in_ptr, out_ptr), - 22 => unpack22_32(in_ptr, out_ptr), - 23 => unpack23_32(in_ptr, out_ptr), - 24 => unpack24_32(in_ptr, out_ptr), - 25 => unpack25_32(in_ptr, out_ptr), - 26 => unpack26_32(in_ptr, out_ptr), - 27 => unpack27_32(in_ptr, out_ptr), - 28 => unpack28_32(in_ptr, out_ptr), - 29 => unpack29_32(in_ptr, out_ptr), - 30 => unpack30_32(in_ptr, out_ptr), - 31 => unpack31_32(in_ptr, out_ptr), - 32 => unpack32_32(in_ptr, out_ptr), - _ => unimplemented!(), - }; - in_ptr -} - -unsafe fn nullunpacker32(in_buf: *const u32, mut out: *mut u32) -> *const u32 { - for _ in 0..32 { - *out = 0; - out = out.offset(1); - } - in_buf -} - -unsafe fn unpack1_32(in_buf: *const u32, mut out: *mut u32) -> *const u32 { - *out = (in_buf.read_unaligned()) & 1; - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 1) & 1; - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 2) & 1; - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 3) & 1; - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 4) & 1; - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 5) & 1; - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 6) & 1; - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 7) & 1; - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 8) & 1; - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 9) & 1; - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 10) & 1; - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 11) & 1; - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 12) & 1; - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 13) & 1; - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 14) & 1; - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 15) & 1; - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 16) & 1; - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 17) & 1; - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 18) & 1; - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 19) & 1; - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 20) & 1; - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 21) & 1; - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 22) & 1; - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 23) & 1; - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 24) & 1; - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 25) & 1; - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 26) & 1; - out = out.offset(1); - 
*out = ((in_buf.read_unaligned()) >> 27) & 1; - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 28) & 1; - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 29) & 1; - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 30) & 1; - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 31; - - in_buf.offset(1) -} - -unsafe fn unpack2_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { - *out = (in_buf.read_unaligned()) % (1u32 << 2); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 2); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 2); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 6) % (1u32 << 2); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 2); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 10) % (1u32 << 2); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 12) % (1u32 << 2); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 14) % (1u32 << 2); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 2); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 18) % (1u32 << 2); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 20) % (1u32 << 2); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 22) % (1u32 << 2); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 24) % (1u32 << 2); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 26) % (1u32 << 2); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 28) % (1u32 << 2); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 30; - out = out.offset(1); - in_buf = in_buf.offset(1); - *out = (in_buf.read_unaligned()) % (1u32 << 2); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 2); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 2); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 6) % (1u32 << 2); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 2); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 10) % (1u32 << 2); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 12) % (1u32 << 2); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 14) % (1u32 << 2); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 2); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 18) % (1u32 << 2); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 20) % (1u32 << 2); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 22) % (1u32 << 2); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 24) % (1u32 << 2); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 26) % (1u32 << 2); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 28) % (1u32 << 2); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 30; - - in_buf.offset(1) -} - -unsafe fn unpack3_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { - *out = (in_buf.read_unaligned()) % (1u32 << 3); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 3) % (1u32 << 3); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 6) % (1u32 << 3); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 9) % (1u32 << 3); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 12) % (1u32 << 3); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 15) % (1u32 << 3); - out = out.offset(1); 
- *out = ((in_buf.read_unaligned()) >> 18) % (1u32 << 3); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 21) % (1u32 << 3); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 24) % (1u32 << 3); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 27) % (1u32 << 3); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 30; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 1)) << (3 - 1); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 1) % (1u32 << 3); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 3); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 7) % (1u32 << 3); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 10) % (1u32 << 3); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 13) % (1u32 << 3); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 3); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 19) % (1u32 << 3); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 22) % (1u32 << 3); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 25) % (1u32 << 3); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 28) % (1u32 << 3); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 31; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (3 - 2); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 3); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 5) % (1u32 << 3); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 3); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 11) % (1u32 << 3); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 14) % (1u32 << 3); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 17) % (1u32 << 3); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 20) % (1u32 << 3); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 23) % (1u32 << 3); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 26) % (1u32 << 3); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 29; - - in_buf.offset(1) -} - -unsafe fn unpack4_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { - *out = (in_buf.read_unaligned()) % (1u32 << 4); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 4); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 4); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 12) % (1u32 << 4); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 4); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 20) % (1u32 << 4); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 24) % (1u32 << 4); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 28) % (1u32 << 4); - out = out.offset(1); - in_buf = in_buf.offset(1); - - *out = (in_buf.read_unaligned()) % (1u32 << 4); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 4); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 4); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 12) % (1u32 << 4); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 4); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 20) % (1u32 << 4); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 24) % (1u32 << 4); - out = out.offset(1); - 
*out = ((in_buf.read_unaligned()) >> 28) % (1u32 << 4); - out = out.offset(1); - in_buf = in_buf.offset(1); - - *out = (in_buf.read_unaligned()) % (1u32 << 4); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 4); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 4); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 12) % (1u32 << 4); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 4); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 20) % (1u32 << 4); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 24) % (1u32 << 4); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 28) % (1u32 << 4); - out = out.offset(1); - in_buf = in_buf.offset(1); - - *out = (in_buf.read_unaligned()) % (1u32 << 4); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 4); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 4); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 12) % (1u32 << 4); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 4); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 20) % (1u32 << 4); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 24) % (1u32 << 4); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 28) % (1u32 << 4); - - in_buf.offset(1) -} - -unsafe fn unpack5_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { - *out = (in_buf.read_unaligned()) % (1u32 << 5); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 5) % (1u32 << 5); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 10) % (1u32 << 5); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 15) % (1u32 << 5); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 20) % (1u32 << 5); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 25) % (1u32 << 5); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 30; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 3)) << (5 - 3); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 3) % (1u32 << 5); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 5); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 13) % (1u32 << 5); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 18) % (1u32 << 5); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 23) % (1u32 << 5); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 28) % (1u32 << 5); - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 1)) << (5 - 1); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 1) % (1u32 << 5); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 6) % (1u32 << 5); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 11) % (1u32 << 5); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 5); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 21) % (1u32 << 5); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 26) % (1u32 << 5); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 31; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (5 - 4); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 5); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 9) % (1u32 << 5); - out = out.offset(1); - *out = 
((in_buf.read_unaligned()) >> 14) % (1u32 << 5); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 19) % (1u32 << 5); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 24) % (1u32 << 5); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 29; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (5 - 2); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 5); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 7) % (1u32 << 5); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 12) % (1u32 << 5); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 17) % (1u32 << 5); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 22) % (1u32 << 5); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 27; - - in_buf.offset(1) -} - -unsafe fn unpack6_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { - *out = (in_buf.read_unaligned()) % (1u32 << 6); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 6) % (1u32 << 6); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 12) % (1u32 << 6); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 18) % (1u32 << 6); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 24) % (1u32 << 6); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 30; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (6 - 4); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 6); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 10) % (1u32 << 6); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 6); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 22) % (1u32 << 6); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 28; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (6 - 2); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 6); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 6); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 14) % (1u32 << 6); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 20) % (1u32 << 6); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 26; - in_buf = in_buf.offset(1); - out = out.offset(1); - - *out = (in_buf.read_unaligned()) % (1u32 << 6); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 6) % (1u32 << 6); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 12) % (1u32 << 6); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 18) % (1u32 << 6); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 24) % (1u32 << 6); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 30; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (6 - 4); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 6); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 10) % (1u32 << 6); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 6); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 22) % (1u32 << 6); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 28; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (6 - 2); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 6); - out = out.offset(1); - *out = 
((in_buf.read_unaligned()) >> 8) % (1u32 << 6); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 14) % (1u32 << 6); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 20) % (1u32 << 6); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 26; - - in_buf.offset(1) -} - -unsafe fn unpack7_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { - *out = (in_buf.read_unaligned()) % (1u32 << 7); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 7) % (1u32 << 7); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 14) % (1u32 << 7); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 21) % (1u32 << 7); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 28; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 3)) << (7 - 3); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 3) % (1u32 << 7); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 10) % (1u32 << 7); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 17) % (1u32 << 7); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 24) % (1u32 << 7); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 31; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 6)) << (7 - 6); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 6) % (1u32 << 7); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 13) % (1u32 << 7); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 20) % (1u32 << 7); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 27; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (7 - 2); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 7); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 9) % (1u32 << 7); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 7); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 23) % (1u32 << 7); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 30; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 5)) << (7 - 5); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 5) % (1u32 << 7); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 12) % (1u32 << 7); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 19) % (1u32 << 7); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 26; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 1)) << (7 - 1); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 1) % (1u32 << 7); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 7); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 15) % (1u32 << 7); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 22) % (1u32 << 7); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 29; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (7 - 4); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 7); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 11) % (1u32 << 7); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 18) % (1u32 << 7); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 25; - - in_buf.offset(1) -} - -unsafe fn unpack8_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { - *out = (in_buf.read_unaligned()) % (1u32 << 8); - out 
= out.offset(1); - *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 8); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 8); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 24; - out = out.offset(1); - in_buf = in_buf.offset(1); - - *out = (in_buf.read_unaligned()) % (1u32 << 8); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 8); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 8); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 24; - out = out.offset(1); - in_buf = in_buf.offset(1); - - *out = (in_buf.read_unaligned()) % (1u32 << 8); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 8); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 8); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 24; - out = out.offset(1); - in_buf = in_buf.offset(1); - - *out = (in_buf.read_unaligned()) % (1u32 << 8); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 8); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 8); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 24; - out = out.offset(1); - in_buf = in_buf.offset(1); - - *out = (in_buf.read_unaligned()) % (1u32 << 8); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 8); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 8); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 24; - out = out.offset(1); - in_buf = in_buf.offset(1); - - *out = (in_buf.read_unaligned()) % (1u32 << 8); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 8); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 8); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 24; - out = out.offset(1); - in_buf = in_buf.offset(1); - - *out = (in_buf.read_unaligned()) % (1u32 << 8); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 8); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 8); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 24; - out = out.offset(1); - in_buf = in_buf.offset(1); - - *out = (in_buf.read_unaligned()) % (1u32 << 8); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 8); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 8); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 24; - - in_buf.offset(1) -} - -unsafe fn unpack9_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { - *out = (in_buf.read_unaligned()) % (1u32 << 9); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 9) % (1u32 << 9); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 18) % (1u32 << 9); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 27; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (9 - 4); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 9); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 13) % (1u32 << 9); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 22) % (1u32 << 9); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 31; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (9 - 8); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 9); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 
17) % (1u32 << 9); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 26; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 3)) << (9 - 3); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 3) % (1u32 << 9); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 12) % (1u32 << 9); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 21) % (1u32 << 9); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 30; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 7)) << (9 - 7); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 7) % (1u32 << 9); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 9); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 25; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (9 - 2); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 9); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 11) % (1u32 << 9); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 20) % (1u32 << 9); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 29; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 6)) << (9 - 6); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 6) % (1u32 << 9); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 15) % (1u32 << 9); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 24; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 1)) << (9 - 1); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 1) % (1u32 << 9); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 10) % (1u32 << 9); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 19) % (1u32 << 9); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 28; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 5)) << (9 - 5); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 5) % (1u32 << 9); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 14) % (1u32 << 9); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 23; - - in_buf.offset(1) -} - -unsafe fn unpack10_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { - *out = (in_buf.read_unaligned()) % (1u32 << 10); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 10) % (1u32 << 10); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 20) % (1u32 << 10); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 30; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (10 - 8); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 10); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 18) % (1u32 << 10); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 28; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 6)) << (10 - 6); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 6) % (1u32 << 10); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 10); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 26; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (10 - 4); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 10); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 14) % (1u32 << 10); - out 
= out.offset(1); - *out = (in_buf.read_unaligned()) >> 24; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (10 - 2); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 10); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 12) % (1u32 << 10); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 22; - in_buf = in_buf.offset(1); - out = out.offset(1); - - *out = (in_buf.read_unaligned()) % (1u32 << 10); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 10) % (1u32 << 10); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 20) % (1u32 << 10); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 30; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (10 - 8); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 10); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 18) % (1u32 << 10); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 28; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 6)) << (10 - 6); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 6) % (1u32 << 10); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 10); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 26; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (10 - 4); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 10); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 14) % (1u32 << 10); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 24; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (10 - 2); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 10); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 12) % (1u32 << 10); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 22; - - in_buf.offset(1) -} - -unsafe fn unpack11_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { - *out = (in_buf.read_unaligned()) % (1u32 << 11); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 11) % (1u32 << 11); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 22; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 1)) << (11 - 1); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 1) % (1u32 << 11); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 12) % (1u32 << 11); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 23; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (11 - 2); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 11); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 13) % (1u32 << 11); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 24; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 3)) << (11 - 3); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 3) % (1u32 << 11); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 14) % (1u32 << 11); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 25; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (11 - 4); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 11); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 15) % (1u32 << 11); - 
out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 26; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 5)) << (11 - 5); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 5) % (1u32 << 11); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 11); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 27; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 6)) << (11 - 6); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 6) % (1u32 << 11); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 17) % (1u32 << 11); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 28; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 7)) << (11 - 7); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 7) % (1u32 << 11); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 18) % (1u32 << 11); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 29; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (11 - 8); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 11); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 19) % (1u32 << 11); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 30; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 9)) << (11 - 9); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 9) % (1u32 << 11); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 20) % (1u32 << 11); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 31; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 10)) << (11 - 10); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 10) % (1u32 << 11); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 21; - - in_buf.offset(1) -} - -unsafe fn unpack12_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { - *out = (in_buf.read_unaligned()) % (1u32 << 12); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 12) % (1u32 << 12); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 24; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (12 - 4); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 12); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 12); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 28; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (12 - 8); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 12); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 20; - in_buf = in_buf.offset(1); - out = out.offset(1); - - *out = (in_buf.read_unaligned()) % (1u32 << 12); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 12) % (1u32 << 12); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 24; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (12 - 4); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 12); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 12); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 28; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (12 - 8); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 12); - 
out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 20; - in_buf = in_buf.offset(1); - out = out.offset(1); - - *out = (in_buf.read_unaligned()) % (1u32 << 12); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 12) % (1u32 << 12); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 24; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (12 - 4); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 12); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 12); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 28; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (12 - 8); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 12); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 20; - in_buf = in_buf.offset(1); - out = out.offset(1); - - *out = (in_buf.read_unaligned()) % (1u32 << 12); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 12) % (1u32 << 12); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 24; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (12 - 4); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 12); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 12); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 28; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (12 - 8); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 12); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 20; - - in_buf.offset(1) -} - -unsafe fn unpack13_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { - *out = (in_buf.read_unaligned()) % (1u32 << 13); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 13) % (1u32 << 13); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 26; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 7)) << (13 - 7); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 7) % (1u32 << 13); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 20; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 1)) << (13 - 1); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 1) % (1u32 << 13); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 14) % (1u32 << 13); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 27; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (13 - 8); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 13); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 21; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (13 - 2); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 13); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 15) % (1u32 << 13); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 28; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 9)) << (13 - 9); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 9) % (1u32 << 13); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 22; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 3)) << (13 - 3); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 3) % (1u32 << 
13); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 13); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 29; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 10)) << (13 - 10); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 10) % (1u32 << 13); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 23; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (13 - 4); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 13); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 17) % (1u32 << 13); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 30; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 11)) << (13 - 11); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 11) % (1u32 << 13); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 24; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 5)) << (13 - 5); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 5) % (1u32 << 13); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 18) % (1u32 << 13); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 31; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (13 - 12); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 12) % (1u32 << 13); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 25; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 6)) << (13 - 6); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 6) % (1u32 << 13); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 19; - - in_buf.offset(1) -} - -unsafe fn unpack14_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { - *out = (in_buf.read_unaligned()) % (1u32 << 14); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 14) % (1u32 << 14); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 28; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 10)) << (14 - 10); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 10) % (1u32 << 14); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 24; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 6)) << (14 - 6); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 6) % (1u32 << 14); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 20; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (14 - 2); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 14); - out = out.offset(1); - *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 14); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 30; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (14 - 12); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 12) % (1u32 << 14); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 26; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (14 - 8); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 14); - out = out.offset(1); - *out = (in_buf.read_unaligned()) >> 22; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (14 - 4); - out = out.offset(1); - - *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 
14);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 18;
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 14);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 14) % (1u32 << 14);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 10)) << (14 - 10);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 10) % (1u32 << 14);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 6)) << (14 - 6);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 6) % (1u32 << 14);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (14 - 2);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 14);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 14);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 30;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (14 - 12);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 12) % (1u32 << 14);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 26;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (14 - 8);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 14);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 22;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (14 - 4);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 14);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 18;
-
-    in_buf.offset(1)
-}
-
-unsafe fn unpack15_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 {
-    *out = (in_buf.read_unaligned()) % (1u32 << 15);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 15) % (1u32 << 15);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 30;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 13)) << (15 - 13);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 13) % (1u32 << 15);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 11)) << (15 - 11);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 11) % (1u32 << 15);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 26;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 9)) << (15 - 9);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 9) % (1u32 << 15);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 7)) << (15 - 7);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 7) % (1u32 << 15);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 22;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 5)) << (15 - 5);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 5) % (1u32 << 15);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 3)) << (15 - 3);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 3) % (1u32 << 15);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 18;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 1)) << (15 - 1);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 1) % (1u32 << 15);
-    out = out.offset(1);
-    *out = ((in_buf.read_unaligned()) >> 16) % (1u32 << 15);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 31;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 14)) << (15 - 14);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 14) % (1u32 << 15);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 29;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (15 - 12);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 12) % (1u32 << 15);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 27;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 10)) << (15 - 10);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 10) % (1u32 << 15);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 25;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (15 - 8);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 15);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 23;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 6)) << (15 - 6);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 6) % (1u32 << 15);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 21;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (15 - 4);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 15);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 19;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (15 - 2);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 15);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 17;
-
-    in_buf.offset(1)
-}
-
-unsafe fn unpack16_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 {
-    *out = (in_buf.read_unaligned()) % (1u32 << 16);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 16;
-    out = out.offset(1);
-    in_buf = in_buf.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 16);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 16;
-    out = out.offset(1);
-    in_buf = in_buf.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 16);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 16;
-    out = out.offset(1);
-    in_buf = in_buf.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 16);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 16;
-    out = out.offset(1);
-    in_buf = in_buf.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 16);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 16;
-    out = out.offset(1);
-    in_buf = in_buf.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 16);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 16;
-    out = out.offset(1);
-    in_buf = in_buf.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 16);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 16;
-    out = out.offset(1);
-    in_buf = in_buf.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 16);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 16;
-    out = out.offset(1);
-    in_buf = in_buf.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 16);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 16;
-    out = out.offset(1);
-    in_buf = in_buf.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 16);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 16;
-    out = out.offset(1);
-    in_buf = in_buf.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 16);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 16;
-    out = out.offset(1);
-    in_buf = in_buf.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 16);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 16;
-    out = out.offset(1);
-    in_buf = in_buf.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 16);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 16;
-    out = out.offset(1);
-    in_buf = in_buf.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 16);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 16;
-    out = out.offset(1);
-    in_buf = in_buf.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 16);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 16;
-    out = out.offset(1);
-    in_buf = in_buf.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 16);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 16;
-
-    in_buf.offset(1)
-}
-
-unsafe fn unpack17_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 {
-    *out = (in_buf.read_unaligned()) % (1u32 << 17);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 17;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (17 - 2);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 17);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 19;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (17 - 4);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 17);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 21;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 6)) << (17 - 6);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 6) % (1u32 << 17);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 23;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (17 - 8);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 17);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 25;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 10)) << (17 - 10);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 10) % (1u32 << 17);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 27;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (17 - 12);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 12) % (1u32 << 17);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 29;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 14)) << (17 - 14);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 14) % (1u32 << 17);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 31;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (17 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 1)) << (17 - 1);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 1) % (1u32 << 17);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 18;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 3)) << (17 - 3);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 3) % (1u32 << 17);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 5)) << (17 - 5);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 5) % (1u32 << 17);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 22;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 7)) << (17 - 7);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 7) % (1u32 << 17);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 9)) << (17 - 9);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 9) % (1u32 << 17);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 26;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 11)) << (17 - 11);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 11) % (1u32 << 17);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 13)) << (17 - 13);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 13) % (1u32 << 17);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 30;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 15)) << (17 - 15);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 15;
-
-    in_buf.offset(1)
-}
-
-unsafe fn unpack18_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 {
-    *out = (in_buf.read_unaligned()) % (1u32 << 18);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 18;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (18 - 4);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 18);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 22;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (18 - 8);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 18);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 26;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (18 - 12);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 12) % (1u32 << 18);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 30;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (18 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (18 - 2);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 18);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 6)) << (18 - 6);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 6) % (1u32 << 18);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 10)) << (18 - 10);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 10) % (1u32 << 18);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 14)) << (18 - 14);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 14;
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 18);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 18;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (18 - 4);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 18);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 22;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (18 - 8);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 18);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 26;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (18 - 12);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 12) % (1u32 << 18);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 30;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (18 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (18 - 2);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 18);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 6)) << (18 - 6);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 6) % (1u32 << 18);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 10)) << (18 - 10);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 10) % (1u32 << 18);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 14)) << (18 - 14);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 14;
-
-    in_buf.offset(1)
-}
-
-unsafe fn unpack19_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 {
-    *out = (in_buf.read_unaligned()) % (1u32 << 19);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 19;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 6)) << (19 - 6);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 6) % (1u32 << 19);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 25;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (19 - 12);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 12) % (1u32 << 19);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 31;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 18)) << (19 - 18);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 18;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 5)) << (19 - 5);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 5) % (1u32 << 19);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 11)) << (19 - 11);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 11) % (1u32 << 19);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 30;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 17)) << (19 - 17);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 17;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (19 - 4);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 19);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 23;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 10)) << (19 - 10);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 10) % (1u32 << 19);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 29;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (19 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 3)) << (19 - 3);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 3) % (1u32 << 19);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 22;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 9)) << (19 - 9);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 9) % (1u32 << 19);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 15)) << (19 - 15);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 15;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (19 - 2);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 19);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 21;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (19 - 8);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 19);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 27;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 14)) << (19 - 14);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 14;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 1)) << (19 - 1);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 1) % (1u32 << 19);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 7)) << (19 - 7);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 7) % (1u32 << 19);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 26;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 13)) << (19 - 13);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 13;
-
-    in_buf.offset(1)
-}
-
-unsafe fn unpack20_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 {
-    *out = (in_buf.read_unaligned()) % (1u32 << 20);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (20 - 8);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 20);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (20 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (20 - 4);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 20);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (20 - 12);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 12;
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 20);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (20 - 8);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 20);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (20 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (20 - 4);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 20);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (20 - 12);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 12;
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 20);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (20 - 8);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 20);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (20 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (20 - 4);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 20);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (20 - 12);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 12;
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 20);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (20 - 8);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 20);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (20 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (20 - 4);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 20);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (20 - 12);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 12;
-
-    in_buf.offset(1)
-}
-
-unsafe fn unpack21_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 {
-    *out = (in_buf.read_unaligned()) % (1u32 << 21);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 21;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 10)) << (21 - 10);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 10) % (1u32 << 21);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 31;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 20)) << (21 - 20);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 9)) << (21 - 9);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 9) % (1u32 << 21);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 30;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 19)) << (21 - 19);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 19;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (21 - 8);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 21);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 29;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 18)) << (21 - 18);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 18;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 7)) << (21 - 7);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 7) % (1u32 << 21);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 17)) << (21 - 17);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 17;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 6)) << (21 - 6);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 6) % (1u32 << 21);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 27;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (21 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 5)) << (21 - 5);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 5) % (1u32 << 21);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 26;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 15)) << (21 - 15);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 15;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (21 - 4);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 21);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 25;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 14)) << (21 - 14);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 14;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 3)) << (21 - 3);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 3) % (1u32 << 21);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 13)) << (21 - 13);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 13;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (21 - 2);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 21);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 23;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (21 - 12);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 12;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 1)) << (21 - 1);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 1) % (1u32 << 21);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 22;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 11)) << (21 - 11);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 11;
-
-    in_buf.offset(1)
-}
-
-unsafe fn unpack22_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 {
-    *out = (in_buf.read_unaligned()) % (1u32 << 22);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 22;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (22 - 12);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 12;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (22 - 2);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 22);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 14)) << (22 - 14);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 14;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (22 - 4);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 22);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 26;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (22 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 6)) << (22 - 6);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 6) % (1u32 << 22);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 18)) << (22 - 18);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 18;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (22 - 8);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 22);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 30;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 20)) << (22 - 20);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 10)) << (22 - 10);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 10;
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 22);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 22;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (22 - 12);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 12;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (22 - 2);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 22);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 14)) << (22 - 14);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 14;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (22 - 4);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 22);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 26;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (22 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 6)) << (22 - 6);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 6) % (1u32 << 22);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 18)) << (22 - 18);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 18;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (22 - 8);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 22);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 30;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 20)) << (22 - 20);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 10)) << (22 - 10);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 10;
-
-    in_buf.offset(1)
-}
-
-unsafe fn unpack23_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 {
-    *out = (in_buf.read_unaligned()) % (1u32 << 23);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 23;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 14)) << (23 - 14);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 14;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 5)) << (23 - 5);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 5) % (1u32 << 23);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 19)) << (23 - 19);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 19;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 10)) << (23 - 10);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 10;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 1)) << (23 - 1);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 1) % (1u32 << 23);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 15)) << (23 - 15);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 15;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 6)) << (23 - 6);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 6) % (1u32 << 23);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 29;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 20)) << (23 - 20);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 11)) << (23 - 11);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 11;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (23 - 2);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 23);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 25;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (23 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 7)) << (23 - 7);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 7) % (1u32 << 23);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 30;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 21)) << (23 - 21);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 21;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (23 - 12);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 12;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 3)) << (23 - 3);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 3) % (1u32 << 23);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 26;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 17)) << (23 - 17);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 17;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (23 - 8);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 8) % (1u32 << 23);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 31;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 22)) << (23 - 22);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 22;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 13)) << (23 - 13);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 13;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (23 - 4);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 23);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 27;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 18)) << (23 - 18);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 18;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 9)) << (23 - 9);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 9;
-
-    in_buf.offset(1)
-}
-
-unsafe fn unpack24_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 {
-    *out = (in_buf.read_unaligned()) % (1u32 << 24);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (24 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (24 - 8);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 8;
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 24);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (24 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (24 - 8);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 8;
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 24);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (24 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (24 - 8);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 8;
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 24);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (24 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (24 - 8);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 8;
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 24);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (24 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (24 - 8);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 8;
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 24);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (24 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (24 - 8);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 8;
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 24);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (24 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (24 - 8);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 8;
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 24);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (24 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (24 - 8);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 8;
-
-    in_buf.offset(1)
-}
-
-unsafe fn unpack25_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 {
-    *out = (in_buf.read_unaligned()) % (1u32 << 25);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 25;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 18)) << (25 - 18);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 18;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 11)) << (25 - 11);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 11;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (25 - 4);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 25);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 29;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 22)) << (25 - 22);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 22;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 15)) << (25 - 15);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 15;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (25 - 8);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 8;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 1)) << (25 - 1);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 1) % (1u32 << 25);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 26;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 19)) << (25 - 19);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 19;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (25 - 12);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 12;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 5)) << (25 - 5);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 5) % (1u32 << 25);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 30;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 23)) << (25 - 23);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 23;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (25 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 9)) << (25 - 9);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 9;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (25 - 2);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 25);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 27;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 20)) << (25 - 20);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 13)) << (25 - 13);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 13;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 6)) << (25 - 6);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 6) % (1u32 << 25);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 31;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 24)) << (25 - 24);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 17)) << (25 - 17);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 17;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 10)) << (25 - 10);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 10;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 3)) << (25 - 3);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 3) % (1u32 << 25);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 21)) << (25 - 21);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 21;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 14)) << (25 - 14);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 14;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 7)) << (25 - 7);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 7;
-
-    in_buf.offset(1)
-}
-
-unsafe fn unpack26_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 {
-    *out = (in_buf.read_unaligned()) % (1u32 << 26);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 26;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 20)) << (26 - 20);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 14)) << (26 - 14);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 14;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (26 - 8);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 8;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (26 - 2);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 26);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 22)) << (26 - 22);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 22;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (26 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 10)) << (26 - 10);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 10;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (26 - 4);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 26);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 30;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 24)) << (26 - 24);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 18)) << (26 - 18);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 18;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (26 - 12);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 12;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 6)) << (26 - 6);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 6;
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 26);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 26;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 20)) << (26 - 20);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 14)) << (26 - 14);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 14;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (26 - 8);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 8;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (26 - 2);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 26);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 22)) << (26 - 22);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 22;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (26 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 10)) << (26 - 10);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 10;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (26 - 4);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 26);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 30;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 24)) << (26 - 24);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 18)) << (26 - 18);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 18;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (26 - 12);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 12;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 6)) << (26 - 6);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 6;
-
-    in_buf.offset(1)
-}
-
-unsafe fn unpack27_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 {
-    *out = (in_buf.read_unaligned()) % (1u32 << 27);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 27;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 22)) << (27 - 22);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 22;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 17)) << (27 - 17);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 17;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (27 - 12);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 12;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 7)) << (27 - 7);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 7;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (27 - 2);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 27);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 29;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 24)) << (27 - 24);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 19)) << (27 - 19);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 19;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 14)) << (27 - 14);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 14;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 9)) << (27 - 9);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 9;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (27 - 4);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 4) % (1u32 << 27);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 31;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 26)) << (27 - 26);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 26;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 21)) << (27 - 21);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 21;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (27 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 11)) << (27 - 11);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 11;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 6)) << (27 - 6);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 6;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 1)) << (27 - 1);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 1) % (1u32 << 27);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 23)) << (27 - 23);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 23;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 18)) << (27 - 18);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 18;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 13)) << (27 - 13);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 13;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (27 - 8);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 8;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 3)) << (27 - 3);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 3) % (1u32 << 27);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 30;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 25)) << (27 - 25);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 25;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 20)) << (27 - 20);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 15)) << (27 - 15);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 15;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 10)) << (27 - 10);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 10;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 5)) << (27 - 5);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 5;
-
-    in_buf.offset(1)
-}
-
-unsafe fn unpack28_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 {
-    *out = (in_buf.read_unaligned()) % (1u32 << 28);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 24)) << (28 - 24);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 20)) << (28 - 20);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (28 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (28 - 12);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 12;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (28 - 8);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 8;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (28 - 4);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 4;
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 28);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 24)) << (28 - 24);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 20)) << (28 - 20);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (28 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (28 - 12);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 12;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (28 - 8);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 8;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (28 - 4);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 4;
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 28);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 24)) << (28 - 24);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 20)) << (28 - 20);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (28 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (28 - 12);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 12;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (28 - 8);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 8;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (28 - 4);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 4;
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 28);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 24)) << (28 - 24);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 20)) << (28 - 20);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (28 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (28 - 12);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 12;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (28 - 8);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 8;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (28 - 4);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 4;
-
-    in_buf.offset(1)
-}
-
-unsafe fn unpack29_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 {
-    *out = (in_buf.read_unaligned()) % (1u32 << 29);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 29;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 26)) << (29 - 26);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 26;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 23)) << (29 - 23);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 23;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 20)) << (29 - 20);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 17)) << (29 - 17);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 17;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 14)) << (29 - 14);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 14;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 11)) << (29 - 11);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 11;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (29 - 8);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 8;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 5)) << (29 - 5);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 5;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (29 - 2);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 2) % (1u32 << 29);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 31;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 28)) << (29 - 28);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 25)) << (29 - 25);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 25;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 22)) << (29 - 22);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 22;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 19)) << (29 - 19);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 19;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (29 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 13)) << (29 - 13);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 13;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 10)) << (29 - 10);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 10;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 7)) << (29 - 7);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 7;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (29 - 4);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 4;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 1)) << (29 - 1);
-    out = out.offset(1);
-
-    *out = ((in_buf.read_unaligned()) >> 1) % (1u32 << 29);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 30;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 27)) << (29 - 27);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 27;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 24)) << (29 - 24);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 21)) << (29 - 21);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 21;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 18)) << (29 - 18);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 18;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 15)) << (29 - 15);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 15;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (29 - 12);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 12;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 9)) << (29 - 9);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 9;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 6)) << (29 - 6);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 6;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 3)) << (29 - 3);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 3;
-
-    in_buf.offset(1)
-}
-
-unsafe fn unpack30_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 {
-    *out = (in_buf.read_unaligned()) % (1u32 << 30);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 30;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 28)) << (30 - 28);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 26)) << (30 - 26);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 26;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 24)) << (30 - 24);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 22)) << (30 - 22);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 22;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 20)) << (30 - 20);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 18)) << (30 - 18);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 18;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (30 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 14)) << (30 - 14);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 14;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (30 - 12);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 12;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 10)) << (30 - 10);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 10;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (30 - 8);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 8;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 6)) << (30 - 6);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 6;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (30 - 4);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 4;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (30 - 2);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 2;
-    in_buf = in_buf.offset(1);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) % (1u32 << 30);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 30;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 28)) << (30 - 28);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 26)) << (30 - 26);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 26;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 24)) << (30 - 24);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 22)) << (30 - 22);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 22;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 20)) << (30 - 20);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 18)) << (30 - 18);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 18;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (30 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 14)) << (30 - 14);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 14;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (30 - 12);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 12;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 10)) << (30 - 10);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 10;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (30 - 8);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 8;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 6)) << (30 - 6);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 6;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (30 - 4);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 4;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (30 - 2);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 2;
-
-    in_buf.offset(1)
-}
-
-unsafe fn unpack31_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 {
-    *out = (in_buf.read_unaligned()) % (1u32 << 31);
-    out = out.offset(1);
-    *out = (in_buf.read_unaligned()) >> 31;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 30)) << (31 - 30);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 30;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 29)) << (31 - 29);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 29;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 28)) << (31 - 28);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 28;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 27)) << (31 - 27);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 27;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 26)) << (31 - 26);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 26;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 25)) << (31 - 25);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 25;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 24)) << (31 - 24);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 24;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 23)) << (31 - 23);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 23;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 22)) << (31 - 22);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 22;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 21)) << (31 - 21);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 21;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 20)) << (31 - 20);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 20;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 19)) << (31 - 19);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 19;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 18)) << (31 - 18);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 18;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 17)) << (31 - 17);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 17;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 16)) << (31 - 16);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 16;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 15)) << (31 - 15);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 15;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 14)) << (31 - 14);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 14;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 13)) << (31 - 13);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 13;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 12)) << (31 - 12);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 12;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 11)) << (31 - 11);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 11;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 10)) << (31 - 10);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 10;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 9)) << (31 - 9);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 9;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 8)) << (31 - 8);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 8;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 7)) << (31 - 7);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 7;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 6)) << (31 - 6);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 6;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 5)) << (31 - 5);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 5;
-    in_buf = in_buf.offset(1);
-    *out |= ((in_buf.read_unaligned()) % (1u32 << 4)) << (31 - 4);
-    out = out.offset(1);
-
-    *out = (in_buf.read_unaligned()) >> 4;
-    in_buf = in_buf.offset(1);
-    *out |=
((in_buf.read_unaligned()) % (1u32 << 3)) << (31 - 3); - out = out.offset(1); - - *out = (in_buf.read_unaligned()) >> 3; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 2)) << (31 - 2); - out = out.offset(1); - - *out = (in_buf.read_unaligned()) >> 2; - in_buf = in_buf.offset(1); - *out |= ((in_buf.read_unaligned()) % (1u32 << 1)) << (31 - 1); - out = out.offset(1); - - *out = (in_buf.read_unaligned()) >> 1; - - in_buf.offset(1) -} - -unsafe fn unpack32_32(mut in_buf: *const u32, mut out: *mut u32) -> *const u32 { - *out = in_buf.read_unaligned(); - in_buf = in_buf.offset(1); - out = out.offset(1); - - *out = in_buf.read_unaligned(); - in_buf = in_buf.offset(1); - out = out.offset(1); - - *out = in_buf.read_unaligned(); - in_buf = in_buf.offset(1); - out = out.offset(1); - - *out = in_buf.read_unaligned(); - in_buf = in_buf.offset(1); - out = out.offset(1); - - *out = in_buf.read_unaligned(); - in_buf = in_buf.offset(1); - out = out.offset(1); - - *out = in_buf.read_unaligned(); - in_buf = in_buf.offset(1); - out = out.offset(1); - - *out = in_buf.read_unaligned(); - in_buf = in_buf.offset(1); - out = out.offset(1); - - *out = in_buf.read_unaligned(); - in_buf = in_buf.offset(1); - out = out.offset(1); - - *out = in_buf.read_unaligned(); - in_buf = in_buf.offset(1); - out = out.offset(1); - - *out = in_buf.read_unaligned(); - in_buf = in_buf.offset(1); - out = out.offset(1); - - *out = in_buf.read_unaligned(); - in_buf = in_buf.offset(1); - out = out.offset(1); - - *out = in_buf.read_unaligned(); - in_buf = in_buf.offset(1); - out = out.offset(1); - - *out = in_buf.read_unaligned(); - in_buf = in_buf.offset(1); - out = out.offset(1); - - *out = in_buf.read_unaligned(); - in_buf = in_buf.offset(1); - out = out.offset(1); - - *out = in_buf.read_unaligned(); - in_buf = in_buf.offset(1); - out = out.offset(1); - - *out = in_buf.read_unaligned(); - in_buf = in_buf.offset(1); - out = out.offset(1); - - *out = in_buf.read_unaligned(); - in_buf = in_buf.offset(1); - out = out.offset(1); - - *out = in_buf.read_unaligned(); - in_buf = in_buf.offset(1); - out = out.offset(1); - - *out = in_buf.read_unaligned(); - in_buf = in_buf.offset(1); - out = out.offset(1); - - *out = in_buf.read_unaligned(); - in_buf = in_buf.offset(1); - out = out.offset(1); - - *out = in_buf.read_unaligned(); - in_buf = in_buf.offset(1); - out = out.offset(1); - - *out = in_buf.read_unaligned(); - in_buf = in_buf.offset(1); - out = out.offset(1); - - *out = in_buf.read_unaligned(); - in_buf = in_buf.offset(1); - out = out.offset(1); - - *out = in_buf.read_unaligned(); - in_buf = in_buf.offset(1); - out = out.offset(1); - - *out = in_buf.read_unaligned(); - in_buf = in_buf.offset(1); - out = out.offset(1); - - *out = in_buf.read_unaligned(); - in_buf = in_buf.offset(1); - out = out.offset(1); - - *out = in_buf.read_unaligned(); - in_buf = in_buf.offset(1); - out = out.offset(1); - - *out = in_buf.read_unaligned(); - in_buf = in_buf.offset(1); - out = out.offset(1); - - *out = in_buf.read_unaligned(); - in_buf = in_buf.offset(1); - out = out.offset(1); - - *out = in_buf.read_unaligned(); - in_buf = in_buf.offset(1); - out = out.offset(1); - - *out = in_buf.read_unaligned(); - in_buf = in_buf.offset(1); - out = out.offset(1); - - *out = in_buf.read_unaligned(); - - in_buf.offset(1) -} diff --git a/parquet/src/util/bit_util.rs b/parquet/src/util/bit_util.rs index 29269c4ad7e2..5d76a8dbf47d 100644 --- a/parquet/src/util/bit_util.rs +++ b/parquet/src/util/bit_util.rs @@ -18,8 +18,8 @@ use std::{cmp, 
mem::size_of}; use crate::data_type::AsBytes; -use crate::errors::{ParquetError, Result}; -use crate::util::{bit_packing::unpack32, memory::ByteBufferPtr}; +use crate::util::bit_pack::{unpack16, unpack32, unpack64, unpack8}; +use crate::util::memory::ByteBufferPtr; #[inline] pub fn from_ne_slice(bs: &[u8]) -> T { @@ -88,49 +88,17 @@ impl FromBytes for bool { from_le_bytes! { u8, u16, u32, u64, i8, i16, i32, i64, f32, f64 } -/// Reads `$size` of bytes from `$src`, and reinterprets them as type `$ty`, in -/// little-endian order. `$ty` must implement the `Default` trait. Otherwise this won't -/// compile. +/// Reads `size` of bytes from `src`, and reinterprets them as type `ty`, in +/// little-endian order. /// This is copied and modified from byteorder crate. -macro_rules! read_num_bytes { - ($ty:ty, $size:expr, $src:expr) => {{ - assert!($size <= $src.len()); - let mut buffer = <$ty as $crate::util::bit_util::FromBytes>::Buffer::default(); - buffer.as_mut()[..$size].copy_from_slice(&$src[..$size]); - <$ty>::from_ne_bytes(buffer) - }}; -} - -/// Converts value `val` of type `T` to a byte vector, by reading `num_bytes` from `val`. -/// NOTE: if `val` is less than the size of `T` then it can be truncated. -#[inline] -pub fn convert_to_bytes(val: &T, num_bytes: usize) -> Vec +pub(crate) fn read_num_bytes(size: usize, src: &[u8]) -> T where - T: ?Sized + AsBytes, + T: FromBytes, { - let mut bytes: Vec = vec![0; num_bytes]; - memcpy_value(val.as_bytes(), num_bytes, &mut bytes); - bytes -} - -#[inline] -pub fn memcpy(source: &[u8], target: &mut [u8]) { - assert!(target.len() >= source.len()); - target[..source.len()].copy_from_slice(source) -} - -#[inline] -pub fn memcpy_value(source: &T, num_bytes: usize, target: &mut [u8]) -where - T: ?Sized + AsBytes, -{ - assert!( - target.len() >= num_bytes, - "Not enough space. Only had {} bytes but need to put {} bytes", - target.len(), - num_bytes - ); - memcpy(&source.as_bytes()[..num_bytes], target) + assert!(size <= src.len()); + let mut buffer = ::Buffer::default(); + buffer.as_mut()[..size].copy_from_slice(&src[..size]); + ::from_ne_bytes(buffer) } /// Returns the ceil of value/divisor. @@ -138,7 +106,7 @@ where /// This function should be removed after /// [`int_roundings`](https://github.com/rust-lang/rust/issues/88581) is stable. #[inline] -pub fn ceil(value: i64, divisor: i64) -> i64 { +pub fn ceil(value: T, divisor: T) -> T { num::Integer::div_ceil(&value, &divisor) } @@ -148,20 +116,10 @@ pub fn trailing_bits(v: u64, num_bits: usize) -> u64 { if num_bits >= 64 { v } else { - v & ((1< u8 { @@ -180,59 +138,32 @@ pub fn get_bit(data: &[u8], i: usize) -> bool { /// bit packed or byte aligned fashion. pub struct BitWriter { buffer: Vec, - max_bytes: usize, buffered_values: u64, - byte_offset: usize, - bit_offset: usize, - start: usize, + bit_offset: u8, } impl BitWriter { pub fn new(max_bytes: usize) -> Self { Self { - buffer: vec![0; max_bytes], - max_bytes, + buffer: Vec::with_capacity(max_bytes), buffered_values: 0, - byte_offset: 0, bit_offset: 0, - start: 0, } } - /// Initializes the writer from the existing buffer `buffer` and starting - /// offset `start`. 
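The hunk above replaces the `read_num_bytes!` macro with a generic function and makes `ceil` generic over `num::Integer`. A minimal, self-contained sketch of the same pattern; the `FromBytes` trait here is a simplified stand-in for the one in `bit_util.rs`, not the crate's actual definition:

```rust
trait FromBytes: Sized {
    type Buffer: AsMut<[u8]> + Default;
    fn from_ne_bytes(b: Self::Buffer) -> Self;
}

impl FromBytes for u32 {
    type Buffer = [u8; 4];
    fn from_ne_bytes(b: [u8; 4]) -> Self {
        u32::from_ne_bytes(b) // resolves to the inherent method
    }
}

/// Reads `size` bytes from the front of `src` into a zeroed buffer of `T`.
fn read_num_bytes<T: FromBytes>(size: usize, src: &[u8]) -> T {
    assert!(size <= src.len());
    let mut buffer = T::Buffer::default();
    buffer.as_mut()[..size].copy_from_slice(&src[..size]);
    T::from_ne_bytes(buffer)
}

fn main() {
    // Two bytes read, the upper two left zeroed
    let v: u32 = read_num_bytes(2, &[0x02, 0x03, 0xff, 0xff]);
    assert_eq!(v, 0x0302); // on a little-endian target
}
```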
- pub fn new_from_buf(buffer: Vec, start: usize) -> Self { - assert!(start < buffer.len()); - let len = buffer.len(); + /// Initializes the writer appending to the existing buffer `buffer` + pub fn new_from_buf(buffer: Vec) -> Self { Self { buffer, - max_bytes: len, buffered_values: 0, - byte_offset: start, bit_offset: 0, - start, } } - /// Extend buffer size by `increment` bytes - #[inline] - pub fn extend(&mut self, increment: usize) { - self.max_bytes += increment; - let extra = vec![0; increment]; - self.buffer.extend(extra); - } - - /// Report buffer size, in bytes - #[inline] - pub fn capacity(&mut self) -> usize { - self.max_bytes - } - /// Consumes and returns the current buffer. #[inline] pub fn consume(mut self) -> Vec { self.flush(); - self.buffer.truncate(self.byte_offset); self.buffer } @@ -241,53 +172,37 @@ impl BitWriter { #[inline] pub fn flush_buffer(&mut self) -> &[u8] { self.flush(); - &self.buffer()[0..self.byte_offset] + self.buffer() } /// Clears the internal state so the buffer can be reused. #[inline] pub fn clear(&mut self) { + self.buffer.clear(); self.buffered_values = 0; - self.byte_offset = self.start; self.bit_offset = 0; } /// Flushes the internal buffered bits and the align the buffer to the next byte. #[inline] pub fn flush(&mut self) { - let num_bytes = ceil(self.bit_offset as i64, 8) as usize; - assert!(self.byte_offset + num_bytes <= self.max_bytes); - memcpy_value( - &self.buffered_values, - num_bytes, - &mut self.buffer[self.byte_offset..], - ); + let num_bytes = ceil(self.bit_offset, 8); + let slice = &self.buffered_values.to_le_bytes()[..num_bytes as usize]; + self.buffer.extend_from_slice(slice); self.buffered_values = 0; self.bit_offset = 0; - self.byte_offset += num_bytes; } /// Advances the current offset by skipping `num_bytes`, flushing the internal bit /// buffer first. /// This is useful when you want to jump over `num_bytes` bytes and come back later /// to fill these bytes. - /// - /// Returns error if `num_bytes` is beyond the boundary of the internal buffer. - /// Otherwise, returns the old offset. #[inline] - pub fn skip(&mut self, num_bytes: usize) -> Result { + pub fn skip(&mut self, num_bytes: usize) -> usize { self.flush(); - assert!(self.byte_offset <= self.max_bytes); - if self.byte_offset + num_bytes > self.max_bytes { - return Err(general_err!( - "Not enough bytes left in BitWriter. Need {} but only have {}", - self.byte_offset + num_bytes, - self.max_bytes - )); - } - let result = self.byte_offset; - self.byte_offset += num_bytes; - Ok(result) + let result = self.buffer.len(); + self.buffer.extend(std::iter::repeat(0).take(num_bytes)); + result } /// Returns a slice containing the next `num_bytes` bytes starting from the current @@ -295,32 +210,24 @@ impl BitWriter { /// This is useful when you want to jump over `num_bytes` bytes and come back later /// to fill these bytes. #[inline] - pub fn get_next_byte_ptr(&mut self, num_bytes: usize) -> Result<&mut [u8]> { - let offset = self.skip(num_bytes)?; - Ok(&mut self.buffer[offset..offset + num_bytes]) + pub fn get_next_byte_ptr(&mut self, num_bytes: usize) -> &mut [u8] { + let offset = self.skip(num_bytes); + &mut self.buffer[offset..offset + num_bytes] } #[inline] pub fn bytes_written(&self) -> usize { - self.byte_offset - self.start + ceil(self.bit_offset as i64, 8) as usize + self.buffer.len() + ceil(self.bit_offset, 8) as usize } #[inline] pub fn buffer(&self) -> &[u8] { - &self.buffer[self.start..] 
+ &self.buffer } #[inline] pub fn byte_offset(&self) -> usize { - self.byte_offset - } - - /// Returns the internal buffer length. This is the maximum number of bytes that this - /// writer can write. User needs to call `consume` to consume the current buffer - /// before more data can be written. - #[inline] - pub fn buffer_len(&self) -> usize { - self.max_bytes + self.buffer.len() } /// Writes the entire byte `value` at the byte `offset` @@ -330,53 +237,36 @@ impl BitWriter { /// Writes the `num_bits` LSB of value `v` to the internal buffer of this writer. /// The `num_bits` must not be greater than 64. This is bit packed. - /// - /// Returns false if there's not enough room left. True otherwise. #[inline] - pub fn put_value(&mut self, v: u64, num_bits: usize) -> bool { + pub fn put_value(&mut self, v: u64, num_bits: usize) { assert!(num_bits <= 64); + let num_bits = num_bits as u8; assert_eq!(v.checked_shr(num_bits as u32).unwrap_or(0), 0); // covers case v >> 64 - if self.byte_offset * 8 + self.bit_offset + num_bits > self.max_bytes as usize * 8 - { - return false; - } - + // Add value to buffered_values self.buffered_values |= v << self.bit_offset; self.bit_offset += num_bits; - if self.bit_offset >= 64 { - memcpy_value( - &self.buffered_values, - 8, - &mut self.buffer[self.byte_offset..], - ); - self.byte_offset += 8; - self.bit_offset -= 64; - self.buffered_values = 0; + if let Some(remaining) = self.bit_offset.checked_sub(64) { + self.buffer + .extend_from_slice(&self.buffered_values.to_le_bytes()); + self.bit_offset = remaining; + // Perform checked right shift: v >> offset, where offset < 64, otherwise we // shift all bits self.buffered_values = v .checked_shr((num_bits - self.bit_offset) as u32) .unwrap_or(0); } - assert!(self.bit_offset < 64); - true } /// Writes `val` of `num_bytes` bytes to the next aligned byte. If size of `T` is /// larger than `num_bytes`, extra higher ordered bytes will be ignored. - /// - /// Returns false if there's not enough room left. True otherwise. #[inline] - pub fn put_aligned(&mut self, val: T, num_bytes: usize) -> bool { - let result = self.get_next_byte_ptr(num_bytes); - if result.is_err() { - // TODO: should we return `Result` for this func? - return false; - } - let ptr = result.unwrap(); - memcpy_value(&val, num_bytes, ptr); - true + pub fn put_aligned(&mut self, val: T, num_bytes: usize) { + self.flush(); + let slice = val.as_bytes(); + let len = num_bytes.min(slice.len()); + self.buffer.extend_from_slice(&slice[..len]); } /// Writes `val` of `num_bytes` bytes at the designated `offset`. The `offset` is the @@ -384,49 +274,34 @@ impl BitWriter { /// maintains. Note that this will overwrite any existing data between `offset` and /// `offset + num_bytes`. Also that if size of `T` is larger than `num_bytes`, extra /// higher ordered bytes will be ignored. - /// - /// Returns false if there's not enough room left, or the `pos` is not valid. - /// True otherwise. #[inline] pub fn put_aligned_offset( &mut self, val: T, num_bytes: usize, offset: usize, - ) -> bool { - if num_bytes + offset > self.max_bytes { - return false; - } - memcpy_value( - &val, - num_bytes, - &mut self.buffer[offset..offset + num_bytes], - ); - true + ) { + let slice = val.as_bytes(); + let len = num_bytes.min(slice.len()); + self.buffer[offset..offset + len].copy_from_slice(&slice[..len]) } /// Writes a VLQ encoded integer `v` to this buffer. The value is byte aligned. - /// - /// Returns false if there's not enough room left. True otherwise. 
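With the fixed-size buffer gone, the `put_*` methods above no longer signal overflow with a `bool`; the backing `Vec<u8>` simply grows. A usage sketch of the new shape, assuming the crate's `util::bit_util::BitWriter` is reachable from your code (values and widths are illustrative):

```rust
use parquet::util::bit_util::BitWriter;

fn main() {
    let mut writer = BitWriter::new(8); // now just a capacity hint, not a hard limit
    for v in 0..4u64 {
        writer.put_value(v, 3); // infallible: 4 * 3 = 12 bits buffered
    }
    writer.put_aligned(0xABu8, 1); // flushes the partial bits, then writes one aligned byte
    let bytes = writer.consume(); // ceil(12 / 8) = 2 packed bytes + 1 aligned byte
    assert_eq!(bytes.len(), 3);
}
```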
#[inline] - pub fn put_vlq_int(&mut self, mut v: u64) -> bool { - let mut result = true; + pub fn put_vlq_int(&mut self, mut v: u64) { while v & 0xFFFFFFFFFFFFFF80 != 0 { - result &= self.put_aligned::(((v & 0x7F) | 0x80) as u8, 1); + self.put_aligned::(((v & 0x7F) | 0x80) as u8, 1); v >>= 7; } - result &= self.put_aligned::((v & 0x7F) as u8, 1); - result + self.put_aligned::((v & 0x7F) as u8, 1); } /// Writes a zigzag-VLQ encoded (in little endian order) int `v` to this buffer. /// Zigzag-VLQ is a variant of VLQ encoding where negative and positive /// numbers are encoded in a zigzag fashion. /// See: https://developers.google.com/protocol-buffers/docs/encoding - /// - /// Returns false if there's not enough room left. True otherwise. #[inline] - pub fn put_zigzag_vlq_int(&mut self, v: i64) -> bool { + pub fn put_zigzag_vlq_int(&mut self, v: i64) { let u: u64 = ((v << 1) ^ (v >> 63)) as u64; self.put_vlq_int(u) } @@ -466,7 +341,7 @@ impl BitReader { pub fn new(buffer: ByteBufferPtr) -> Self { let total_bytes = buffer.len(); let num_bytes = cmp::min(8, total_bytes); - let buffered_values = read_num_bytes!(u64, num_bytes, buffer.as_ref()); + let buffered_values = read_num_bytes::(num_bytes, buffer.as_ref()); BitReader { buffer, buffered_values, @@ -480,7 +355,7 @@ impl BitReader { self.buffer = buffer; self.total_bytes = self.buffer.len(); let num_bytes = cmp::min(8, self.total_bytes); - self.buffered_values = read_num_bytes!(u64, num_bytes, self.buffer.as_ref()); + self.buffered_values = read_num_bytes::(num_bytes, self.buffer.as_ref()); self.byte_offset = 0; self.bit_offset = 0; } @@ -488,7 +363,7 @@ impl BitReader { /// Gets the current byte offset #[inline] pub fn get_byte_offset(&self) -> usize { - self.byte_offset + ceil(self.bit_offset as i64, 8) as usize + self.byte_offset + ceil(self.bit_offset, 8) } /// Reads a value of type `T` and of size `num_bits`. @@ -541,12 +416,13 @@ impl BitReader { true } - /// Read multiple values from their packed representation + /// Read multiple values from their packed representation where each element is represented + /// by `num_bits` bits. /// /// # Panics /// /// This function panics if - /// - `bit_width` is larger than the bit-capacity of `T` + /// - `num_bits` is larger than the bit-capacity of `T` /// pub fn get_batch(&mut self, batch: &mut [T], num_bits: usize) -> usize { assert!(num_bits <= size_of::() * 8); @@ -560,17 +436,6 @@ impl BitReader { let mut i = 0; - if num_bits > 32 { - // No fast path - read values individually - while i < values_to_read { - batch[i] = self - .get_value(num_bits) - .expect("expected to have more data"); - i += 1; - } - return values_to_read - } - // First align bit offset to byte offset if self.bit_offset != 0 { while i < values_to_read && self.bit_offset != 0 { @@ -581,46 +446,104 @@ impl BitReader { } } - let in_buf = &self.buffer.data()[self.byte_offset..]; - let mut in_ptr = in_buf as *const [u8] as *const u8 as *const u32; - if size_of::() == 4 { - while values_to_read - i >= 32 { - let out_ptr = &mut batch[i..] 
as *mut [T] as *mut T as *mut u32; - in_ptr = unsafe { unpack32(in_ptr, out_ptr, num_bits) }; - self.byte_offset += 4 * num_bits; - i += 32; + let in_buf = self.buffer.data(); + + // Read directly into output buffer + match size_of::() { + 1 => { + let ptr = batch.as_mut_ptr() as *mut u8; + let out = unsafe { std::slice::from_raw_parts_mut(ptr, batch.len()) }; + while values_to_read - i >= 8 { + let out_slice = (&mut out[i..i + 8]).try_into().unwrap(); + unpack8(&in_buf[self.byte_offset..], out_slice, num_bits); + self.byte_offset += num_bits; + i += 8; + } + } + 2 => { + let ptr = batch.as_mut_ptr() as *mut u16; + let out = unsafe { std::slice::from_raw_parts_mut(ptr, batch.len()) }; + while values_to_read - i >= 16 { + let out_slice = (&mut out[i..i + 16]).try_into().unwrap(); + unpack16(&in_buf[self.byte_offset..], out_slice, num_bits); + self.byte_offset += 2 * num_bits; + i += 16; + } } - } else { - let mut out_buf = [0u32; 32]; - let out_ptr = &mut out_buf as &mut [u32] as *mut [u32] as *mut u32; - while values_to_read - i >= 32 { - in_ptr = unsafe { unpack32(in_ptr, out_ptr, num_bits) }; - self.byte_offset += 4 * num_bits; - - for out in out_buf { - // Zero-allocate buffer - let mut out_bytes = T::Buffer::default(); - let in_bytes = out.to_le_bytes(); - - { - let out_bytes = out_bytes.as_mut(); - let len = out_bytes.len().min(in_bytes.len()); - (&mut out_bytes[..len]).copy_from_slice(&in_bytes[..len]); - } - - batch[i] = T::from_le_bytes(out_bytes); - i += 1; + 4 => { + let ptr = batch.as_mut_ptr() as *mut u32; + let out = unsafe { std::slice::from_raw_parts_mut(ptr, batch.len()) }; + while values_to_read - i >= 32 { + let out_slice = (&mut out[i..i + 32]).try_into().unwrap(); + unpack32(&in_buf[self.byte_offset..], out_slice, num_bits); + self.byte_offset += 4 * num_bits; + i += 32; } } + 8 => { + let ptr = batch.as_mut_ptr() as *mut u64; + let out = unsafe { std::slice::from_raw_parts_mut(ptr, batch.len()) }; + while values_to_read - i >= 64 { + let out_slice = (&mut out[i..i + 64]).try_into().unwrap(); + unpack64(&in_buf[self.byte_offset..], out_slice, num_bits); + self.byte_offset += 8 * num_bits; + i += 64; + } + } + _ => unreachable!(), + } + + // Try to read smaller batches if possible + if size_of::() > 4 && values_to_read - i >= 32 && num_bits <= 32 { + let mut out_buf = [0_u32; 32]; + unpack32(&in_buf[self.byte_offset..], &mut out_buf, num_bits); + self.byte_offset += 4 * num_bits; + + for out in out_buf { + // Zero-allocate buffer + let mut out_bytes = T::Buffer::default(); + out_bytes.as_mut()[..4].copy_from_slice(&out.to_le_bytes()); + batch[i] = T::from_le_bytes(out_bytes); + i += 1; + } + } + + if size_of::() > 2 && values_to_read - i >= 16 && num_bits <= 16 { + let mut out_buf = [0_u16; 16]; + unpack16(&in_buf[self.byte_offset..], &mut out_buf, num_bits); + self.byte_offset += 2 * num_bits; + + for out in out_buf { + // Zero-allocate buffer + let mut out_bytes = T::Buffer::default(); + out_bytes.as_mut()[..2].copy_from_slice(&out.to_le_bytes()); + batch[i] = T::from_le_bytes(out_bytes); + i += 1; + } } - assert!(values_to_read - i < 32); + if size_of::() > 1 && values_to_read - i >= 8 && num_bits <= 8 { + let mut out_buf = [0_u8; 8]; + unpack8(&in_buf[self.byte_offset..], &mut out_buf, num_bits); + self.byte_offset += num_bits; + + for out in out_buf { + // Zero-allocate buffer + let mut out_bytes = T::Buffer::default(); + out_bytes.as_mut()[..1].copy_from_slice(&out.to_le_bytes()); + batch[i] = T::from_le_bytes(out_bytes); + i += 1; + } + } 
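The dispatch above hands full miniblocks to `unpack8`/`unpack16`/`unpack32`/`unpack64` from the new private `bit_pack` module; judging by the `byte_offset` arithmetic, each call expands 8/16/32/64 values from `size_of::<T>() * num_bits` input bytes. A minimal model of the 8-value case, written as an assumption about those semantics rather than the crate's actual implementation:

```rust
/// Expands 8 little-endian bit-packed values of `num_bits` each
/// (so exactly `num_bits` input bytes) into `out`.
fn unpack8(input: &[u8], out: &mut [u8; 8], num_bits: usize) {
    assert!(num_bits <= 8 && input.len() >= num_bits);
    let mask = (1u64 << num_bits) - 1; // num_bits == 8 still fits in u64
    // Accumulate the packed bytes into one u64, then shift each value out of it
    let mut buf = [0u8; 8];
    buf[..num_bits].copy_from_slice(&input[..num_bits]);
    let word = u64::from_le_bytes(buf);
    for (i, o) in out.iter_mut().enumerate() {
        *o = ((word >> (i * num_bits)) & mask) as u8;
    }
}

fn main() {
    // 8 values of 3 bits each: 0..8 packed LSB-first into 3 bytes
    let packed = [0b10001000, 0b11000110, 0b11111010];
    let mut out = [0u8; 8];
    unpack8(&packed, &mut out, 3);
    assert_eq!(out, [0, 1, 2, 3, 4, 5, 6, 7]);
}
```

Unlike the deleted pointer-walking `unpack29_32`-style routines, this shape needs no `unsafe` and no unaligned reads.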
self.reload_buffer_values(); + + // Read any trailing values while i < values_to_read { - batch[i] = self + let value = self .get_value(num_bits) .expect("expected to have more data"); + batch[i] = value; i += 1; } @@ -645,8 +568,7 @@ impl BitReader { // First align bit offset to byte offset if self.bit_offset != 0 { while values_skipped < num_values && self.bit_offset != 0 { - self - .skip_value(num_bits); + self.skip_value(num_bits); values_skipped += 1; } } @@ -656,7 +578,6 @@ impl BitReader { values_skipped += 32; } - assert!(num_values - values_skipped < 32); self.reload_buffer_values(); @@ -696,14 +617,14 @@ impl BitReader { /// Returns `Some` if there's enough bytes left to form a value of `T`. /// Otherwise `None`. pub fn get_aligned(&mut self, num_bytes: usize) -> Option { - let bytes_read = ceil(self.bit_offset as i64, 8) as usize; + let bytes_read = ceil(self.bit_offset, 8); if self.byte_offset + bytes_read + num_bytes > self.total_bytes { return None; } // Advance byte_offset to next unread byte and read num_bytes self.byte_offset += bytes_read; - let v = read_num_bytes!(T, num_bytes, self.buffer.data()[self.byte_offset..]); + let v = read_num_bytes::(num_bytes, &self.buffer.data()[self.byte_offset..]); self.byte_offset += num_bytes; // Reset buffered_values @@ -754,7 +675,7 @@ impl BitReader { fn reload_buffer_values(&mut self) { let bytes_to_read = cmp::min(self.total_bytes - self.byte_offset, 8); self.buffered_values = - read_num_bytes!(u64, bytes_to_read, self.buffer.data()[self.byte_offset..]); + read_num_bytes::(bytes_to_read, &self.buffer.data()[self.byte_offset..]); } } @@ -765,20 +686,11 @@ impl From> for BitReader { } } -/// Returns the nearest multiple of `factor` that is `>=` than `num`. Here `factor` must -/// be a power of 2. 
-/// -/// Copied from the arrow crate to make arrow optional -pub fn round_upto_power_of_2(num: usize, factor: usize) -> usize { - debug_assert!(factor > 0 && (factor & (factor - 1)) == 0); - (num + (factor - 1)) & !(factor - 1) -} - #[cfg(test)] mod tests { - use super::super::test_common::*; use super::*; + use crate::util::test_common::rand_gen::random_numbers; use rand::distributions::{Distribution, Standard}; use std::fmt::Debug; @@ -792,9 +704,9 @@ mod tests { assert_eq!(ceil(8, 8), 1); assert_eq!(ceil(9, 8), 2); assert_eq!(ceil(9, 9), 1); - assert_eq!(ceil(10000000000, 10), 1000000000); - assert_eq!(ceil(10, 10000000000), 1); - assert_eq!(ceil(10000000000, 1000000000), 10); + assert_eq!(ceil(10000000000_u64, 10), 1000000000); + assert_eq!(ceil(10_u64, 10000000000), 1); + assert_eq!(ceil(10000000000_u64, 1000000000), 10); } #[test] @@ -846,16 +758,16 @@ mod tests { fn test_bit_reader_skip() { let buffer = vec![255, 0]; let mut bit_reader = BitReader::from(buffer); - let skipped = bit_reader.skip(1,1); + let skipped = bit_reader.skip(1, 1); assert_eq!(skipped, 1); assert_eq!(bit_reader.get_value::(1), Some(1)); - let skipped = bit_reader.skip(2,2); + let skipped = bit_reader.skip(2, 2); assert_eq!(skipped, 2); assert_eq!(bit_reader.get_value::(2), Some(3)); - let skipped = bit_reader.skip(4,1); + let skipped = bit_reader.skip(4, 1); assert_eq!(skipped, 4); assert_eq!(bit_reader.get_value::(4), Some(0)); - let skipped = bit_reader.skip(1,1); + let skipped = bit_reader.skip(1, 1); assert_eq!(skipped, 0); } @@ -911,25 +823,6 @@ mod tests { assert_eq!(bit_reader.get_zigzag_vlq_int(), Some(-2)); } - #[test] - fn test_set_array_bit() { - let mut buffer = vec![0, 0, 0]; - set_array_bit(&mut buffer[..], 1); - assert_eq!(buffer, vec![2, 0, 0]); - set_array_bit(&mut buffer[..], 4); - assert_eq!(buffer, vec![18, 0, 0]); - unset_array_bit(&mut buffer[..], 1); - assert_eq!(buffer, vec![16, 0, 0]); - set_array_bit(&mut buffer[..], 10); - assert_eq!(buffer, vec![16, 4, 0]); - set_array_bit(&mut buffer[..], 10); - assert_eq!(buffer, vec![16, 4, 0]); - set_array_bit(&mut buffer[..], 11); - assert_eq!(buffer, vec![16, 12, 0]); - unset_array_bit(&mut buffer[..], 10); - assert_eq!(buffer, vec![16, 8, 0]); - } - #[test] fn test_num_required_bits() { assert_eq!(num_required_bits(0), 0); @@ -973,7 +866,7 @@ mod tests { #[test] fn test_skip() { let mut writer = BitWriter::new(5); - let old_offset = writer.skip(1).expect("skip() should return OK"); + let old_offset = writer.skip(1); writer.put_aligned(42, 4); writer.put_aligned_offset(0x10, 1, old_offset); let result = writer.consume(); @@ -981,16 +874,15 @@ mod tests { writer = BitWriter::new(4); let result = writer.skip(5); - assert!(result.is_err()); + assert_eq!(result, 0); + assert_eq!(writer.buffer(), &[0; 5]) } #[test] fn test_get_next_byte_ptr() { let mut writer = BitWriter::new(5); { - let first_byte = writer - .get_next_byte_ptr(1) - .expect("get_next_byte_ptr() should return OK"); + let first_byte = writer.get_next_byte_ptr(1); first_byte[0] = 0x10; } writer.put_aligned(42, 4); @@ -1017,8 +909,7 @@ mod tests { let mut writer = BitWriter::new(len); for i in 0..8 { - let result = writer.put_value(i % 2, 1); - assert!(result); + writer.put_value(i % 2, 1); } writer.flush(); @@ -1029,11 +920,10 @@ mod tests { // Write 00110011 for i in 0..8 { - let result = match i { + match i { 0 | 1 | 4 | 5 => writer.put_value(false as u64, 1), _ => writer.put_value(true as u64, 1), - }; - assert!(result); + } } writer.flush(); { @@ -1078,19 +968,13 @@ mod tests { fn 
test_put_value_rand_numbers(total: usize, num_bits: usize) { assert!(num_bits < 64); - let num_bytes = ceil(num_bits as i64, 8); + let num_bytes = ceil(num_bits, 8); let mut writer = BitWriter::new(num_bytes as usize * total); let values: Vec = random_numbers::(total) .iter() .map(|v| v & ((1 << num_bits) - 1)) .collect(); - (0..total).for_each(|i| { - assert!( - writer.put_value(values[i] as u64, num_bits), - "[{}]: put_value() failed", - i - ); - }); + (0..total).for_each(|i| writer.put_value(values[i] as u64, num_bits)); let mut reader = BitReader::from(writer.consume()); (0..total).for_each(|i| { @@ -1109,11 +993,12 @@ mod tests { fn test_get_batch() { const SIZE: &[usize] = &[1, 31, 32, 33, 128, 129]; for s in SIZE { - for i in 0..33 { + for i in 0..=64 { match i { 0..=8 => test_get_batch_helper::(*s, i), 9..=16 => test_get_batch_helper::(*s, i), - _ => test_get_batch_helper::(*s, i), + 17..=32 => test_get_batch_helper::(*s, i), + _ => test_get_batch_helper::(*s, i), } } } @@ -1123,22 +1008,25 @@ mod tests { where T: FromBytes + Default + Clone + Debug + Eq, { - assert!(num_bits <= 32); - let num_bytes = ceil(num_bits as i64, 8); + assert!(num_bits <= 64); + let num_bytes = ceil(num_bits, 8); let mut writer = BitWriter::new(num_bytes as usize * total); - let values: Vec = random_numbers::(total) + let mask = match num_bits { + 64 => u64::MAX, + _ => (1 << num_bits) - 1, + }; + + let values: Vec = random_numbers::(total) .iter() - .map(|v| v & ((1u64 << num_bits) - 1) as u32) + .map(|v| v & mask) .collect(); // Generic values used to check against actual values read from `get_batch`. let expected_values: Vec = values.iter().map(|v| from_ne_slice(v.as_bytes())).collect(); - (0..total).for_each(|i| { - assert!(writer.put_value(values[i] as u64, num_bits)); - }); + (0..total).for_each(|i| writer.put_value(values[i] as u64, num_bits)); let buf = writer.consume(); let mut reader = BitReader::from(buf); @@ -1147,9 +1035,12 @@ mod tests { assert_eq!(values_read, values.len()); for i in 0..batch.len() { assert_eq!( - batch[i], expected_values[i], - "num_bits = {}, index = {}", - num_bits, i + batch[i], + expected_values[i], + "max_num_bits = {}, num_bits = {}, index = {}", + size_of::() * 8, + num_bits, + i ); } } @@ -1175,7 +1066,7 @@ mod tests { assert!(total % 2 == 0); let aligned_value_byte_width = std::mem::size_of::(); - let value_byte_width = ceil(num_bits as i64, 8) as usize; + let value_byte_width = ceil(num_bits, 8); let mut writer = BitWriter::new((total / 2) * (aligned_value_byte_width + value_byte_width)); let values: Vec = random_numbers::(total / 2) @@ -1187,17 +1078,9 @@ mod tests { for i in 0..total { let j = i / 2; if i % 2 == 0 { - assert!( - writer.put_value(values[j] as u64, num_bits), - "[{}]: put_value() failed", - i - ); + writer.put_value(values[j] as u64, num_bits); } else { - assert!( - writer.put_aligned::(aligned_values[j], aligned_value_byte_width), - "[{}]: put_aligned() failed", - i - ); + writer.put_aligned::(aligned_values[j], aligned_value_byte_width) } } @@ -1231,13 +1114,7 @@ mod tests { let total = 64; let mut writer = BitWriter::new(total * 32); let values = random_numbers::(total); - (0..total).for_each(|i| { - assert!( - writer.put_vlq_int(values[i] as u64), - "[{}]; put_vlq_int() failed", - i - ); - }); + (0..total).for_each(|i| writer.put_vlq_int(values[i] as u64)); let mut reader = BitReader::from(writer.consume()); (0..total).for_each(|i| { @@ -1257,13 +1134,7 @@ mod tests { let total = 64; let mut writer = BitWriter::new(total * 32); let values = 
random_numbers::(total); - (0..total).for_each(|i| { - assert!( - writer.put_zigzag_vlq_int(values[i] as i64), - "[{}]; put_zigzag_vlq_int() failed", - i - ); - }); + (0..total).for_each(|i| writer.put_zigzag_vlq_int(values[i] as i64)); let mut reader = BitReader::from(writer.consume()); (0..total).for_each(|i| { diff --git a/parquet/src/util/cursor.rs b/parquet/src/util/cursor.rs deleted file mode 100644 index 706724dbf52a..000000000000 --- a/parquet/src/util/cursor.rs +++ /dev/null @@ -1,284 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use crate::util::io::TryClone; -use std::io::{self, Cursor, Error, ErrorKind, Read, Seek, SeekFrom, Write}; -use std::sync::{Arc, Mutex}; -use std::{cmp, fmt}; - -/// This is object to use if your file is already in memory. -/// The sliceable cursor is similar to std::io::Cursor, except that it makes it easy to create "cursor slices". -/// To achieve this, it uses Arc instead of shared references. Indeed reference fields are painful -/// because the lack of Generic Associated Type implies that you would require complex lifetime propagation when -/// returning such a cursor. -#[allow(clippy::rc_buffer)] -#[deprecated = "use bytes::Bytes instead"] -pub struct SliceableCursor { - inner: Arc>, - start: u64, - length: usize, - pos: u64, -} - -#[allow(deprecated)] -impl fmt::Debug for SliceableCursor { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("SliceableCursor") - .field("start", &self.start) - .field("length", &self.length) - .field("pos", &self.pos) - .field("inner.len", &self.inner.len()) - .finish() - } -} - -#[allow(deprecated)] -impl SliceableCursor { - pub fn new(content: impl Into>>) -> Self { - let inner = content.into(); - let size = inner.len(); - SliceableCursor { - inner, - start: 0, - pos: 0, - length: size, - } - } - - /// Create a slice cursor using the same data as a current one. 
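The `SliceableCursor` being deleted below carries `#[deprecated = "use bytes::Bytes instead"]`; `bytes::Bytes` offers the same cheap, owned sub-slicing through reference counting. A small sketch of the replacement, assuming the `bytes` crate:

```rust
use bytes::Bytes;

fn main() {
    // Same backing store as get_u8_range() in the deleted tests
    let data = Bytes::from((0u8..=255).collect::<Vec<u8>>());

    // Zero-copy equivalent of SliceableCursor::slice(10, 10)
    let slice = data.slice(10..20);
    assert_eq!(slice.as_ref(), (10u8..=19).collect::<Vec<u8>>().as_slice());
}
```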
- pub fn slice(&self, start: u64, length: usize) -> io::Result { - let new_start = self.start + start; - if new_start >= self.inner.len() as u64 - || new_start as usize + length > self.inner.len() - { - return Err(Error::new(ErrorKind::InvalidInput, "out of bound")); - } - Ok(SliceableCursor { - inner: Arc::clone(&self.inner), - start: new_start, - pos: new_start, - length, - }) - } - - fn remaining_slice(&self) -> &[u8] { - let end = self.start as usize + self.length; - let offset = cmp::min(self.pos, end as u64) as usize; - &self.inner[offset..end] - } - - /// Get the length of the current cursor slice - pub fn len(&self) -> u64 { - self.length as u64 - } - - /// return true if the cursor is empty (self.len() == 0) - pub fn is_empty(&self) -> bool { - self.len() == 0 - } -} - -/// Implementation inspired by std::io::Cursor -#[allow(deprecated)] -impl Read for SliceableCursor { - fn read(&mut self, buf: &mut [u8]) -> io::Result { - let n = Read::read(&mut self.remaining_slice(), buf)?; - self.pos += n as u64; - Ok(n) - } -} - -#[allow(deprecated)] -impl Seek for SliceableCursor { - fn seek(&mut self, pos: SeekFrom) -> io::Result { - let new_pos = match pos { - SeekFrom::Start(pos) => pos as i64, - SeekFrom::End(pos) => self.inner.len() as i64 + pos as i64, - SeekFrom::Current(pos) => self.pos as i64 + pos as i64, - }; - - if new_pos < 0 { - Err(Error::new( - ErrorKind::InvalidInput, - format!( - "Request out of bounds: cur position {} + seek {:?} < 0: {}", - self.pos, pos, new_pos - ), - )) - } else if new_pos >= self.inner.len() as i64 { - Err(Error::new( - ErrorKind::InvalidInput, - format!( - "Request out of bounds: cur position {} + seek {:?} >= length {}: {}", - self.pos, - pos, - self.inner.len(), - new_pos - ), - )) - } else { - self.pos = new_pos as u64; - Ok(self.start) - } - } -} - -/// Use this type to write Parquet to memory rather than a file. -#[deprecated = "use Vec instead"] -#[derive(Debug, Default, Clone)] -pub struct InMemoryWriteableCursor { - buffer: Arc>>>, -} - -#[allow(deprecated)] -impl InMemoryWriteableCursor { - /// Consume this instance and return the underlying buffer as long as there are no other - /// references to this instance. 
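Similarly, the `InMemoryWriteableCursor` removed below is deprecated in favour of a plain `Vec<u8>`, which already implements `std::io::Write` and needs no `Arc<Mutex<_>>` for single-threaded use:

```rust
use std::io::Write;

fn main() -> std::io::Result<()> {
    // A Vec<u8> is a ready-made in-memory sink
    let mut buffer: Vec<u8> = Vec::new();
    buffer.write_all(b"PAR1")?;
    assert_eq!(buffer, b"PAR1");
    Ok(())
}
```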
- pub fn into_inner(self) -> Option> { - Arc::try_unwrap(self.buffer) - .ok() - .and_then(|mutex| mutex.into_inner().ok()) - .map(|cursor| cursor.into_inner()) - } - - /// Returns a clone of the underlying buffer - pub fn data(&self) -> Vec { - let inner = self.buffer.lock().unwrap(); - inner.get_ref().to_vec() - } - - /// Returns a length of the underlying buffer - pub fn len(&self) -> usize { - let inner = self.buffer.lock().unwrap(); - inner.get_ref().len() - } - - /// Returns true if the underlying buffer contains no elements - pub fn is_empty(&self) -> bool { - let inner = self.buffer.lock().unwrap(); - inner.get_ref().is_empty() - } -} - -#[allow(deprecated)] -impl TryClone for InMemoryWriteableCursor { - fn try_clone(&self) -> std::io::Result { - Ok(Self { - buffer: self.buffer.clone(), - }) - } -} - -#[allow(deprecated)] -impl Write for InMemoryWriteableCursor { - fn write(&mut self, buf: &[u8]) -> std::io::Result { - let mut inner = self.buffer.lock().unwrap(); - inner.write(buf) - } - - fn flush(&mut self) -> std::io::Result<()> { - let mut inner = self.buffer.lock().unwrap(); - inner.flush() - } -} - -#[allow(deprecated)] -impl Seek for InMemoryWriteableCursor { - fn seek(&mut self, pos: SeekFrom) -> std::io::Result { - let mut inner = self.buffer.lock().unwrap(); - inner.seek(pos) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - /// Create a SliceableCursor of all u8 values in ascending order - #[allow(deprecated)] - fn get_u8_range() -> SliceableCursor { - let data: Vec = (0u8..=255).collect(); - SliceableCursor::new(data) - } - - /// Reads all the bytes in the slice and checks that it matches the u8 range from start to end_included - #[allow(deprecated)] - fn check_read_all(mut cursor: SliceableCursor, start: u8, end_included: u8) { - let mut target = vec![]; - let cursor_res = cursor.read_to_end(&mut target); - println!("{:?}", cursor_res); - assert!(cursor_res.is_ok(), "reading error"); - assert_eq!((end_included - start) as usize + 1, cursor_res.unwrap()); - assert_eq!((start..=end_included).collect::>(), target); - } - - #[test] - fn read_all_whole() { - let cursor = get_u8_range(); - check_read_all(cursor, 0, 255); - } - - #[test] - fn read_all_slice() { - let cursor = get_u8_range().slice(10, 10).expect("error while slicing"); - check_read_all(cursor, 10, 19); - } - - #[test] - fn seek_cursor_start() { - let mut cursor = get_u8_range(); - - cursor.seek(SeekFrom::Start(5)).unwrap(); - check_read_all(cursor, 5, 255); - } - - #[test] - fn seek_cursor_current() { - let mut cursor = get_u8_range(); - cursor.seek(SeekFrom::Start(10)).unwrap(); - cursor.seek(SeekFrom::Current(10)).unwrap(); - check_read_all(cursor, 20, 255); - } - - #[test] - fn seek_cursor_end() { - let mut cursor = get_u8_range(); - - cursor.seek(SeekFrom::End(-10)).unwrap(); - check_read_all(cursor, 246, 255); - } - - #[test] - fn seek_cursor_error_too_long() { - let mut cursor = get_u8_range(); - let res = cursor.seek(SeekFrom::Start(1000)); - let actual_error = res.expect_err("expected error").to_string(); - let expected_error = - "Request out of bounds: cur position 0 + seek Start(1000) >= length 256: 1000"; - assert_eq!(actual_error, expected_error); - } - - #[test] - fn seek_cursor_error_too_short() { - let mut cursor = get_u8_range(); - let res = cursor.seek(SeekFrom::End(-1000)); - let actual_error = res.expect_err("expected error").to_string(); - let expected_error = - "Request out of bounds: cur position 0 + seek End(-1000) < 0: -744"; - assert_eq!(actual_error, expected_error); - } -} diff 
--git a/parquet/src/util/interner.rs b/parquet/src/util/interner.rs index e64ae0179e69..e638237e06c5 100644 --- a/parquet/src/util/interner.rs +++ b/parquet/src/util/interner.rs @@ -18,7 +18,8 @@ use crate::data_type::AsBytes; use hashbrown::hash_map::RawEntryMut; use hashbrown::HashMap; -use std::hash::Hash; + +const DEFAULT_DEDUP_CAPACITY: usize = 4096; /// Storage trait for [`Interner`] pub trait Storage { @@ -34,6 +35,7 @@ pub trait Storage { } /// A generic value interner supporting various different [`Storage`] +#[derive(Debug, Default)] pub struct Interner { state: ahash::RandomState, @@ -52,14 +54,14 @@ impl Interner { pub fn new(storage: S) -> Self { Self { state: Default::default(), - dedup: Default::default(), + dedup: HashMap::with_capacity_and_hasher(DEFAULT_DEDUP_CAPACITY, ()), storage, } } /// Intern the value, returning the interned key, and if this was a new value pub fn intern(&mut self, value: &S::Value) -> S::Key { - let hash = compute_hash(&self.state, value); + let hash = self.state.hash_one(value.as_bytes()); let entry = self .dedup @@ -73,7 +75,7 @@ impl Interner { *entry .insert_with_hasher(hash, key, (), |key| { - compute_hash(&self.state, self.storage.get(*key)) + self.state.hash_one(self.storage.get(*key).as_bytes()) }) .0 } @@ -84,11 +86,9 @@ impl Interner { pub fn storage(&self) -> &S { &self.storage } -} -fn compute_hash(state: &ahash::RandomState, value: &T) -> u64 { - use std::hash::{BuildHasher, Hasher}; - let mut hasher = state.build_hasher(); - value.as_bytes().hash(&mut hasher); - hasher.finish() + /// Unwraps the inner storage + pub fn into_inner(self) -> S { + self.storage + } } diff --git a/parquet/src/util/io.rs b/parquet/src/util/io.rs index a7b5e73074c6..1fb92063e27c 100644 --- a/parquet/src/util/io.rs +++ b/parquet/src/util/io.rs @@ -18,8 +18,6 @@ use std::{cell::RefCell, cmp, fmt, io::*}; use crate::file::reader::Length; -#[allow(deprecated)] -use crate::file::writer::ParquetWriter; const DEFAULT_BUF_SIZE: usize = 8 * 1024; @@ -156,51 +154,6 @@ impl Length for FileSource { } } -/// Struct that represents `File` output stream with position tracking. -/// Used as a sink in file writer. -#[deprecated = "use TrackedWrite instead"] -#[allow(deprecated)] -pub struct FileSink { - buf: BufWriter, - // This is not necessarily position in the underlying file, - // but rather current position in the sink. - pos: u64, -} - -#[allow(deprecated)] -impl FileSink { - /// Creates new file sink. - /// Position is set to whatever position file has. - pub fn new(buf: &W) -> Self { - let mut owned_buf = buf.try_clone().unwrap(); - let pos = owned_buf.seek(SeekFrom::Current(0)).unwrap(); - Self { - buf: BufWriter::new(owned_buf), - pos, - } - } -} - -#[allow(deprecated)] -impl Write for FileSink { - fn write(&mut self, buf: &[u8]) -> Result { - let num_bytes = self.buf.write(buf)?; - self.pos += num_bytes as u64; - Ok(num_bytes) - } - - fn flush(&mut self) -> Result<()> { - self.buf.flush() - } -} - -#[allow(deprecated)] -impl Position for FileSink { - fn pos(&self) -> u64 { - self.pos - } -} - // Position implementation for Cursor to use in various tests. 
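The interner hunk above replaces a hand-rolled `build_hasher`/`hash`/`finish` sequence with a single `hash_one` call and pre-sizes the dedup map. Both shapes side by side, as a standalone sketch assuming the `ahash` crate:

```rust
use std::hash::{BuildHasher, Hash, Hasher};

fn main() {
    let state = ahash::RandomState::new();
    let value: &[u8] = b"interned bytes";

    // Old shape: build an explicit Hasher, feed it, finish it
    let mut hasher = state.build_hasher();
    value.hash(&mut hasher);
    let _old = hasher.finish();

    // New shape: one call covers the same build/hash/finish dance
    let _new = state.hash_one(value);
}
```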
impl<'a> Position for Cursor<&'a mut Vec<u8>> { fn pos(&self) -> u64 { @@ -214,7 +167,7 @@ mod tests { use std::iter; - use crate::util::test_common::get_test_file; + use crate::util::test_common::file_util::get_test_file; #[test] fn test_io_read_fully() { @@ -277,30 +230,6 @@ mod tests { assert_eq!(buf, vec![b'P', b'A', b'R', b'1']); } - #[test] - #[allow(deprecated)] - fn test_io_write_with_pos() { - let mut file = tempfile::tempfile().unwrap(); - file.write_all(&[b'a', b'b', b'c']).unwrap(); - - // Write into sink - let mut sink = FileSink::new(&file); - assert_eq!(sink.pos(), 3); - - sink.write_all(&[b'd', b'e', b'f', b'g']).unwrap(); - assert_eq!(sink.pos(), 7); - - sink.flush().unwrap(); - assert_eq!(sink.pos(), file.seek(SeekFrom::Current(0)).unwrap()); - - // Read data using file chunk - let mut res = vec![0u8; 7]; - let mut chunk = - FileSource::new(&file, 0, file.metadata().unwrap().len() as usize); - chunk.read_exact(&mut res[..]).unwrap(); - assert_eq!(res, vec![b'a', b'b', b'c', b'd', b'e', b'f', b'g']); - } - #[test] fn test_io_large_read() { // Generate repeated 'abcdef' pattern and write it into a file diff --git a/parquet/src/util/mod.rs b/parquet/src/util/mod.rs index 01ac39116dc7..d8ad739dbdb4 100644 --- a/parquet/src/util/mod.rs +++ b/parquet/src/util/mod.rs @@ -19,8 +19,7 @@ pub mod io; pub mod memory; #[macro_use] pub mod bit_util; -mod bit_packing; -pub mod cursor; +mod bit_pack; pub(crate) mod interner; pub(crate) mod page_util; #[cfg(any(test, feature = "test_common"))] diff --git a/parquet/src/util/page_util.rs b/parquet/src/util/page_util.rs index 5cdcf7535c63..7716b71167fb 100644 --- a/parquet/src/util/page_util.rs +++ b/parquet/src/util/page_util.rs @@ -25,7 +25,8 @@ use crate::file::reader::ChunkReader; /// Use column chunk's offset index to get the `page_num` page row count. pub(crate) fn calculate_row_count(indexes: &[PageLocation], page_num: usize, total_row_count: i64) -> Result<usize> { if page_num == indexes.len() - 1 { - Ok((total_row_count - indexes[page_num].first_row_index + 1) as usize) + // first_row_index starts at 0, so there is no need to add one. + Ok((total_row_count - indexes[page_num].first_row_index) as usize) } else { Ok((indexes[page_num + 1].first_row_index - indexes[page_num].first_row_index) as usize) } @@ -52,3 +53,44 @@ pub(crate) fn get_pages_readable_slices>(col } Ok((page_readers, has_dictionary_page)) } + +#[cfg(test)] +mod tests { + use super::*; + + /** + parquet-tools meta ./test.parquet got: + + file schema: test_schema + -------------------------------------------------------------------------------- + leaf: REQUIRED INT64 R:0 D: + + row group 1: RC:256 TS:2216 OFFSET:4 + -------------------------------------------------------------------------------- + leaf: INT64 UNCOMPRESSED DO:0 FPO:4 SZ:2216/2216/1.00 VC:256 ENC:PLAIN,RLE ST:[min: 0, max: 255, num_nulls not defined + + parquet-tools column-index -c leaf ./test.parquet got: + + offset index for column leaf: + offset compressed size first row index + page-0 4 554 0 + page-1 558 554 64 + page-2 1112 554 128 + page-3 1666 554 192 + + **/ + #[test] + fn test_calculate_row_count() { + let total_row_count = 256; + let mut indexes = vec![]; + indexes.push(PageLocation::new(4, 554, 0)); + indexes.push(PageLocation::new(558, 554, 64)); + indexes.push(PageLocation::new(1112, 554, 128)); + indexes.push(PageLocation::new(1666, 554, 192)); + for i in 0..4 { + // each page should have 64 rows.
+ assert_eq!(64, calculate_row_count(indexes.as_slice(), i, total_row_count).unwrap()); + } + + } +} diff --git a/parquet/src/util/test_common/mod.rs b/parquet/src/util/test_common/mod.rs index f0beb16ca954..504219ecae19 100644 --- a/parquet/src/util/test_common/mod.rs +++ b/parquet/src/util/test_common/mod.rs @@ -15,17 +15,10 @@ // specific language governing permissions and limitations // under the License. -pub mod file_util; pub mod page_util; -pub mod rand_gen; - -pub use self::rand_gen::random_bools; -pub use self::rand_gen::random_bytes; -pub use self::rand_gen::random_numbers; -pub use self::rand_gen::random_numbers_range; -pub use self::rand_gen::RandGen; -pub use self::file_util::get_test_file; -pub use self::file_util::get_test_path; +#[cfg(test)] +pub mod file_util; -pub use self::page_util::make_pages; +#[cfg(test)] +pub mod rand_gen; \ No newline at end of file diff --git a/parquet/src/util/test_common/page_util.rs b/parquet/src/util/test_common/page_util.rs index f56eaf85e636..243fb6f8b897 100644 --- a/parquet/src/util/test_common/page_util.rs +++ b/parquet/src/util/test_common/page_util.rs @@ -16,18 +16,15 @@ // under the License. use crate::basic::Encoding; -use crate::column::page::{PageMetadata, PageReader}; use crate::column::page::{Page, PageIterator}; +use crate::column::page::{PageMetadata, PageReader}; use crate::data_type::DataType; -use crate::encodings::encoding::{get_encoder, DictEncoder, Encoder}; -use crate::encodings::levels::max_buffer_size; +use crate::encodings::encoding::{get_encoder, Encoder}; use crate::encodings::levels::LevelEncoder; use crate::errors::Result; use crate::schema::types::{ColumnDescPtr, SchemaDescPtr}; use crate::util::memory::ByteBufferPtr; -use crate::util::test_common::random_numbers_range; -use rand::distributions::uniform::SampleUniform; -use std::collections::VecDeque; +use std::iter::Peekable; use std::mem; pub trait DataPageBuilder { @@ -45,7 +42,6 @@ pub trait DataPageBuilder { /// - consume() /// in order to populate and obtain a data page. pub struct DataPageBuilderImpl { - desc: ColumnDescPtr, encoding: Option, num_values: u32, buffer: Vec, @@ -58,9 +54,8 @@ impl DataPageBuilderImpl { // `num_values` is the number of non-null values to put in the data page. // `datapage_v2` flag is used to indicate if the generated data page should use V2 // format or not. 
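A worked check of the `calculate_row_count` fix from the hunk above, using the same offset index as its test (256 total rows, last page starting at row 192): the old `+ 1` formula yielded 65 rows for the last page, the corrected one yields 64.

```rust
// Hypothetical standalone version of the last-page branch above
fn last_page_row_count(total_row_count: i64, first_row_index: i64) -> usize {
    // first_row_index is 0-based, so the count is a plain difference
    (total_row_count - first_row_index) as usize
}

fn main() {
    assert_eq!(last_page_row_count(256, 192), 64); // was 65 before the fix
}
```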
- pub fn new(desc: ColumnDescPtr, num_values: u32, datapage_v2: bool) -> Self { + pub fn new(_desc: ColumnDescPtr, num_values: u32, datapage_v2: bool) -> Self { DataPageBuilderImpl { - desc, encoding: None, num_values, buffer: vec![], @@ -75,10 +70,9 @@ impl DataPageBuilderImpl { if max_level <= 0 { return 0; } - let size = max_buffer_size(Encoding::RLE, max_level, levels.len()); - let mut level_encoder = LevelEncoder::v1(Encoding::RLE, max_level, vec![0; size]); - level_encoder.put(levels).expect("put() should be OK"); - let encoded_levels = level_encoder.consume().expect("consume() should be OK"); + let mut level_encoder = LevelEncoder::v1(Encoding::RLE, max_level, levels.len()); + level_encoder.put(levels); + let encoded_levels = level_encoder.consume(); // Actual encoded bytes (without length offset) let encoded_bytes = &encoded_levels[mem::size_of::<u32>()..]; if self.datapage_v2 { @@ -113,8 +107,7 @@ impl DataPageBuilder for DataPageBuilderImpl { ); self.encoding = Some(encoding); let mut encoder: Box<dyn Encoder<T>> = - get_encoder::<T>(self.desc.clone(), encoding) - .expect("get_encoder() should be OK"); + get_encoder::<T>(encoding).expect("get_encoder() should be OK"); encoder.put(values).expect("put() should be OK"); let encoded_values = encoder .flush_buffer() @@ -135,8 +128,8 @@ impl DataPageBuilder for DataPageBuilderImpl { encoding: self.encoding.unwrap(), num_nulls: 0, /* set to dummy value - don't need this when reading * data page */ - num_rows: self.num_values, /* also don't need this when reading - * data page */ + num_rows: self.num_values, /* num_rows is only needed by skip_records; skipping REPEATED fields is not yet supported, + * so we can assume num_values == num_rows */ def_levels_byte_len: self.def_levels_byte_len, rep_levels_byte_len: self.rep_levels_byte_len, is_compressed: false, @@ -157,13 +150,13 @@ impl DataPageBuilder for DataPageBuilderImpl { /// A utility page reader which stores pages in memory. pub struct InMemoryPageReader<P: Iterator<Item = Page>> { - page_iter: P, + page_iter: Peekable<P>, } impl<P: Iterator<Item = Page>> InMemoryPageReader<P> { pub fn new(pages: impl IntoIterator<Item = Page, IntoIter = P>) -> Self { Self { - page_iter: pages.into_iter(), + page_iter: pages.into_iter().peekable(), } } } @@ -174,11 +167,29 @@ impl<P: Iterator<Item = Page> + Send> PageReader for InMemoryPageReader<P> { } fn peek_next_page(&mut self) -> Result<Option<PageMetadata>> { - unimplemented!() + if let Some(x) = self.page_iter.peek() { + match x { + Page::DataPage { num_values, .. } => Ok(Some(PageMetadata { + num_rows: *num_values as usize, + is_dict: false, + })), + Page::DataPageV2 { num_rows, .. } => Ok(Some(PageMetadata { + num_rows: *num_rows as usize, + is_dict: false, + })), + Page::DictionaryPage { .. } => Ok(Some(PageMetadata { + num_rows: 0, + is_dict: true, + })), + } + } else { + Ok(None) + } } fn skip_next_page(&mut self) -> Result<()> { - unimplemented!() + self.page_iter.next(); + Ok(()) } } @@ -231,88 +242,3 @@ impl<I: Iterator<Item = Vec<Page>> + Send> PageIterator for InMemoryPageIterator<I> { Ok(self.column_desc.clone()) } } - -pub fn make_pages<T: DataType>( - desc: ColumnDescPtr, - encoding: Encoding, - num_pages: usize, - levels_per_page: usize, - min: T::T, - max: T::T, - def_levels: &mut Vec<i16>, - rep_levels: &mut Vec<i16>, - values: &mut Vec<T::T>, - pages: &mut VecDeque<Page>, - use_v2: bool, -) where - T::T: PartialOrd + SampleUniform + Copy, -{ - let mut num_values = 0; - let max_def_level = desc.max_def_level(); - let max_rep_level = desc.max_rep_level(); - - let mut dict_encoder = DictEncoder::<T>::new(desc.clone()); - - for i in 0..num_pages { - let mut num_values_cur_page = 0; - let level_range = i * levels_per_page..(i + 1) * levels_per_page; - - if max_def_level > 0 { - random_numbers_range(levels_per_page, 0, max_def_level + 1, def_levels); - for dl in &def_levels[level_range.clone()] { - if *dl == max_def_level { - num_values_cur_page += 1; - } - } - } else { - num_values_cur_page = levels_per_page; - } - if max_rep_level > 0 { - random_numbers_range(levels_per_page, 0, max_rep_level + 1, rep_levels); - } - random_numbers_range(num_values_cur_page, min, max, values); - - // Generate the current page - - let mut pb = - DataPageBuilderImpl::new(desc.clone(), num_values_cur_page as u32, use_v2); - if max_rep_level > 0 { - pb.add_rep_levels(max_rep_level, &rep_levels[level_range.clone()]); - } - if max_def_level > 0 { - pb.add_def_levels(max_def_level, &def_levels[level_range]); - } - - let value_range = num_values..num_values + num_values_cur_page; - match encoding { - Encoding::PLAIN_DICTIONARY | Encoding::RLE_DICTIONARY => { - let _ = dict_encoder.put(&values[value_range.clone()]); - let indices = dict_encoder - .write_indices() - .expect("write_indices() should be OK"); - pb.add_indices(indices); - } - Encoding::PLAIN => { - pb.add_values::<T>(encoding, &values[value_range]); - } - enc => panic!("Unexpected encoding {}", enc), - } - - let data_page = pb.consume(); - pages.push_back(data_page); - num_values += num_values_cur_page; - } - - if encoding == Encoding::PLAIN_DICTIONARY || encoding == Encoding::RLE_DICTIONARY { - let dict = dict_encoder - .write_dict() - .expect("write_dict() should be OK"); - let dict_page = Page::DictionaryPage { - buf: dict, - num_values: dict_encoder.num_entries() as u32, - encoding: Encoding::RLE_DICTIONARY, - is_sorted: false, - }; - pages.push_front(dict_page); - } -} diff --git a/parquet/src/util/test_common/rand_gen.rs index d9c256577684..4e54aa7999cf 100644 --- a/parquet/src/util/test_common/rand_gen.rs +++ b/parquet/src/util/test_common/rand_gen.rs @@ -15,13 +15,19 @@ // specific language governing permissions and limitations // under the License.
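`peek_next_page` and `skip_next_page` above are implemented by wrapping the page iterator in `std::iter::Peekable`; a minimal model of that look-ahead pattern:

```rust
fn main() {
    let mut pages = vec!["dict", "data-1", "data-2"].into_iter().peekable();

    assert_eq!(pages.peek(), Some(&"dict")); // look ahead without consuming
    pages.next();                            // skip_next_page == next() + discard
    assert_eq!(pages.peek(), Some(&"data-1"));
}
```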
diff --git a/parquet/src/util/test_common/rand_gen.rs b/parquet/src/util/test_common/rand_gen.rs
index d9c256577684..4e54aa7999cf 100644
--- a/parquet/src/util/test_common/rand_gen.rs
+++ b/parquet/src/util/test_common/rand_gen.rs
@@ -15,13 +15,19 @@
 // specific language governing permissions and limitations
 // under the License.

+use crate::basic::Encoding;
+use crate::column::page::Page;
 use rand::{
     distributions::{uniform::SampleUniform, Distribution, Standard},
     thread_rng, Rng,
 };
+use std::collections::VecDeque;

 use crate::data_type::*;
+use crate::encodings::encoding::{DictEncoder, Encoder};
+use crate::schema::types::ColumnDescPtr;
 use crate::util::memory::ByteBufferPtr;
+use crate::util::{DataPageBuilder, DataPageBuilderImpl};

 /// Random generator of data type `T` values and sequences.
 pub trait RandGen<T: DataType> {
@@ -106,15 +112,6 @@ pub fn random_bytes(n: usize) -> Vec<u8> {
     result
 }

-pub fn random_bools(n: usize) -> Vec<bool> {
-    let mut result = vec![];
-    let mut rng = thread_rng();
-    for _ in 0..n {
-        result.push(rng.gen::<bool>());
-    }
-    result
-}
-
 pub fn random_numbers<T>(n: usize) -> Vec<T>
 where
     Standard: Distribution<T>,
@@ -132,3 +129,89 @@ where
         result.push(rng.gen_range(low..high));
     }
 }
+
+#[allow(clippy::too_many_arguments)]
+pub fn make_pages<T: DataType>(
+    desc: ColumnDescPtr,
+    encoding: Encoding,
+    num_pages: usize,
+    levels_per_page: usize,
+    min: T::T,
+    max: T::T,
+    def_levels: &mut Vec<i16>,
+    rep_levels: &mut Vec<i16>,
+    values: &mut Vec<T::T>,
+    pages: &mut VecDeque<Page>,
+    use_v2: bool,
+) where
+    T::T: PartialOrd + SampleUniform + Copy,
+{
+    let mut num_values = 0;
+    let max_def_level = desc.max_def_level();
+    let max_rep_level = desc.max_rep_level();
+
+    let mut dict_encoder = DictEncoder::<T>::new(desc.clone());
+
+    for i in 0..num_pages {
+        let mut num_values_cur_page = 0;
+        let level_range = i * levels_per_page..(i + 1) * levels_per_page;
+
+        if max_def_level > 0 {
+            random_numbers_range(levels_per_page, 0, max_def_level + 1, def_levels);
+            for dl in &def_levels[level_range.clone()] {
+                if *dl == max_def_level {
+                    num_values_cur_page += 1;
+                }
+            }
+        } else {
+            num_values_cur_page = levels_per_page;
+        }
+        if max_rep_level > 0 {
+            random_numbers_range(levels_per_page, 0, max_rep_level + 1, rep_levels);
+        }
+        random_numbers_range(num_values_cur_page, min, max, values);
+
+        // Generate the current page
+
+        let mut pb =
+            DataPageBuilderImpl::new(desc.clone(), num_values_cur_page as u32, use_v2);
+        if max_rep_level > 0 {
+            pb.add_rep_levels(max_rep_level, &rep_levels[level_range.clone()]);
+        }
+        if max_def_level > 0 {
+            pb.add_def_levels(max_def_level, &def_levels[level_range]);
+        }
+
+        let value_range = num_values..num_values + num_values_cur_page;
+        match encoding {
+            Encoding::PLAIN_DICTIONARY | Encoding::RLE_DICTIONARY => {
+                let _ = dict_encoder.put(&values[value_range.clone()]);
+                let indices = dict_encoder
+                    .write_indices()
+                    .expect("write_indices() should be OK");
+                pb.add_indices(indices);
+            }
+            Encoding::PLAIN => {
+                pb.add_values::<T>(encoding, &values[value_range]);
+            }
+            enc => panic!("Unexpected encoding {}", enc),
+        }
+
+        let data_page = pb.consume();
+        pages.push_back(data_page);
+        num_values += num_values_cur_page;
+    }
+
+    if encoding == Encoding::PLAIN_DICTIONARY || encoding == Encoding::RLE_DICTIONARY {
+        let dict = dict_encoder
+            .write_dict()
+            .expect("write_dict() should be OK");
+        let dict_page = Page::DictionaryPage {
+            buf: dict,
+            num_values: dict_encoder.num_entries() as u32,
+            encoding: Encoding::RLE_DICTIONARY,
+            is_sorted: false,
+        };
+        pages.push_front(dict_page);
+    }
+}
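`make_pages` now lives beside the other random generators. A sketch of how an in-crate test might call it after the move, using the paths from this hunk; the helper below and its concrete arguments are illustrative only, the `ColumnDescPtr` construction is left to the caller, and the module is only visible in test builds or with the `test_common` feature enabled:

```rust
use std::collections::VecDeque;

use crate::basic::Encoding;
use crate::column::page::Page;
use crate::data_type::Int32Type;
use crate::schema::types::ColumnDescPtr;
use crate::util::test_common::rand_gen::make_pages;

// Illustrative arguments: two PLAIN-encoded v1 pages of 100 levels each,
// with values drawn uniformly from 0..10.
fn int32_test_pages(desc: ColumnDescPtr) -> VecDeque<Page> {
    let mut def_levels = Vec::new();
    let mut rep_levels = Vec::new();
    let mut values = Vec::new();
    let mut pages = VecDeque::new();
    make_pages::<Int32Type>(
        desc,
        Encoding::PLAIN,
        2,   // num_pages
        100, // levels_per_page
        0,   // min
        10,  // max
        &mut def_levels,
        &mut rep_levels,
        &mut values,
        &mut pages,
        false, // use_v2
    );
    pages
}
```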
diff --git a/parquet_derive/Cargo.toml b/parquet_derive/Cargo.toml
index 16e19df57af6..3f586de6928a 100644
--- a/parquet_derive/Cargo.toml
+++ b/parquet_derive/Cargo.toml
@@ -17,7 +17,7 @@

 [package]
 name = "parquet_derive"
-version = "19.0.0"
+version = "20.0.0"
 license = "Apache-2.0"
 description = "Derive macros for the Rust implementation of Apache Parquet"
 homepage = "https://github.com/apache/arrow-rs"
@@ -35,4 +35,4 @@ proc-macro = true
 proc-macro2 = { version = "1.0", default-features = false }
 quote = { version = "1.0", default-features = false }
 syn = { version = "1.0", default-features = false }
-parquet = { path = "../parquet", version = "19.0.0" }
+parquet = { path = "../parquet", version = "20.0.0" }
diff --git a/parquet_derive/README.md b/parquet_derive/README.md
index 9f35c064a776..30d5e339f26c 100644
--- a/parquet_derive/README.md
+++ b/parquet_derive/README.md
@@ -32,8 +32,8 @@ Add this to your Cargo.toml:

 ```toml
 [dependencies]
-parquet = "19.0.0"
-parquet_derive = "19.0.0"
+parquet = "20.0.0"
+parquet_derive = "20.0.0"
 ```

 and this to your crate root:
diff --git a/parquet_derive_test/Cargo.toml b/parquet_derive_test/Cargo.toml
index d03ea2359840..bf3e78b247ec 100644
--- a/parquet_derive_test/Cargo.toml
+++ b/parquet_derive_test/Cargo.toml
@@ -17,7 +17,7 @@

 [package]
 name = "parquet_derive_test"
-version = "19.0.0"
+version = "20.0.0"
 license = "Apache-2.0"
 description = "Integration test package for parquet-derive"
 homepage = "https://github.com/apache/arrow-rs"
@@ -29,6 +29,6 @@ publish = false
 rust-version = "1.62"

 [dependencies]
-parquet = { path = "../parquet", version = "19.0.0", default-features = false }
-parquet_derive = { path = "../parquet_derive", version = "19.0.0", default-features = false }
+parquet = { path = "../parquet", version = "20.0.0", default-features = false }
+parquet_derive = { path = "../parquet_derive", version = "20.0.0", default-features = false }
 chrono = { version="0.4.19", default-features = false, features = [ "clock" ] }
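The three `19.0.0` to `20.0.0` bumps keep `parquet`, `parquet_derive`, and the integration-test crate in lockstep for the release. For orientation, a minimal sketch of the derive macro these crates ship, following the pattern in the README above (the struct and field names are invented for illustration):

```rust
use parquet_derive::ParquetRecordWriter;

// Deriving ParquetRecordWriter generates a RecordWriter implementation
// for &[Sample], so a slice of these structs can be written as a row group.
#[derive(ParquetRecordWriter)]
struct Sample {
    id: i64,
    name: String,
    score: f64,
}
```

Per the crate's README, the derived `RecordWriter` impl is what lets a slice of records be written into a Parquet row group.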