diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 666d8844a80..e27361ab263 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -22,13 +22,13 @@ on: default: nightly concurrency: - group: ${{ github.workflow }}-${{ github.ref }} + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }} cancel-in-progress: true jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -57,7 +57,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.12 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -69,9 +69,10 @@ jobs: sha: ${{ inputs.sha }} wheel-build-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} + build-2_28-wheels: "true" branch: ${{ inputs.branch }} sha: ${{ inputs.sha }} date: ${{ inputs.date }} @@ -79,7 +80,7 @@ jobs: wheel-publish-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -89,7 +90,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-publish-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.12 with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1"))) build_type: ${{ inputs.build_type || 'branch' }} @@ -100,7 +101,7 @@ jobs: wheel-publish-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-23.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml index 23956a02fbd..31e78f82a62 100644 --- a/.github/workflows/labeler.yml +++ b/.github/workflows/labeler.yml @@ -6,6 +6,6 @@ jobs: triage: runs-on: ubuntu-latest steps: - - uses: actions/labeler@main + - uses: actions/labeler@v4 with: repo-token: "${{ secrets.GITHUB_TOKEN }}" 
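
The two workflow tweaks above are small but behavior-relevant: adding `github.event_name` to the concurrency group keeps runs triggered by different events (for example, a branch push and a nightly `workflow_dispatch` on the same ref) in separate cancellation groups, and pinning `actions/labeler` to the `v4` tag avoids picking up untested changes from the floating `main` ref. A minimal sketch of how the new concurrency key behaves, reusing the same expressions as in the hunk above:

```yaml
# Sketch only. The group string now differs per trigger, so a push-triggered
# build and a dispatched nightly on branch-23.12 land in separate groups:
#   "build-refs/heads/branch-23.12-push"
#   "build-refs/heads/branch-23.12-workflow_dispatch"
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }}
  cancel-in-progress: true
```

With `cancel-in-progress: true`, only earlier runs in the same group are cancelled, so a scheduled nightly no longer cancels an in-flight merge build on the same branch.
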
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index abf5fcf2f33..40cf0dcd2c1 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -30,34 +30,34 @@ jobs: #- pandas-tests-diff #- pandas-tests-diff-comment secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-23.12 checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-23.12 with: enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.12 with: build_type: pull-request conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.12 with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-23.12 with: build_type: pull-request conda-python-cudf-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 with: build_type: pull-request test_script: "ci/test_python_cudf.sh" @@ -65,14 +65,14 @@ jobs: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 with: build_type: pull-request test_script: "ci/test_python_other.sh" conda-java-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.12 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -82,7 +82,7 @@ jobs: conda-notebook-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.12 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -92,7 +92,7 @@ jobs: docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.12 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -102,21 +102,22 @@ jobs: wheel-build-cudf: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.12 with: build_type: pull-request + build-2_28-wheels: "true" script: "ci/build_wheel_cudf.sh" wheel-tests-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.10 + uses: 
rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.12 with: build_type: pull-request script: ci/test_wheel_cudf.sh wheel-build-dask-cudf: needs: wheel-tests-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.12 with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1"))) build_type: pull-request @@ -124,7 +125,7 @@ jobs: wheel-tests-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.12 with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1"))) build_type: pull-request @@ -132,7 +133,7 @@ jobs: unit-tests-cudf-pandas: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.12 with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1"))) build_type: pull-request @@ -141,7 +142,7 @@ jobs: # run the Pandas unit tests using PR branch needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.12 with: matrix_filter: map(select(.ARCH == "amd64")) | max_by(.CUDA_VER) | [.] build_type: pull-request diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 49a9c73d026..0d4401160e1 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -24,7 +24,7 @@ jobs: sha: ${{ inputs.sha }} conda-cpp-memcheck-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -36,7 +36,7 @@ jobs: run_script: "ci/test_cpp_memcheck.sh" conda-python-cudf-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: conda-python-other-tests: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -55,7 +55,7 @@ jobs: test_script: "ci/test_python_other.sh" conda-java-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -67,7 +67,7 @@ jobs: run_script: 
"ci/test_java.sh" conda-notebook-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -79,7 +79,7 @@ jobs: run_script: "ci/test_notebooks.sh" wheel-tests-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -88,7 +88,7 @@ jobs: script: ci/test_wheel_cudf.sh wheel-tests-dask-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.12 with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1"))) build_type: nightly @@ -97,9 +97,8 @@ jobs: sha: ${{ inputs.sha }} script: ci/test_wheel_dask_cudf.sh unit-tests-cudf-pandas: - needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -109,7 +108,7 @@ jobs: pandas-tests: # run the Pandas unit tests secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.12 with: matrix_filter: map(select(.ARCH == "amd64")) | max_by(.CUDA_VER) | [.] build_type: nightly diff --git a/CHANGELOG.md b/CHANGELOG.md index ecd547ab5b3..3cb6caa25ee 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,140 @@ +# cuDF 23.12.00 (6 Dec 2023) + +## 🚨 Breaking Changes + +- Raise error in `reindex` when `index` is not unique ([#14400](https://github.com/rapidsai/cudf/pull/14400)) [@galipremsagar](https://github.com/galipremsagar) +- Expose stream parameter to get_json_object API ([#14297](https://github.com/rapidsai/cudf/pull/14297)) [@davidwendt](https://github.com/davidwendt) +- Refactor cudf_kafka to use skbuild ([#14292](https://github.com/rapidsai/cudf/pull/14292)) [@jdye64](https://github.com/jdye64) +- Expose stream parameter in public strings convert APIs ([#14255](https://github.com/rapidsai/cudf/pull/14255)) [@davidwendt](https://github.com/davidwendt) +- Upgrade to nvCOMP 3.0.4 ([#13815](https://github.com/rapidsai/cudf/pull/13815)) [@vuule](https://github.com/vuule) + +## 🐛 Bug Fixes + +- Update actions/labeler to v4 ([#14562](https://github.com/rapidsai/cudf/pull/14562)) [@raydouglass](https://github.com/raydouglass) +- Fix data corruption when skipping rows ([#14557](https://github.com/rapidsai/cudf/pull/14557)) [@etseidl](https://github.com/etseidl) +- Fix function name typo in `cudf.pandas` profiler ([#14514](https://github.com/rapidsai/cudf/pull/14514)) [@galipremsagar](https://github.com/galipremsagar) +- Fix intermediate type checking in expression parsing ([#14445](https://github.com/rapidsai/cudf/pull/14445)) [@vyasr](https://github.com/vyasr) +- Forward merge `branch-23.10` into `branch-23.12` ([#14435](https://github.com/rapidsai/cudf/pull/14435)) [@raydouglass](https://github.com/raydouglass) +- Remove needs: 
wheel-build-cudf. ([#14427](https://github.com/rapidsai/cudf/pull/14427)) [@bdice](https://github.com/bdice) +- Fix dask dependency in custreamz ([#14420](https://github.com/rapidsai/cudf/pull/14420)) [@vyasr](https://github.com/vyasr) +- Ensure nvbench initializes nvml context when built statically ([#14411](https://github.com/rapidsai/cudf/pull/14411)) [@robertmaynard](https://github.com/robertmaynard) +- Support java AST String literal with desired encoding ([#14402](https://github.com/rapidsai/cudf/pull/14402)) [@winningsix](https://github.com/winningsix) +- Raise error in `reindex` when `index` is not unique ([#14400](https://github.com/rapidsai/cudf/pull/14400)) [@galipremsagar](https://github.com/galipremsagar) +- Always build nvbench statically so we don't need to package it ([#14399](https://github.com/rapidsai/cudf/pull/14399)) [@robertmaynard](https://github.com/robertmaynard) +- Fix token-count logic in nvtext::tokenize_with_vocabulary ([#14393](https://github.com/rapidsai/cudf/pull/14393)) [@davidwendt](https://github.com/davidwendt) +- Fix as_column(pd.Timestamp/Timedelta, length=) not respecting length ([#14390](https://github.com/rapidsai/cudf/pull/14390)) [@mroeschke](https://github.com/mroeschke) +- cudf.pandas: cuDF subpath checking in module `__getattr__` ([#14388](https://github.com/rapidsai/cudf/pull/14388)) [@shwina](https://github.com/shwina) +- Fix and disable encoding for nanosecond statistics in ORC writer ([#14367](https://github.com/rapidsai/cudf/pull/14367)) [@vuule](https://github.com/vuule) +- Add the new manylinux builds to the build job ([#14351](https://github.com/rapidsai/cudf/pull/14351)) [@vyasr](https://github.com/vyasr) +- cudf jit parser now supports .pragma instructions with quotes ([#14348](https://github.com/rapidsai/cudf/pull/14348)) [@robertmaynard](https://github.com/robertmaynard) +- Fix overflow check in `cudf::merge` ([#14345](https://github.com/rapidsai/cudf/pull/14345)) [@divyegala](https://github.com/divyegala) +- Add cramjam ([#14344](https://github.com/rapidsai/cudf/pull/14344)) [@vyasr](https://github.com/vyasr) +- Enable `dask_cudf/io` pytests in CI ([#14338](https://github.com/rapidsai/cudf/pull/14338)) [@galipremsagar](https://github.com/galipremsagar) +- Temporarily avoid the current build of pydata-sphinx-theme ([#14332](https://github.com/rapidsai/cudf/pull/14332)) [@vyasr](https://github.com/vyasr) +- Fix host buffer access from device function in the Parquet reader ([#14328](https://github.com/rapidsai/cudf/pull/14328)) [@vuule](https://github.com/vuule) +- Run IO tests for Dask-cuDF ([#14327](https://github.com/rapidsai/cudf/pull/14327)) [@rjzamora](https://github.com/rjzamora) +- Fix logical type issues in the Parquet writer ([#14322](https://github.com/rapidsai/cudf/pull/14322)) [@vuule](https://github.com/vuule) +- Remove aws-sdk-pinning and revert to arrow 12.0.1 ([#14319](https://github.com/rapidsai/cudf/pull/14319)) [@vyasr](https://github.com/vyasr) +- test is_valid before reading column data ([#14318](https://github.com/rapidsai/cudf/pull/14318)) [@etseidl](https://github.com/etseidl) +- Fix gtest validity setting for TextTokenizeTest.Vocabulary 
([#14312](https://github.com/rapidsai/cudf/pull/14312)) [@davidwendt](https://github.com/davidwendt) +- Fixes stack context for json lines format that recovers from invalid JSON lines ([#14309](https://github.com/rapidsai/cudf/pull/14309)) [@elstehle](https://github.com/elstehle) +- Downgrade to Arrow 12.0.0 for aws-sdk-cpp and fix cudf_kafka builds for new CI containers ([#14296](https://github.com/rapidsai/cudf/pull/14296)) [@vyasr](https://github.com/vyasr) +- fixing thread index overflow issue ([#14290](https://github.com/rapidsai/cudf/pull/14290)) [@hyperbolic2346](https://github.com/hyperbolic2346) +- Fix memset error in nvtext::edit_distance_matrix ([#14283](https://github.com/rapidsai/cudf/pull/14283)) [@davidwendt](https://github.com/davidwendt) +- Changes JSON reader's recovery option's behaviour to ignore all characters after a valid JSON record ([#14279](https://github.com/rapidsai/cudf/pull/14279)) [@elstehle](https://github.com/elstehle) +- Handle empty string correctly in Parquet statistics ([#14257](https://github.com/rapidsai/cudf/pull/14257)) [@etseidl](https://github.com/etseidl) +- Fixes behaviour for incomplete lines when `recover_with_nulls` is enabled ([#14252](https://github.com/rapidsai/cudf/pull/14252)) [@elstehle](https://github.com/elstehle) +- cudf::detail::pinned_allocator doesn't throw from `deallocate` ([#14251](https://github.com/rapidsai/cudf/pull/14251)) [@robertmaynard](https://github.com/robertmaynard) +- Fix strings replace for adjacent, identical multi-byte UTF-8 character targets ([#14235](https://github.com/rapidsai/cudf/pull/14235)) [@davidwendt](https://github.com/davidwendt) +- Fix the precision when converting a decimal128 column to an arrow array ([#14230](https://github.com/rapidsai/cudf/pull/14230)) [@jihoonson](https://github.com/jihoonson) +- Fixing parquet list of struct interpretation ([#13715](https://github.com/rapidsai/cudf/pull/13715)) [@hyperbolic2346](https://github.com/hyperbolic2346) + +## 📖 Documentation + +- Fix io reference in docs. 
([#14452](https://github.com/rapidsai/cudf/pull/14452)) [@bdice](https://github.com/bdice) +- Update README ([#14374](https://github.com/rapidsai/cudf/pull/14374)) [@shwina](https://github.com/shwina) +- Example code for blog on new row comparators ([#13795](https://github.com/rapidsai/cudf/pull/13795)) [@divyegala](https://github.com/divyegala) + +## 🚀 New Features + +- Expose streams in public unary APIs ([#14342](https://github.com/rapidsai/cudf/pull/14342)) [@vyasr](https://github.com/vyasr) +- Add python tests for Parquet DELTA_BINARY_PACKED encoder ([#14316](https://github.com/rapidsai/cudf/pull/14316)) [@etseidl](https://github.com/etseidl) +- Update rapids-cmake functions to non-deprecated signatures ([#14265](https://github.com/rapidsai/cudf/pull/14265)) [@robertmaynard](https://github.com/robertmaynard) +- Expose streams in public null mask APIs ([#14263](https://github.com/rapidsai/cudf/pull/14263)) [@vyasr](https://github.com/vyasr) +- Expose streams in binaryop APIs ([#14187](https://github.com/rapidsai/cudf/pull/14187)) [@vyasr](https://github.com/vyasr) +- Add pylibcudf.Scalar that interoperates with Arrow scalars ([#14133](https://github.com/rapidsai/cudf/pull/14133)) [@vyasr](https://github.com/vyasr) +- Add decoder for DELTA_BYTE_ARRAY to Parquet reader ([#14101](https://github.com/rapidsai/cudf/pull/14101)) [@etseidl](https://github.com/etseidl) +- Add DELTA_BINARY_PACKED encoder for Parquet writer ([#14100](https://github.com/rapidsai/cudf/pull/14100)) [@etseidl](https://github.com/etseidl) +- Add BytePairEncoder class to cuDF ([#13891](https://github.com/rapidsai/cudf/pull/13891)) [@davidwendt](https://github.com/davidwendt) +- Upgrade to nvCOMP 3.0.4 ([#13815](https://github.com/rapidsai/cudf/pull/13815)) [@vuule](https://github.com/vuule) +- Use `pynvjitlink` for CUDA 12+ MVC ([#13650](https://github.com/rapidsai/cudf/pull/13650)) [@brandon-b-miller](https://github.com/brandon-b-miller) + +## 🛠️ Improvements + +- Build concurrency for nightly and merge triggers ([#14441](https://github.com/rapidsai/cudf/pull/14441)) [@bdice](https://github.com/bdice) +- Cleanup remaining usages of dask dependencies ([#14407](https://github.com/rapidsai/cudf/pull/14407)) [@galipremsagar](https://github.com/galipremsagar) +- Update to Arrow 14.0.1. 
([#14387](https://github.com/rapidsai/cudf/pull/14387)) [@bdice](https://github.com/bdice) +- Remove Cython libcpp wrappers ([#14382](https://github.com/rapidsai/cudf/pull/14382)) [@vyasr](https://github.com/vyasr) +- Forward-merge branch-23.10 to branch-23.12 ([#14372](https://github.com/rapidsai/cudf/pull/14372)) [@bdice](https://github.com/bdice) +- Upgrade to arrow 14 ([#14371](https://github.com/rapidsai/cudf/pull/14371)) [@galipremsagar](https://github.com/galipremsagar) +- Fix a pytest typo in `test_kurt_skew_error` ([#14368](https://github.com/rapidsai/cudf/pull/14368)) [@galipremsagar](https://github.com/galipremsagar) +- Use new rapids-dask-dependency metapackage for managing dask versions ([#14364](https://github.com/rapidsai/cudf/pull/14364)) [@vyasr](https://github.com/vyasr) +- Change `nullable()` to `has_nulls()` in `cudf::detail::gather` ([#14363](https://github.com/rapidsai/cudf/pull/14363)) [@divyegala](https://github.com/divyegala) +- Split up scan_inclusive.cu to improve its compile time ([#14358](https://github.com/rapidsai/cudf/pull/14358)) [@davidwendt](https://github.com/davidwendt) +- Implement user_datasource_wrapper is_empty() and is_device_read_preferred(). ([#14357](https://github.com/rapidsai/cudf/pull/14357)) [@tpn](https://github.com/tpn) +- Added streams to CSV reader and writer api ([#14340](https://github.com/rapidsai/cudf/pull/14340)) [@shrshi](https://github.com/shrshi) +- Upgrade wheels to use arrow 13 ([#14339](https://github.com/rapidsai/cudf/pull/14339)) [@vyasr](https://github.com/vyasr) +- Rework nvtext::byte_pair_encoding API ([#14337](https://github.com/rapidsai/cudf/pull/14337)) [@davidwendt](https://github.com/davidwendt) +- Improve performance of nvtext::tokenize_with_vocabulary for long strings ([#14336](https://github.com/rapidsai/cudf/pull/14336)) [@davidwendt](https://github.com/davidwendt) +- Upgrade `arrow` to `13` ([#14330](https://github.com/rapidsai/cudf/pull/14330)) [@galipremsagar](https://github.com/galipremsagar) +- Expose stream parameter in public nvtext replace APIs ([#14329](https://github.com/rapidsai/cudf/pull/14329)) [@davidwendt](https://github.com/davidwendt) +- Drop `pyorc` dependency and use `pandas`/`pyarrow` instead ([#14323](https://github.com/rapidsai/cudf/pull/14323)) [@galipremsagar](https://github.com/galipremsagar) +- Avoid `pyarrow.fs` import for local storage ([#14321](https://github.com/rapidsai/cudf/pull/14321)) [@rjzamora](https://github.com/rjzamora) +- Unpin `dask` and `distributed` for `23.12` development ([#14320](https://github.com/rapidsai/cudf/pull/14320)) [@galipremsagar](https://github.com/galipremsagar) +- Expose stream parameter in public nvtext tokenize APIs ([#14317](https://github.com/rapidsai/cudf/pull/14317)) [@davidwendt](https://github.com/davidwendt) +- Added streams to JSON reader and writer api ([#14313](https://github.com/rapidsai/cudf/pull/14313)) [@shrshi](https://github.com/shrshi) +- Minor improvements in `source_info` ([#14308](https://github.com/rapidsai/cudf/pull/14308)) [@vuule](https://github.com/vuule) +- Forward-merge branch-23.10 to branch-23.12 ([#14307](https://github.com/rapidsai/cudf/pull/14307)) 
[@bdice](https://github.com/bdice) +- Add stream parameter to Set Operations (Public List APIs) ([#14305](https://github.com/rapidsai/cudf/pull/14305)) [@SurajAralihalli](https://github.com/SurajAralihalli) +- Expose stream parameter to get_json_object API ([#14297](https://github.com/rapidsai/cudf/pull/14297)) [@davidwendt](https://github.com/davidwendt) +- Sort dictionary data alphabetically in the ORC writer ([#14295](https://github.com/rapidsai/cudf/pull/14295)) [@vuule](https://github.com/vuule) +- Expose stream parameter in public strings filter APIs ([#14293](https://github.com/rapidsai/cudf/pull/14293)) [@davidwendt](https://github.com/davidwendt) +- Refactor cudf_kafka to use skbuild ([#14292](https://github.com/rapidsai/cudf/pull/14292)) [@jdye64](https://github.com/jdye64) +- Update `shared-action-workflows` references ([#14289](https://github.com/rapidsai/cudf/pull/14289)) [@AyodeAwe](https://github.com/AyodeAwe) +- Register ``partd`` encode dispatch in ``dask_cudf`` ([#14287](https://github.com/rapidsai/cudf/pull/14287)) [@rjzamora](https://github.com/rjzamora) +- Update versioning strategy ([#14285](https://github.com/rapidsai/cudf/pull/14285)) [@vyasr](https://github.com/vyasr) +- Move and rename byte-pair-encoding source files ([#14284](https://github.com/rapidsai/cudf/pull/14284)) [@davidwendt](https://github.com/davidwendt) +- Expose stream parameter in public strings combine APIs ([#14281](https://github.com/rapidsai/cudf/pull/14281)) [@davidwendt](https://github.com/davidwendt) +- Expose stream parameter in public strings contains APIs ([#14280](https://github.com/rapidsai/cudf/pull/14280)) [@davidwendt](https://github.com/davidwendt) +- Add stream parameter to List Sort and Filter APIs ([#14272](https://github.com/rapidsai/cudf/pull/14272)) [@SurajAralihalli](https://github.com/SurajAralihalli) +- Use branch-23.12 workflows. ([#14271](https://github.com/rapidsai/cudf/pull/14271)) [@bdice](https://github.com/bdice) +- Refactor LogicalType for Parquet ([#14264](https://github.com/rapidsai/cudf/pull/14264)) [@etseidl](https://github.com/etseidl) +- Centralize chunked reading code in the parquet reader to reader_impl_chunking.cu ([#14262](https://github.com/rapidsai/cudf/pull/14262)) [@nvdbaranec](https://github.com/nvdbaranec) +- Expose stream parameter in public strings replace APIs ([#14261](https://github.com/rapidsai/cudf/pull/14261)) [@davidwendt](https://github.com/davidwendt) +- Expose stream parameter in public strings APIs ([#14260](https://github.com/rapidsai/cudf/pull/14260)) [@davidwendt](https://github.com/davidwendt) +- Cleanup of namespaces in parquet code. 
([#14259](https://github.com/rapidsai/cudf/pull/14259)) [@nvdbaranec](https://github.com/nvdbaranec) +- Make parquet schema index type consistent ([#14256](https://github.com/rapidsai/cudf/pull/14256)) [@hyperbolic2346](https://github.com/hyperbolic2346) +- Expose stream parameter in public strings convert APIs ([#14255](https://github.com/rapidsai/cudf/pull/14255)) [@davidwendt](https://github.com/davidwendt) +- Add in java bindings for DataSource ([#14254](https://github.com/rapidsai/cudf/pull/14254)) [@revans2](https://github.com/revans2) +- Reimplement `cudf::merge` for nested types without using comparators ([#14250](https://github.com/rapidsai/cudf/pull/14250)) [@divyegala](https://github.com/divyegala) +- Add stream parameter to List Manipulation and Operations APIs ([#14248](https://github.com/rapidsai/cudf/pull/14248)) [@SurajAralihalli](https://github.com/SurajAralihalli) +- Expose stream parameter in public strings split/partition APIs ([#14247](https://github.com/rapidsai/cudf/pull/14247)) [@davidwendt](https://github.com/davidwendt) +- Improve `contains_column` by invoking `contains_table` ([#14238](https://github.com/rapidsai/cudf/pull/14238)) [@PointKernel](https://github.com/PointKernel) +- Detect and report errors in Parquet header parsing ([#14237](https://github.com/rapidsai/cudf/pull/14237)) [@etseidl](https://github.com/etseidl) +- Normalizing offsets iterator ([#14234](https://github.com/rapidsai/cudf/pull/14234)) [@davidwendt](https://github.com/davidwendt) +- Forward merge `23.10` into `23.12` ([#14231](https://github.com/rapidsai/cudf/pull/14231)) [@galipremsagar](https://github.com/galipremsagar) +- Return error if BOOL8 column-type is used with integers-to-hex ([#14208](https://github.com/rapidsai/cudf/pull/14208)) [@davidwendt](https://github.com/davidwendt) +- Enable indexalator for device code ([#14206](https://github.com/rapidsai/cudf/pull/14206)) [@davidwendt](https://github.com/davidwendt) +- Marginally reduce memory footprint of joins ([#14197](https://github.com/rapidsai/cudf/pull/14197)) [@wence-](https://github.com/wence-) +- Add nvtx annotations to spilling-based data movement ([#14196](https://github.com/rapidsai/cudf/pull/14196)) [@wence-](https://github.com/wence-) +- Optimize ORC writer for decimal columns ([#14190](https://github.com/rapidsai/cudf/pull/14190)) [@vuule](https://github.com/vuule) +- Remove the use of volatile in ORC ([#14175](https://github.com/rapidsai/cudf/pull/14175)) [@vuule](https://github.com/vuule) +- Add `bytes_per_second` to distinct_count of stream_compaction nvbench. ([#14172](https://github.com/rapidsai/cudf/pull/14172)) [@Blonck](https://github.com/Blonck) +- Add `bytes_per_second` to transpose benchmark ([#14170](https://github.com/rapidsai/cudf/pull/14170)) [@Blonck](https://github.com/Blonck) +- cuDF: Build CUDA 12.0 ARM conda packages. 
([#14112](https://github.com/rapidsai/cudf/pull/14112)) [@bdice](https://github.com/bdice) +- Add `bytes_per_second` to shift benchmark ([#13950](https://github.com/rapidsai/cudf/pull/13950)) [@Blonck](https://github.com/Blonck) +- Extract `debug_utilities.hpp/cu` from `column_utilities.hpp/cu` ([#13720](https://github.com/rapidsai/cudf/pull/13720)) [@ttnghia](https://github.com/ttnghia) + # cuDF 23.10.00 (11 Oct 2023) ## 🚨 Breaking Changes diff --git a/README.md b/README.md index 64c980d0cb3..677cfc89d52 100644 --- a/README.md +++ b/README.md @@ -1,57 +1,62 @@ #
 cuDF - GPU DataFrames
-**NOTE:** For the latest stable [README.md](https://github.com/rapidsai/cudf/blob/main/README.md) ensure you are on the `main` branch. +## 📢 cuDF can now be used as a no-code-change accelerator for pandas! To learn more, see [here](https://rapids.ai/cudf-pandas/)! -## Resources - -- [cuDF Reference Documentation](https://docs.rapids.ai/api/cudf/stable/): Python API reference, tutorials, and topic guides. -- [libcudf Reference Documentation](https://docs.rapids.ai/api/libcudf/stable/): C/C++ CUDA library API reference. -- [Getting Started](https://rapids.ai/start.html): Instructions for installing cuDF. -- [RAPIDS Community](https://rapids.ai/community.html): Get help, contribute, and collaborate. -- [GitHub repository](https://github.com/rapidsai/cudf): Download the cuDF source code. -- [Issue tracker](https://github.com/rapidsai/cudf/issues): Report issues or request features. - -## Overview - -Built based on the [Apache Arrow](http://arrow.apache.org/) columnar memory format, cuDF is a GPU DataFrame library for loading, joining, aggregating, filtering, and otherwise manipulating data. +cuDF is a GPU DataFrame library for loading joining, aggregating, +filtering, and otherwise manipulating data. cuDF leverages +[libcudf](https://docs.rapids.ai/api/libcudf/stable/), a +blazing-fast C++/CUDA dataframe library and the [Apache +Arrow](https://arrow.apache.org/) columnar format to provide a +GPU-accelerated pandas API. -cuDF provides a pandas-like API that will be familiar to data engineers & data scientists, so they can use it to easily accelerate their workflows without going into the details of CUDA programming. +You can import `cudf` directly and use it like `pandas`: -For example, the following snippet downloads a CSV, then uses the GPU to parse it into rows and columns and run calculations: ```python -import cudf, requests +import cudf +import requests from io import StringIO url = "https://github.com/plotly/datasets/raw/master/tips.csv" -content = requests.get(url).content.decode('utf-8') +content = requests.get(url).content.decode("utf-8") tips_df = cudf.read_csv(StringIO(content)) -tips_df['tip_percentage'] = tips_df['tip'] / tips_df['total_bill'] * 100 +tips_df["tip_percentage"] = tips_df["tip"] / tips_df["total_bill"] * 100 # display average tip by dining party size -print(tips_df.groupby('size').tip_percentage.mean()) +print(tips_df.groupby("size").tip_percentage.mean()) ``` -Output: -``` -size -1 21.729201548727808 -2 16.571919173482897 -3 15.215685473711837 -4 14.594900639351332 -5 14.149548965142023 -6 15.622920072028379 -Name: tip_percentage, dtype: float64 -``` +Or, you can use cuDF as a no-code-change accelerator for pandas, using +[`cudf.pandas`](https://docs.rapids.ai/api/cudf/stable/cudf_pandas). +`cudf.pandas` supports 100% of the pandas API, utilizing cuDF for +supported operations and falling back to pandas when needed: -For additional examples, browse our complete [API documentation](https://docs.rapids.ai/api/cudf/stable/), or check out our more detailed [notebooks](https://github.com/rapidsai/notebooks-contrib). +```python +%load_ext cudf.pandas # pandas operations now use the GPU! -## Quick Start +import pandas as pd +import requests +from io import StringIO -Please see the [Demo Docker Repository](https://hub.docker.com/r/rapidsai/rapidsai/), choosing a tag based on the NVIDIA CUDA version you're running. This provides a ready to run Docker container with example notebooks and data, showcasing how you can utilize cuDF. 
+url = "https://github.com/plotly/datasets/raw/master/tips.csv" +content = requests.get(url).content.decode("utf-8") -## Installation +tips_df = pd.read_csv(StringIO(content)) +tips_df["tip_percentage"] = tips_df["tip"] / tips_df["total_bill"] * 100 +# display average tip by dining party size +print(tips_df.groupby("size").tip_percentage.mean()) +``` + +## Resources + +- [Try cudf.pandas now](https://nvda.ws/rapids-cudf): Explore `cudf.pandas` on a free GPU enabled instance on Google Colab! +- [Install](https://rapids.ai/start.html): Instructions for installing cuDF and other [RAPIDS](https://rapids.ai) libraries. +- [cudf (Python) documentation](https://docs.rapids.ai/api/cudf/stable/) +- [libcudf (C++/CUDA) documentation](https://docs.rapids.ai/api/libcudf/stable/) +- [RAPIDS Community](https://rapids.ai/community.html): Get help, contribute, and collaborate. + +## Installation ### CUDA/GPU requirements @@ -65,7 +70,7 @@ cuDF can be installed with conda (via [miniconda](https://conda.io/miniconda.htm ```bash conda install -c rapidsai -c conda-forge -c nvidia \ - cudf=23.10 python=3.10 cuda-version=11.8 + cudf=23.12 python=3.10 cuda-version=11.8 ``` We also provide [nightly Conda packages](https://anaconda.org/rapidsai-nightly) built from the HEAD diff --git a/VERSION b/VERSION new file mode 100644 index 00000000000..a193fff41e8 --- /dev/null +++ b/VERSION @@ -0,0 +1 @@ +23.12.00 diff --git a/build.sh b/build.sh index 2ad69712e5d..e5beb51dedf 100755 --- a/build.sh +++ b/build.sh @@ -369,7 +369,7 @@ fi # build cudf_kafka Python package if hasArg cudf_kafka; then cd ${REPODIR}/python/cudf_kafka - SKBUILD_CONFIGURE_OPTIONS="-DCMAKE_LIBRARY_PATH=${LIBCUDF_BUILD_DIR}" \ + SKBUILD_CONFIGURE_OPTIONS="-DCMAKE_PREFIX_PATH=${INSTALL_PREFIX} -DCMAKE_LIBRARY_PATH=${LIBCUDF_BUILD_DIR} ${EXTRA_CMAKE_ARGS}" \ SKBUILD_BUILD_OPTIONS="-j${PARALLEL_LEVEL:-1}" \ python -m pip install --no-build-isolation --no-deps . 
fi diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh index 8b757fecf5a..f1ad8ee7778 100755 --- a/ci/build_cpp.sh +++ b/ci/build_cpp.sh @@ -9,10 +9,12 @@ export CMAKE_GENERATOR=Ninja rapids-print-env +version=$(rapids-generate-version) + rapids-logger "Begin cpp build" # With boa installed conda build forward to boa -rapids-conda-retry mambabuild \ +RAPIDS_PACKAGE_VERSION=${version} rapids-conda-retry mambabuild \ conda/recipes/libcudf rapids-upload-conda-to-s3 cpp diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 9149b5e6bfe..d5b0c9a5edb 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -25,7 +25,7 @@ rapids-mamba-retry install \ --channel "${PYTHON_CHANNEL}" \ libcudf cudf dask-cudf -export RAPIDS_VERSION_NUMBER="23.10" +export RAPIDS_VERSION_NUMBER="23.12" export RAPIDS_DOCS_DIR="$(mktemp -d)" rapids-logger "Build CPP docs" diff --git a/ci/build_python.sh b/ci/build_python.sh index 61f160b25f5..32fe7b6b3ce 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -9,6 +9,15 @@ export CMAKE_GENERATOR=Ninja rapids-print-env +package_dir="python" +version=$(rapids-generate-version) +commit=$(git rev-parse HEAD) + +echo "${version}" > VERSION +for package_name in cudf dask_cudf cudf_kafka custreamz; do + sed -i "/^__git_commit__/ s/= .*/= \"${commit}\"/g" ${package_dir}/${package_name}/${package_name}/_version.py +done + rapids-logger "Begin py build" CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp) @@ -16,24 +25,24 @@ CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp) # TODO: Remove `--no-test` flag once importing on a CPU # node works correctly # With boa installed conda build forwards to the boa builder -rapids-conda-retry mambabuild \ +RAPIDS_PACKAGE_VERSION=${version} rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ conda/recipes/cudf -rapids-conda-retry mambabuild \ +RAPIDS_PACKAGE_VERSION=${version} rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ conda/recipes/dask-cudf -rapids-conda-retry mambabuild \ +RAPIDS_PACKAGE_VERSION=${version} rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ conda/recipes/cudf_kafka -rapids-conda-retry mambabuild \ +RAPIDS_PACKAGE_VERSION=${version} rapids-conda-retry mambabuild \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh index a1d52c55b17..ae1d9c3fb1a 100755 --- a/ci/build_wheel.sh +++ b/ci/build_wheel.sh @@ -9,9 +9,8 @@ package_dir=$2 source rapids-configure-sccache source rapids-date-string -# Use gha-tools rapids-pip-wheel-version to generate wheel version then -# update the necessary files -version_override="$(rapids-pip-wheel-version ${RAPIDS_DATE_STRING})" +version=$(rapids-generate-version) +commit=$(git rev-parse HEAD) RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" @@ -22,8 +21,9 @@ PACKAGE_CUDA_SUFFIX="-${RAPIDS_PY_CUDA_SUFFIX}" # Patch project metadata files to include the CUDA version suffix and version override. 
pyproject_file="${package_dir}/pyproject.toml" -sed -i "s/^version = .*/version = \"${version_override}\"/g" ${pyproject_file} -sed -i "s/^name = .*/name = \"${package_name}${PACKAGE_CUDA_SUFFIX}\"/g" ${pyproject_file} +sed -i "s/^name = \"${package_name}\"/name = \"${package_name}${PACKAGE_CUDA_SUFFIX}\"/g" ${pyproject_file} +echo "${version}" > VERSION +sed -i "/^__git_commit__/ s/= .*/= \"${commit}\"/g" "${package_dir}/${package_name}/_version.py" # For nightlies we want to ensure that we're pulling in alphas as well. The # easiest way to do so is to augment the spec with a constraint containing a @@ -36,6 +36,8 @@ fi if [[ ${package_name} == "dask_cudf" ]]; then sed -r -i "s/cudf==(.*)\"/cudf${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file} + sed -r -i "s/dask-cuda==(.*)\"/dask-cuda==\1${alpha_spec}\"/g" ${pyproject_file} + sed -r -i "s/rapids-dask-dependency==(.*)\"/rapids-dask-dependency==\1${alpha_spec}\"/g" ${pyproject_file} else sed -r -i "s/rmm(.*)\"/rmm${PACKAGE_CUDA_SUFFIX}\1${alpha_spec}\"/g" ${pyproject_file} # ptxcompiler and cubinlinker aren't version constrained diff --git a/ci/build_wheel_cudf.sh b/ci/build_wheel_cudf.sh index 1b2285b5f22..456a3a289d1 100755 --- a/ci/build_wheel_cudf.sh +++ b/ci/build_wheel_cudf.sh @@ -7,20 +7,10 @@ package_dir="python/cudf" export SKBUILD_CONFIGURE_OPTIONS="-DCUDF_BUILD_WHEELS=ON -DDETECT_CONDA_ENV=OFF" -# Force a build using the latest version of the code before this PR -CUDF_BUILD_BRANCH=${1:-""} -WHEEL_NAME="cudf" -if [[ "${CUDF_BUILD_BRANCH}" == "main" ]]; then - MAIN_COMMIT=$(git merge-base HEAD origin/branch-23.10-xdf) - git checkout $MAIN_COMMIT - WHEEL_NAME="${WHEEL_NAME}_${CUDF_BUILD_BRANCH}" -fi +./ci/build_wheel.sh cudf ${package_dir} -./ci/build_wheel.sh ${WHEEL_NAME} ${package_dir} - -mkdir -p ${package_dir}/final_dist python -m auditwheel repair -w ${package_dir}/final_dist ${package_dir}/dist/* RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" -RAPIDS_PY_WHEEL_NAME="${WHEEL_NAME}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 ${package_dir}/final_dist +RAPIDS_PY_WHEEL_NAME="cudf_${AUDITWHEEL_POLICY}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 ${package_dir}/final_dist diff --git a/ci/check_style.sh b/ci/check_style.sh index e96ad8bf1db..a01cf4dcc6b 100755 --- a/ci/check_style.sh +++ b/ci/check_style.sh @@ -14,7 +14,7 @@ rapids-dependency-file-generator \ rapids-mamba-retry env create --force -f env.yaml -n checks conda activate checks -FORMAT_FILE_URL=https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.10/cmake-format-rapids-cmake.json +FORMAT_FILE_URL=https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.12/cmake-format-rapids-cmake.json export RAPIDS_CMAKE_FORMAT_FILE=/tmp/rapids_cmake_ci/cmake-formats-rapids-cmake.json mkdir -p $(dirname ${RAPIDS_CMAKE_FORMAT_FILE}) wget -O ${RAPIDS_CMAKE_FORMAT_FILE} ${FORMAT_FILE_URL} diff --git a/ci/cudf_pandas_scripts/pandas-tests/run.sh b/ci/cudf_pandas_scripts/pandas-tests/run.sh index 920625b452f..d36b609799b 100755 --- a/ci/cudf_pandas_scripts/pandas-tests/run.sh +++ b/ci/cudf_pandas_scripts/pandas-tests/run.sh @@ -8,16 +8,21 @@ PANDAS_TESTS_BRANCH=${1} rapids-logger "Running Pandas tests using $PANDAS_TESTS_BRANCH branch" rapids-logger "PR number: $RAPIDS_REF_NAME" - -COMMIT=$(git rev-parse HEAD) -WHEEL_NAME="cudf" -if [[ "${PANDAS_TESTS_BRANCH}" == "main" ]]; then - COMMIT=$(git merge-base HEAD origin/branch-23.10-xdf) - WHEEL_NAME="${WHEEL_NAME}_${PANDAS_TESTS_BRANCH}" +# Set the manylinux version 
used for downloading the wheels so that we test the +# newer ABI wheels on the newer images that support their installation. +# Need to disable pipefail for the head not to fail, see +# https://stackoverflow.com/questions/19120263/why-exit-code-141-with-grep-q +set +o pipefail +glibc_minor_version=$(ldd --version | head -1 | grep -o "[0-9]\.[0-9]\+" | tail -1 | cut -d '.' -f2) +set -o pipefail +manylinux_version="2_17" +if [[ ${glibc_minor_version} -ge 28 ]]; then + manylinux_version="2_28" fi +manylinux="manylinux_${manylinux_version}" RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" -RAPIDS_PY_WHEEL_NAME="${WHEEL_NAME}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep +RAPIDS_PY_WHEEL_NAME="cudf_${manylinux}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep python -m pip install $(ls ./local-cudf-dep/cudf*.whl)[test,pandas_tests] git checkout $COMMIT diff --git a/ci/cudf_pandas_scripts/run_tests.sh b/ci/cudf_pandas_scripts/run_tests.sh index cc578b50fd0..7eab3221e5e 100755 --- a/ci/cudf_pandas_scripts/run_tests.sh +++ b/ci/cudf_pandas_scripts/run_tests.sh @@ -31,8 +31,21 @@ done if [ "$no_cudf" = true ]; then echo "Skipping cudf install" else + # Set the manylinux version used for downloading the wheels so that we test the + # newer ABI wheels on the newer images that support their installation. + # Need to disable pipefail for the head not to fail, see + # https://stackoverflow.com/questions/19120263/why-exit-code-141-with-grep-q + set +o pipefail + glibc_minor_version=$(ldd --version | head -1 | grep -o "[0-9]\.[0-9]\+" | tail -1 | cut -d '.' -f2) + set -o pipefail + manylinux_version="2_17" + if [[ ${glibc_minor_version} -ge 28 ]]; then + manylinux_version="2_28" + fi + manylinux="manylinux_${manylinux_version}" + RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" - RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep + RAPIDS_PY_WHEEL_NAME="cudf_${manylinux}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep python -m pip install $(ls ./local-cudf-dep/cudf*.whl)[test,cudf_pandas_tests] fi diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index eac64fe1a0f..16742465c32 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -43,6 +43,7 @@ sed_runner 's/'"VERSION ${CURRENT_SHORT_TAG}.*"'/'"VERSION ${NEXT_FULL_TAG}"'/g' # Python CMakeLists updates sed_runner 's/'"cudf_version .*)"'/'"cudf_version ${NEXT_FULL_TAG})"'/g' python/cudf/CMakeLists.txt +sed_runner 's/'"cudf_kafka_version .*)"'/'"cudf_kafka_version ${NEXT_FULL_TAG})"'/g' python/cudf_kafka/CMakeLists.txt # cpp libcudf_kafka update sed_runner 's/'"VERSION ${CURRENT_SHORT_TAG}.*"'/'"VERSION ${NEXT_FULL_TAG}"'/g' cpp/libcudf_kafka/CMakeLists.txt @@ -50,17 +51,8 @@ sed_runner 's/'"VERSION ${CURRENT_SHORT_TAG}.*"'/'"VERSION ${NEXT_FULL_TAG}"'/g' # cpp cudf_jni update sed_runner 's/'"VERSION ${CURRENT_SHORT_TAG}.*"'/'"VERSION ${NEXT_FULL_TAG}"'/g' java/src/main/native/CMakeLists.txt -# Python __init__.py updates -sed_runner "s/__version__ = .*/__version__ = \"${NEXT_FULL_TAG}\"/g" python/cudf/cudf/__init__.py -sed_runner "s/__version__ = .*/__version__ = \"${NEXT_FULL_TAG}\"/g" python/dask_cudf/dask_cudf/__init__.py -sed_runner "s/__version__ = .*/__version__ = \"${NEXT_FULL_TAG}\"/g" python/cudf_kafka/cudf_kafka/__init__.py -sed_runner "s/__version__ = .*/__version__ = \"${NEXT_FULL_TAG}\"/g" 
python/custreamz/custreamz/__init__.py - -# Python pyproject.toml updates -sed_runner "s/^version = .*/version = \"${NEXT_FULL_TAG}\"/g" python/cudf/pyproject.toml -sed_runner "s/^version = .*/version = \"${NEXT_FULL_TAG}\"/g" python/dask_cudf/pyproject.toml -sed_runner "s/^version = .*/version = \"${NEXT_FULL_TAG}\"/g" python/cudf_kafka/pyproject.toml -sed_runner "s/^version = .*/version = \"${NEXT_FULL_TAG}\"/g" python/custreamz/pyproject.toml +# Centralized version file update +echo "${NEXT_FULL_TAG}" > VERSION # Wheel testing script sed_runner "s/branch-.*/branch-${NEXT_SHORT_TAG}/g" ci/test_wheel_dask_cudf.sh @@ -89,6 +81,7 @@ DEPENDENCIES=( kvikio libkvikio librmm + rapids-dask-dependency rmm ) for DEP in "${DEPENDENCIES[@]}"; do @@ -108,8 +101,7 @@ sed_runner "s/version == ${CURRENT_SHORT_TAG}/version == ${NEXT_SHORT_TAG}/g" RE sed_runner "s/cudf=${CURRENT_SHORT_TAG}/cudf=${NEXT_SHORT_TAG}/g" README.md # Libcudf examples update -sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_TAG}/" cpp/examples/basic/CMakeLists.txt -sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_TAG}/" cpp/examples/strings/CMakeLists.txt +sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_TAG}/" cpp/examples/fetch_dependencies.cmake # CI files for FILE in .github/workflows/*.yaml; do diff --git a/ci/test_wheel_cudf.sh b/ci/test_wheel_cudf.sh index 83e24ab3ff1..8c42651e299 100755 --- a/ci/test_wheel_cudf.sh +++ b/ci/test_wheel_cudf.sh @@ -3,8 +3,21 @@ set -eou pipefail +# Set the manylinux version used for downloading the wheels so that we test the +# newer ABI wheels on the newer images that support their installation. +# Need to disable pipefail for the head not to fail, see +# https://stackoverflow.com/questions/19120263/why-exit-code-141-with-grep-q +set +o pipefail +glibc_minor_version=$(ldd --version | head -1 | grep -o "[0-9]\.[0-9]\+" | tail -1 | cut -d '.' -f2) +set -o pipefail +manylinux_version="2_17" +if [[ ${glibc_minor_version} -ge 28 ]]; then + manylinux_version="2_28" +fi +manylinux="manylinux_${manylinux_version}" + RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" -RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist +RAPIDS_PY_WHEEL_NAME="cudf_${manylinux}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist # echo to expand wildcard before adding `[extra]` requires for pip python -m pip install $(echo ./dist/cudf*.whl)[test] diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh index a0a6fbede13..e9162b816aa 100755 --- a/ci/test_wheel_dask_cudf.sh +++ b/ci/test_wheel_dask_cudf.sh @@ -7,13 +7,24 @@ RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist # Download the cudf built in the previous step -RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep -python -m pip install --no-deps ./local-cudf-dep/cudf*.whl +# Set the manylinux version used for downloading the wheels so that we test the +# newer ABI wheels on the newer images that support their installation. +# Need to disable pipefail for the head not to fail, see +# https://stackoverflow.com/questions/19120263/why-exit-code-141-with-grep-q +set +o pipefail +glibc_minor_version=$(ldd --version | head -1 | grep -o "[0-9]\.[0-9]\+" | tail -1 | cut -d '.' 
-f2) +set -o pipefail +manylinux_version="2_17" +if [[ ${glibc_minor_version} -ge 28 ]]; then + manylinux_version="2_28" +fi +manylinux="manylinux_${manylinux_version}" -# Always install latest dask for testing -python -m pip install git+https://github.com/dask/dask.git@2023.9.2 git+https://github.com/dask/distributed.git@2023.9.2 git+https://github.com/rapidsai/dask-cuda.git@branch-23.10 +RAPIDS_PY_WHEEL_NAME="cudf_${manylinux}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep +python -m pip install --no-deps ./local-cudf-dep/cudf*.whl # echo to expand wildcard before adding `[extra]` requires for pip python -m pip install $(echo ./dist/dask_cudf*.whl)[test] +# Run tests in dask_cudf/tests and dask_cudf/io/tests python -m pytest -n 8 ./python/dask_cudf/dask_cudf/ diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 27a3a84e3f1..9b85888a7b3 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -24,11 +24,8 @@ dependencies: - cudatoolkit - cupy>=12.0.0 - cxx-compiler -- cython>=3.0.0 -- dask-core==2023.9.2 -- dask-cuda==23.10.* -- dask==2023.9.2 -- distributed==2023.9.2 +- cython>=3.0.3 +- dask-cuda==23.12.* - dlpack>=0.5,<0.6.0a0 - doxygen=1.9.1 - fastavro>=0.22.9 @@ -40,14 +37,14 @@ dependencies: - hypothesis - identify>=2.5.20 - ipython -- libarrow==12.0.1.* +- libarrow-all==14.0.1.* - libcufile-dev=1.4.0.31 - libcufile=1.4.0.31 - libcurand-dev=10.3.0.86 - libcurand=10.3.0.86 -- libkvikio==23.10.* +- libkvikio==23.12.* - librdkafka>=1.9.0,<1.10.0a0 -- librmm==23.10.* +- librmm==23.12.* - make - mimesis>=4.1.0 - moto>=4.0.8 @@ -60,7 +57,7 @@ dependencies: - numpy>=1.21,<1.25 - numpydoc - nvcc_linux-64=11.8 -- nvcomp==2.6.1 +- nvcomp==3.0.4 - nvtx>=0.2.1 - packaging - pandas>=1.3,<1.6.0dev0 @@ -69,9 +66,8 @@ dependencies: - pre-commit - protobuf>=4.21,<5 - ptxcompiler -- pyarrow==12.0.1.* -- pydata-sphinx-theme -- pyorc +- pyarrow==14.0.1.* +- pydata-sphinx-theme!=0.14.2 - pytest - pytest-benchmark - pytest-cases @@ -81,8 +77,9 @@ dependencies: - python-snappy>=0.6.0 - python>=3.9,<3.11 - pytorch<1.12.0 +- rapids-dask-dependency==23.12.* - rich -- rmm==23.10.* +- rmm==23.12.* - s3fs>=2022.3.0 - scikit-build>=0.13.1 - scipy diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index eb229f15af2..a3eeb3dd99f 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -25,11 +25,8 @@ dependencies: - cuda-version=12.0 - cupy>=12.0.0 - cxx-compiler -- cython>=3.0.0 -- dask-core==2023.9.2 -- dask-cuda==23.10.* -- dask==2023.9.2 -- distributed==2023.9.2 +- cython>=3.0.3 +- dask-cuda==23.12.* - dlpack>=0.5,<0.6.0a0 - doxygen=1.9.1 - fastavro>=0.22.9 @@ -41,12 +38,12 @@ dependencies: - hypothesis - identify>=2.5.20 - ipython -- libarrow==12.0.1.* +- libarrow-all==14.0.1.* - libcufile-dev - libcurand-dev -- libkvikio==23.10.* +- libkvikio==23.12.* - librdkafka>=1.9.0,<1.10.0a0 -- librmm==23.10.* +- librmm==23.12.* - make - mimesis>=4.1.0 - moto>=4.0.8 @@ -58,7 +55,7 @@ dependencies: - numba>=0.57,<0.58 - numpy>=1.21,<1.25 - numpydoc -- nvcomp==2.6.1 +- nvcomp==3.0.4 - nvtx>=0.2.1 - packaging - pandas>=1.3,<1.6.0dev0 @@ -66,9 +63,8 @@ dependencies: - pip - pre-commit - protobuf>=4.21,<5 -- pyarrow==12.0.1.* -- pydata-sphinx-theme -- pyorc +- pyarrow==14.0.1.* +- pydata-sphinx-theme!=0.14.2 - pytest - 
pytest-benchmark - pytest-cases @@ -78,8 +74,9 @@ dependencies: - python-snappy>=0.6.0 - python>=3.9,<3.11 - pytorch<1.12.0 +- rapids-dask-dependency==23.12.* - rich -- rmm==23.10.* +- rmm==23.12.* - s3fs>=2022.3.0 - scikit-build>=0.13.1 - scipy diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index d3e15f70ccb..27edde1c98a 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -1,6 +1,6 @@ # Copyright (c) 2018-2023, NVIDIA CORPORATION. -{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') %} +{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} {% set py_version = environ['CONDA_PY'] %} {% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} @@ -12,7 +12,7 @@ package: version: {{ version }} source: - git_url: ../../.. + path: ../../.. build: number: {{ GIT_DESCRIBE_NUMBER }} @@ -55,13 +55,13 @@ requirements: - cuda-version ={{ cuda_version }} - sysroot_{{ target_platform }} {{ sysroot_version }} host: - - protobuf ==4.21.* + - protobuf ==4.24.* - python - - cython >=3.0.0 + - cython >=3.0.3 - scikit-build >=0.13.1 - setuptools - dlpack >=0.5,<0.6.0a0 - - pyarrow =12 + - pyarrow ==14.0.1.* - libcudf ={{ version }} - rmm ={{ minor_version }} {% if cuda_major == "11" %} @@ -82,7 +82,7 @@ requirements: - numba >=0.57,<0.58 # TODO: Pin to numpy<1.25 until cudf requires pandas 2 - numpy >=1.21,<1.25 - - {{ pin_compatible('pyarrow', max_pin='x.x.x') }} + - {{ pin_compatible('pyarrow', max_pin='x') }} - libcudf ={{ version }} - {{ pin_compatible('rmm', max_pin='x.x') }} - fsspec >=0.6.0 diff --git a/conda/recipes/cudf_kafka/build.sh b/conda/recipes/cudf_kafka/build.sh index f4bb6e1bc91..9458349d101 100644 --- a/conda/recipes/cudf_kafka/build.sh +++ b/conda/recipes/cudf_kafka/build.sh @@ -1,16 +1,3 @@ # Copyright (c) 2020-2023, NVIDIA CORPORATION. -# This assumes the script is executed from the root of the repo directory -# Need to set CUDA_HOME inside conda environments because the hacked together -# setup.py for cudf-kafka searches that way. -# TODO: Remove after https://github.com/rapidsai/cudf/pull/14292 updates -# cudf_kafka to use scikit-build -CUDA_MAJOR=${RAPIDS_CUDA_VERSION%%.*} -if [[ ${CUDA_MAJOR} == "12" ]]; then - target_name="x86_64-linux" - if [[ ! $(arch) == "x86_64" ]]; then - target_name="sbsa-linux" - fi - export CUDA_HOME="${PREFIX}/targets/${target_name}/" -fi ./build.sh -v cudf_kafka diff --git a/conda/recipes/cudf_kafka/conda_build_config.yaml b/conda/recipes/cudf_kafka/conda_build_config.yaml index b63a136ad2d..c98c2701653 100644 --- a/conda/recipes/cudf_kafka/conda_build_config.yaml +++ b/conda/recipes/cudf_kafka/conda_build_config.yaml @@ -9,3 +9,9 @@ sysroot_version: cmake_version: - ">=3.26.4" + +cuda_compiler: + - cuda-nvcc + +cuda11_compiler: + - nvcc diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml index a79c23b7d98..343ec2519f1 100644 --- a/conda/recipes/cudf_kafka/meta.yaml +++ b/conda/recipes/cudf_kafka/meta.yaml @@ -1,6 +1,6 @@ # Copyright (c) 2020-2023, NVIDIA CORPORATION. -{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') %} +{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %} {% set minor_version = version.split('.')[0] + '.' 
+ version.split('.')[1] %} {% set py_version = environ['CONDA_PY'] %} {% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} @@ -12,7 +12,7 @@ package: version: {{ version }} source: - git_url: ../../.. + path: ../../.. build: number: {{ GIT_DESCRIBE_NUMBER }} @@ -33,28 +33,31 @@ build: - SCCACHE_S3_KEY_PREFIX=cudf-kafka-linux64 # [linux64] - SCCACHE_S3_USE_SSL - SCCACHE_S3_NO_CREDENTIALS - # TODO: Remove after https://github.com/rapidsai/cudf/pull/14292 updates - # cudf_kafka to use scikit-build - - RAPIDS_CUDA_VERSION + ignore_run_exports_from: + {% if cuda_major == "11" %} + - {{ compiler('cuda11') }} + {% endif %} requirements: build: - cmake {{ cmake_version }} + - ninja - {{ compiler('c') }} - {{ compiler('cxx') }} - - ninja - - sysroot_{{ target_platform }} {{ sysroot_version }} - # TODO: Remove after https://github.com/rapidsai/cudf/pull/14292 updates - # cudf_kafka to use scikit-build - {% if cuda_major == "12" %} - - cuda-gdb + {% if cuda_major == "11" %} + - {{ compiler('cuda11') }} ={{ cuda_version }} + {% else %} + - {{ compiler('cuda') }} {% endif %} + - cuda-version ={{ cuda_version }} + - sysroot_{{ target_platform }} {{ sysroot_version }} host: - python - - cython >=3.0.0 + - cython >=3.0.3 - cuda-version ={{ cuda_version }} - cudf ={{ version }} - libcudf_kafka ={{ version }} + - scikit-build >=0.13.1 - setuptools {% if cuda_major == "12" %} - cuda-cudart-dev diff --git a/conda/recipes/custreamz/meta.yaml b/conda/recipes/custreamz/meta.yaml index 233d51baf31..755394e3936 100644 --- a/conda/recipes/custreamz/meta.yaml +++ b/conda/recipes/custreamz/meta.yaml @@ -1,6 +1,6 @@ # Copyright (c) 2018-2023, NVIDIA CORPORATION. -{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') %} +{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} {% set py_version = environ['CONDA_PY'] %} {% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} @@ -12,7 +12,7 @@ package: version: {{ version }} source: - git_url: ../../.. + path: ../../.. build: number: {{ GIT_DESCRIBE_NUMBER }} @@ -45,9 +45,7 @@ requirements: - streamz - cudf ={{ version }} - cudf_kafka ={{ version }} - - dask ==2023.9.2 - - dask-core ==2023.9.2 - - distributed ==2023.9.2 + - rapids-dask-dependency ={{ minor_version }} - python-confluent-kafka >=1.9.0,<1.10.0a0 - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} diff --git a/conda/recipes/dask-cudf/meta.yaml b/conda/recipes/dask-cudf/meta.yaml index 4c8af071074..16638926492 100644 --- a/conda/recipes/dask-cudf/meta.yaml +++ b/conda/recipes/dask-cudf/meta.yaml @@ -1,6 +1,6 @@ # Copyright (c) 2018-2023, NVIDIA CORPORATION. -{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') %} +{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} {% set py_version = environ['CONDA_PY'] %} {% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} @@ -12,7 +12,7 @@ package: version: {{ version }} source: - git_url: ../../.. + path: ../../.. 
build: number: {{ GIT_DESCRIBE_NUMBER }} @@ -37,17 +37,11 @@ build: requirements: host: - python - - cudf ={{ version }} - - dask ==2023.9.2 - - dask-core ==2023.9.2 - - distributed ==2023.9.2 - cuda-version ={{ cuda_version }} run: - python - cudf ={{ version }} - - dask ==2023.9.2 - - dask-core ==2023.9.2 - - distributed ==2023.9.2 + - rapids-dask-dependency ={{ minor_version }} - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} test: diff --git a/conda/recipes/dask-cudf/run_test.sh b/conda/recipes/dask-cudf/run_test.sh deleted file mode 100644 index c79c014a89a..00000000000 --- a/conda/recipes/dask-cudf/run_test.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/bash -# Copyright (c) 2020-2023, NVIDIA CORPORATION. - -set -e - -# Logger function for build status output -function logger() { - echo -e "\n>>>> $@\n" -} - -# Importing cudf on arm64 CPU only nodes is currently not working due to a -# difference in reported gpu devices between arm64 and amd64 -ARCH=$(arch) - -if [ "${ARCH}" = "aarch64" ]; then - logger "Skipping tests on arm64" - exit 0 -fi - -# Dask & Distributed option to install main(nightly) or `conda-forge` packages. -export INSTALL_DASK_MAIN=0 - -# Dask version to install when `INSTALL_DASK_MAIN=0` -export DASK_STABLE_VERSION="2023.9.2" - -# Install the conda-forge or nightly version of dask and distributed -if [[ "${INSTALL_DASK_MAIN}" == 1 ]]; then - rapids-logger "rapids-mamba-retry install -c dask/label/dev 'dask/label/dev::dask' 'dask/label/dev::distributed'" - rapids-mamba-retry install -c dask/label/dev "dask/label/dev::dask" "dask/label/dev::distributed" -else - rapids-logger "rapids-mamba-retry install conda-forge::dask=={$DASK_STABLE_VERSION} conda-forge::distributed=={$DASK_STABLE_VERSION} conda-forge::dask-core=={$DASK_STABLE_VERSION} --force-reinstall" - rapids-mamba-retry install conda-forge::dask=={$DASK_STABLE_VERSION} conda-forge::distributed=={$DASK_STABLE_VERSION} conda-forge::dask-core=={$DASK_STABLE_VERSION} --force-reinstall -fi - -logger "python -c 'import dask_cudf'" -python -c "import dask_cudf" diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml index 25b3f19de77..fa06ed048b7 100644 --- a/conda/recipes/libcudf/conda_build_config.yaml +++ b/conda/recipes/libcudf/conda_build_config.yaml @@ -23,7 +23,7 @@ gtest_version: - ">=1.13.0" libarrow_version: - - "=12" + - "==14.0.1" dlpack_version: - ">=0.5,<0.6.0a0" @@ -38,7 +38,7 @@ spdlog_version: - ">=1.11.0,<1.12" nvcomp_version: - - "=2.6.1" + - "=3.0.4" zlib_version: - ">=1.2.13" diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 627065817ba..0459908fd00 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -1,6 +1,6 @@ # Copyright (c) 2018-2023, NVIDIA CORPORATION. -{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') %} +{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} {% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} {% set cuda_major = cuda_version.split('.')[0] %} @@ -11,7 +11,7 @@ package: name: libcudf-split source: - git_url: ../../.. + path: ../../.. 
build: script_env: @@ -91,6 +91,8 @@ outputs: requirements: build: - cmake {{ cmake_version }} + host: + - libarrow {{ libarrow_version }} run: {% if cuda_major == "11" %} - cudatoolkit @@ -103,7 +105,6 @@ outputs: - nvcomp {{ nvcomp_version }} - librmm ={{ minor_version }} - libkvikio ={{ minor_version }} - - libarrow {{ libarrow_version }} - dlpack {{ dlpack_version }} - gtest {{ gtest_version }} - gmock {{ gtest_version }} diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index ec58c391001..bd9c936626a 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -25,7 +25,7 @@ rapids_cuda_init_architectures(CUDF) project( CUDF - VERSION 23.10.00 + VERSION 23.12.00 LANGUAGES C CXX CUDA ) if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA" AND CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.5) @@ -401,6 +401,7 @@ add_library( src/io/parquet/predicate_pushdown.cpp src/io/parquet/reader.cpp src/io/parquet/reader_impl.cpp + src/io/parquet/reader_impl_chunking.cu src/io/parquet/reader_impl_helpers.cpp src/io/parquet/reader_impl_preprocess.cu src/io/parquet/writer_impl.cu @@ -439,6 +440,7 @@ add_library( src/join/mixed_join_size_kernel_nulls.cu src/join/mixed_join_size_kernels_semi.cu src/join/semi_join.cu + src/json/json_path.cu src/lists/contains.cu src/lists/combine/concatenate_list_elements.cu src/lists/combine/concatenate_rows.cu @@ -570,7 +572,6 @@ add_library( src/strings/filter_chars.cu src/strings/like.cu src/strings/padding.cu - src/strings/json/json_path.cu src/strings/regex/regcomp.cpp src/strings/regex/regexec.cpp src/strings/regex/regex_program.cpp @@ -581,6 +582,7 @@ add_library( src/strings/replace/replace.cu src/strings/replace/replace_re.cu src/strings/reverse.cu + src/strings/scan/scan_inclusive.cu src/strings/search/findall.cu src/strings/search/find.cu src/strings/search/find_multiple.cu @@ -597,6 +599,7 @@ add_library( src/strings/utilities.cu src/strings/wrap.cu src/structs/copying/concatenate.cu + src/structs/scan/scan_inclusive.cu src/structs/structs_column_factories.cu src/structs/structs_column_view.cpp src/structs/utilities.cpp @@ -613,10 +616,10 @@ add_library( src/text/normalize.cu src/text/replace.cu src/text/stemmer.cu - src/text/subword/bpe_tokenizer.cu + src/text/bpe/byte_pair_encoding.cu + src/text/bpe/load_merge_pairs.cu src/text/subword/data_normalizer.cu src/text/subword/load_hash_file.cu - src/text/subword/load_merges_file.cu src/text/subword/subword_tokenize.cu src/text/subword/wordpiece_tokenizer.cu src/text/tokenize.cu @@ -834,6 +837,7 @@ if(CUDF_BUILD_TESTUTIL) tests/io/metadata_utilities.cpp tests/utilities/base_fixture.cpp tests/utilities/column_utilities.cu + tests/utilities/debug_utilities.cu tests/utilities/table_utilities.cu tests/utilities/tdigest_utilities.cu ) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index cd6b3cfdc03..9c3a05a2f5f 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -230,6 +230,7 @@ ConfigureNVBench(HASHING_NVBENCH hashing/hash.cpp) # ################################################################################################## # * merge benchmark ------------------------------------------------------------------------------- ConfigureBench(MERGE_BENCH merge/merge.cpp) +ConfigureNVBench(MERGE_NVBENCH merge/merge_structs.cpp merge/merge_lists.cpp) # ################################################################################################## # * null_mask benchmark --------------------------------------------------------------------------- @@ -277,7 +278,7 @@ 
ConfigureBench(TEXT_BENCH text/ngrams.cpp text/subword.cpp) ConfigureNVBench( TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp - text/normalize.cpp text/replace.cpp text/tokenize.cpp + text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp ) # ################################################################################################## @@ -319,7 +320,7 @@ ConfigureNVBench( # ################################################################################################## # * json benchmark ------------------------------------------------------------------- -ConfigureBench(JSON_BENCH string/json.cu) +ConfigureBench(JSON_BENCH json/json.cu) ConfigureNVBench(FST_NVBENCH io/fst.cu) ConfigureNVBench(JSON_READER_NVBENCH io/json/nested_json.cpp io/json/json_reader_input.cpp) ConfigureNVBench(JSON_WRITER_NVBENCH io/json/json_writer.cpp) diff --git a/cpp/benchmarks/sort/nested_types_common.hpp b/cpp/benchmarks/common/generate_nested_types.hpp similarity index 98% rename from cpp/benchmarks/sort/nested_types_common.hpp rename to cpp/benchmarks/common/generate_nested_types.hpp index 93853ba5768..ee9e3ca9de3 100644 --- a/cpp/benchmarks/sort/nested_types_common.hpp +++ b/cpp/benchmarks/common/generate_nested_types.hpp @@ -16,7 +16,7 @@ #pragma once -#include +#include "generate_input.hpp" #include diff --git a/cpp/benchmarks/copying/shift.cu b/cpp/benchmarks/copying/shift.cu index 460100a8fe9..e1169e3bcd6 100644 --- a/cpp/benchmarks/copying/shift.cu +++ b/cpp/benchmarks/copying/shift.cu @@ -56,18 +56,32 @@ static void BM_shift(benchmark::State& state) cudf::size_type size = state.range(0); cudf::size_type offset = size * (static_cast(shift_factor) / 100.0); - auto const input_table = - create_sequence_table({cudf::type_to_id()}, - row_count{size}, - use_validity ? std::optional{1.0} : std::nullopt); + auto constexpr column_type_id = cudf::type_id::INT32; + using column_type = cudf::id_to_type; + + auto const input_table = create_sequence_table( + {column_type_id}, row_count{size}, use_validity ? std::optional{1.0} : std::nullopt); cudf::column_view input{input_table->get_column(0)}; - auto fill = use_validity ? make_scalar() : make_scalar(777); + auto fill = use_validity ? make_scalar() : make_scalar(777); for (auto _ : state) { cuda_event_timer raii(state, true); auto output = cudf::shift(input, offset, *fill); } + + auto const elems_read = (size - offset); + auto const bytes_read = elems_read * sizeof(column_type); + + // If 'use_validity' is false, the fill value is a number, and the entire column + // (excluding the null bitmask) needs to be written. On the other hand, if 'use_validity' + // is true, only the elements that can be shifted are written, along with the full null bitmask. + auto const elems_written = use_validity ? (size - offset) : size; + auto const bytes_written = elems_written * sizeof(column_type); + auto const null_bytes = use_validity ? 
2 * cudf::bitmask_allocation_size_bytes(size) : 0; + + state.SetBytesProcessed(static_cast(state.iterations()) * + (bytes_written + bytes_read + null_bytes)); } class Shift : public cudf::benchmark {}; diff --git a/cpp/benchmarks/string/json.cu b/cpp/benchmarks/json/json.cu similarity index 98% rename from cpp/benchmarks/string/json.cu rename to cpp/benchmarks/json/json.cu index 7e89edf3e17..5dc30aebe38 100644 --- a/cpp/benchmarks/string/json.cu +++ b/cpp/benchmarks/json/json.cu @@ -21,9 +21,9 @@ #include #include +#include #include #include -#include #include #include #include @@ -196,7 +196,7 @@ void BM_case(benchmark::State& state, std::string query_arg) for (auto _ : state) { cuda_event_timer raii(state, true); - auto result = cudf::strings::get_json_object(scv, json_path); + auto result = cudf::get_json_object(scv, json_path); CUDF_CUDA_TRY(cudaStreamSynchronize(0)); } diff --git a/cpp/benchmarks/lists/set_operations.cpp b/cpp/benchmarks/lists/set_operations.cpp index 5b240923358..6bed33d2570 100644 --- a/cpp/benchmarks/lists/set_operations.cpp +++ b/cpp/benchmarks/lists/set_operations.cpp @@ -54,6 +54,7 @@ void nvbench_set_op(nvbench::state& state, BenchFuncPtr bfunc) cudf::lists_column_view{*rhs}, cudf::null_equality::EQUAL, cudf::nan_equality::ALL_EQUAL, + cudf::get_default_stream(), rmm::mr::get_current_device_resource()); }); } diff --git a/cpp/benchmarks/merge/merge_lists.cpp b/cpp/benchmarks/merge/merge_lists.cpp new file mode 100644 index 00000000000..bcb9f10ac83 --- /dev/null +++ b/cpp/benchmarks/merge/merge_lists.cpp @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include + +#include + +void nvbench_merge_list(nvbench::state& state) +{ + rmm::cuda_stream_view stream; + + auto const input1 = create_lists_data(state); + auto const sorted_input1 = + cudf::detail::sort(*input1, {}, {}, stream, rmm::mr::get_current_device_resource()); + + auto const input2 = create_lists_data(state); + auto const sorted_input2 = + cudf::detail::sort(*input2, {}, {}, stream, rmm::mr::get_current_device_resource()); + + stream.synchronize(); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + rmm::cuda_stream_view stream_view{launch.get_stream()}; + + cudf::detail::merge({*sorted_input1, *sorted_input2}, + {0}, + {cudf::order::ASCENDING}, + {}, + stream_view, + rmm::mr::get_current_device_resource()); + }); +} + +NVBENCH_BENCH(nvbench_merge_list) + .set_name("merge_lists") + .add_int64_power_of_two_axis("size_bytes", {10, 18, 24, 28}) + .add_int64_axis("depth", {1, 4}) + .add_float64_axis("null_frequency", {0, 0.2}); diff --git a/cpp/benchmarks/merge/merge_structs.cpp b/cpp/benchmarks/merge/merge_structs.cpp new file mode 100644 index 00000000000..9c56b44b623 --- /dev/null +++ b/cpp/benchmarks/merge/merge_structs.cpp @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
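The json benchmark hunk above tracks get_json_object moving out of the cudf::strings namespace (src/strings/json/json_path.cu now lives at src/json/json_path.cu). A minimal sketch of the relocated call; the cudf/json/json.hpp header path and the JSONPath string are assumptions for illustration, not taken from this diff:

```cpp
#include <cudf/column/column.hpp>
#include <cudf/json/json.hpp>  // assumed new home of the get_json_object declaration
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <memory>

// Extract a field from a strings column of JSON documents.
std::unique_ptr<cudf::column> extract_key(cudf::strings_column_view const& scv)
{
  auto const json_path = cudf::string_scalar("$.features.key");  // illustrative path
  // Previously spelled cudf::strings::get_json_object(scv, json_path).
  return cudf::get_json_object(scv, json_path);
}
```

As the benchmark hunk shows, the two-argument call shape is unchanged; existing call sites only need the namespace (and include) update.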
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include + +#include + +void nvbench_merge_struct(nvbench::state& state) +{ + rmm::cuda_stream_view stream; + + auto const input1 = create_structs_data(state); + auto const sorted_input1 = + cudf::detail::sort(*input1, {}, {}, stream, rmm::mr::get_current_device_resource()); + + auto const input2 = create_structs_data(state); + auto const sorted_input2 = + cudf::detail::sort(*input2, {}, {}, stream, rmm::mr::get_current_device_resource()); + + stream.synchronize(); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + rmm::cuda_stream_view stream_view{launch.get_stream()}; + + cudf::detail::merge({*sorted_input1, *sorted_input2}, + {0}, + {cudf::order::ASCENDING}, + {}, + stream_view, + rmm::mr::get_current_device_resource()); + }); +} + +NVBENCH_BENCH(nvbench_merge_struct) + .set_name("merge_struct") + .add_int64_power_of_two_axis("NumRows", {10, 18, 26}) + .add_int64_axis("Depth", {0, 1, 8}) + .add_int64_axis("Nulls", {0, 1}); diff --git a/cpp/benchmarks/sort/rank_lists.cpp b/cpp/benchmarks/sort/rank_lists.cpp index 49dc409ebfc..c23f3c891f0 100644 --- a/cpp/benchmarks/sort/rank_lists.cpp +++ b/cpp/benchmarks/sort/rank_lists.cpp @@ -14,9 +14,10 @@ * limitations under the License. */ -#include "nested_types_common.hpp" #include "rank_types_common.hpp" +#include + #include #include diff --git a/cpp/benchmarks/sort/rank_structs.cpp b/cpp/benchmarks/sort/rank_structs.cpp index 85427e2128f..271b883e62a 100644 --- a/cpp/benchmarks/sort/rank_structs.cpp +++ b/cpp/benchmarks/sort/rank_structs.cpp @@ -14,8 +14,8 @@ * limitations under the License. */ -#include "nested_types_common.hpp" #include "rank_types_common.hpp" +#include #include diff --git a/cpp/benchmarks/sort/sort_lists.cpp b/cpp/benchmarks/sort/sort_lists.cpp index 4b04323a99f..2052de3688c 100644 --- a/cpp/benchmarks/sort/sort_lists.cpp +++ b/cpp/benchmarks/sort/sort_lists.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "nested_types_common.hpp" +#include #include diff --git a/cpp/benchmarks/sort/sort_structs.cpp b/cpp/benchmarks/sort/sort_structs.cpp index 1d54fa42f6f..3a3d1080ba0 100644 --- a/cpp/benchmarks/sort/sort_structs.cpp +++ b/cpp/benchmarks/sort/sort_structs.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "nested_types_common.hpp" +#include #include diff --git a/cpp/benchmarks/stream_compaction/distinct_count.cpp b/cpp/benchmarks/stream_compaction/distinct_count.cpp index 2b2c901b90f..3e324013d4e 100644 --- a/cpp/benchmarks/stream_compaction/distinct_count.cpp +++ b/cpp/benchmarks/stream_compaction/distinct_count.cpp @@ -40,6 +40,14 @@ static void bench_distinct_count(nvbench::state& state, nvbench::type_list auto const& data_column = data_table->get_column(0); auto const input_table = cudf::table_view{{data_column, data_column, data_column}}; + // Collect memory statistics for input and output. 
+ state.add_global_memory_reads(input_table.num_rows() * input_table.num_columns()); + state.add_global_memory_writes(1); + if (null_probability > 0) { + state.add_global_memory_reads( + input_table.num_columns() * cudf::bitmask_allocation_size_bytes(input_table.num_rows())); + } + auto mem_stats_logger = cudf::memory_stats_logger(); // init stats logger state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { diff --git a/cpp/benchmarks/text/vocab.cpp b/cpp/benchmarks/text/vocab.cpp new file mode 100644 index 00000000000..80942e2697d --- /dev/null +++ b/cpp/benchmarks/text/vocab.cpp @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include + +#include +#include + +#include +#include +#include +#include + +#include + +static void bench_vocab_tokenize(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + + if (static_cast(num_rows) * static_cast(row_width) >= + static_cast(std::numeric_limits::max())) { + state.skip("Skip benchmarks greater than size_type limit"); + } + + auto const column = [num_rows, row_width] { + data_profile const profile = data_profile_builder().no_validity().distribution( + cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + auto const col = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); + return cudf::strings::filter_characters_of_type( + cudf::strings_column_view(col->view()), + cudf::strings::string_character_types::ALL_TYPES, + cudf::string_scalar(" "), + cudf::strings::string_character_types::ALPHANUM); + }(); + cudf::strings_column_view input(column->view()); + + auto const vocab_col = [] { + data_profile const profile = data_profile_builder().no_validity().distribution( + cudf::type_id::STRING, distribution_id::NORMAL, 0, 15); + auto const col = create_random_column(cudf::type_id::STRING, row_count{100}, profile); + return cudf::strings::filter_characters_of_type( + cudf::strings_column_view(col->view()), + cudf::strings::string_character_types::ALL_TYPES, + cudf::string_scalar(""), + cudf::strings::string_character_types::ALPHANUM); + }(); + auto const vocab = nvtext::load_vocabulary(cudf::strings_column_view(vocab_col->view())); + + auto token_count = [input] { + auto const counts = nvtext::count_tokens(input); + auto const agg = cudf::make_sum_aggregation(); + auto const count = cudf::reduce(counts->view(), *agg, counts->type()); + return static_cast*>(count.get()) + ->value(cudf::get_default_stream()); + }(); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + auto chars_size = input.chars_size() + cudf::strings_column_view(vocab_col->view()).chars_size(); + state.add_global_memory_reads(chars_size); + state.add_global_memory_writes(token_count); + + auto const 
delimiter = cudf::string_scalar(""); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = nvtext::tokenize_with_vocabulary(input, *vocab, delimiter); + }); +} + +NVBENCH_BENCH(bench_vocab_tokenize) + .set_name("vocab_tokenize") + .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024}) + .add_int64_axis("num_rows", {262144, 524288, 1048576, 2097152, 4194304, 16777216}); diff --git a/cpp/benchmarks/transpose/transpose.cpp b/cpp/benchmarks/transpose/transpose.cpp index 2f41bda4b88..c2737325462 100644 --- a/cpp/benchmarks/transpose/transpose.cpp +++ b/cpp/benchmarks/transpose/transpose.cpp @@ -20,17 +20,19 @@ #include #include #include +#include #include #include static void BM_transpose(benchmark::State& state) { - auto count = state.range(0); + auto count = state.range(0); + constexpr auto column_type_id = cudf::type_id::INT32; auto int_column_generator = thrust::make_transform_iterator(thrust::counting_iterator(0), [count](int i) { return cudf::make_numeric_column( - cudf::data_type{cudf::type_id::INT32}, count, cudf::mask_state::ALL_VALID); + cudf::data_type{column_type_id}, count, cudf::mask_state::ALL_VALID); }); auto input_table = cudf::table(std::vector(int_column_generator, int_column_generator + count)); @@ -40,6 +42,17 @@ static void BM_transpose(benchmark::State& state) cuda_event_timer raii(state, true); auto output = cudf::transpose(input); } + + // Collect memory statistics. + auto const bytes_read = static_cast(input.num_columns()) * input.num_rows() * + sizeof(cudf::id_to_type); + auto const bytes_written = bytes_read; + // Account for nullability in input and output. + auto const null_bytes = 2 * static_cast(input.num_columns()) * + cudf::bitmask_allocation_size_bytes(input.num_rows()); + + state.SetBytesProcessed(static_cast(state.iterations()) * + (bytes_read + bytes_written + null_bytes)); } class Transpose : public cudf::benchmark {}; diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index 894dc9649e2..05aa5730b4d 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -53,19 +53,35 @@ function(find_libarrow_in_python_wheel PYARROW_VERSION) find_package(Arrow ${PYARROW_VERSION} MODULE REQUIRED GLOBAL) add_library(arrow_shared ALIAS Arrow::Arrow) - # When using the libarrow inside a wheel we must build libcudf with the old ABI because pyarrow's - # `libarrow.so` is compiled for manylinux2014 (centos7 toolchain) which uses the old ABI. Note - # that these flags will often be redundant because we build wheels in manylinux containers that - # actually have the old libc++ anyway, but setting them explicitly ensures correct and consistent - # behavior in all other cases such as aarch builds on newer manylinux or testing builds in newer - # containers. Note that tests will not build successfully without also propagating these options - # to builds of GTest. Similarly, benchmarks will not work without updating GBench (and possibly - # NVBench) builds. We are currently ignoring these limitations since we don't anticipate using - # this feature except for building wheels. 
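Several benchmark hunks in this change (copying/shift, stream_compaction/distinct_count, text/vocab, transpose) add explicit memory-traffic accounting so the harness can report achieved bandwidth rather than just elapsed time. A generic, hedged sketch of the nvbench side of that pattern; the reduction being timed is illustrative only:

```cpp
#include <nvbench/nvbench.cuh>

#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/reduce.h>

static void bench_reduce(nvbench::state& state)
{
  auto const num_elements = static_cast<std::size_t>(state.get_int64("num_elements"));
  thrust::device_vector<int> data(num_elements, 1);

  // Declare the expected global memory traffic; nvbench divides it by the
  // measured time to report throughput alongside the timing columns.
  state.add_global_memory_reads<int>(num_elements);
  state.add_global_memory_writes<int>(1);

  state.exec(nvbench::exec_tag::sync,
             [&](nvbench::launch&) { thrust::reduce(thrust::device, data.begin(), data.end()); });
}

NVBENCH_BENCH(bench_reduce).set_name("reduce").add_int64_axis("num_elements", {1 << 24});
```

The google-benchmark based files (shift, transpose) achieve the same effect through state.SetBytesProcessed, computing bytes read and written by hand as shown in those hunks.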
- target_compile_options( - Arrow::Arrow INTERFACE "$<$:-D_GLIBCXX_USE_CXX11_ABI=0>" - "$<$:-Xcompiler=-D_GLIBCXX_USE_CXX11_ABI=0>" + # When using the libarrow inside a wheel, whether or not libcudf may be built using the new C++11 + # ABI is dependent on whether the libarrow inside the wheel was compiled using that ABI because we + # need the arrow library that we bundle in cudf to be ABI-compatible with the one inside pyarrow. + # We determine what options to use by checking the glibc version on the current system, which is + # also how pip determines which manylinux-versioned pyarrow wheel to install. Note that tests will + # not build successfully without also propagating these options to builds of GTest. Similarly, + # benchmarks will not work without updating GBench (and possibly NVBench) builds. We are currently + # ignoring these limitations since we don't anticipate using this feature except for building + # wheels. + EXECUTE_PROCESS( + COMMAND ${CMAKE_C_COMPILER} -print-file-name=libc.so.6 + OUTPUT_VARIABLE GLIBC_EXECUTABLE + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + EXECUTE_PROCESS( + COMMAND ${GLIBC_EXECUTABLE} + OUTPUT_VARIABLE GLIBC_OUTPUT + OUTPUT_STRIP_TRAILING_WHITESPACE ) + STRING(REGEX MATCH "stable release version ([0-9]+\\.[0-9]+)" GLIBC_VERSION ${GLIBC_OUTPUT}) + STRING(REPLACE "stable release version " "" GLIBC_VERSION ${GLIBC_VERSION}) + STRING(REPLACE "." ";" GLIBC_VERSION_LIST ${GLIBC_VERSION}) + LIST(GET GLIBC_VERSION_LIST 1 GLIBC_VERSION_MINOR) + if(GLIBC_VERSION_MINOR LESS 28) + target_compile_options( + Arrow::Arrow INTERFACE "$<$:-D_GLIBCXX_USE_CXX11_ABI=0>" + "$<$:-Xcompiler=-D_GLIBCXX_USE_CXX11_ABI=0>" + ) + endif() rapids_export_package(BUILD Arrow cudf-exports) rapids_export_package(INSTALL Arrow cudf-exports) @@ -387,11 +403,19 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB endif() include("${rapids-cmake-dir}/export/find_package_root.cmake") - rapids_export_find_package_root(BUILD Arrow [=[${CMAKE_CURRENT_LIST_DIR}]=] cudf-exports) - if(ENABLE_PARQUET) - rapids_export_find_package_root(BUILD Parquet [=[${CMAKE_CURRENT_LIST_DIR}]=] cudf-exports) - rapids_export_find_package_root(BUILD ArrowDataset [=[${CMAKE_CURRENT_LIST_DIR}]=] cudf-exports) - endif() + rapids_export_find_package_root( + BUILD Arrow [=[${CMAKE_CURRENT_LIST_DIR}]=] EXPORT_SET cudf-exports + ) + rapids_export_find_package_root( + BUILD Parquet [=[${CMAKE_CURRENT_LIST_DIR}]=] + EXPORT_SET cudf-exports + CONDITION ENABLE_PARQUET + ) + rapids_export_find_package_root( + BUILD ArrowDataset [=[${CMAKE_CURRENT_LIST_DIR}]=] + EXPORT_SET cudf-exports + CONDITION ENABLE_PARQUET + ) set(ARROW_LIBRARIES "${ARROW_LIBRARIES}" @@ -403,7 +427,7 @@ if(NOT DEFINED CUDF_VERSION_Arrow) set(CUDF_VERSION_Arrow # This version must be kept in sync with the libarrow version pinned for builds in # dependencies.yaml. 
- 12.0.1 + 14.0.1 CACHE STRING "The version of Arrow to find (or build)" ) endif() diff --git a/cpp/cmake/thirdparty/get_cufile.cmake b/cpp/cmake/thirdparty/get_cufile.cmake index c0235eba508..bfdff3a99ff 100644 --- a/cpp/cmake/thirdparty/get_cufile.cmake +++ b/cpp/cmake/thirdparty/get_cufile.cmake @@ -21,10 +21,10 @@ function(find_and_configure_cufile) if(cuFile_FOUND AND NOT BUILD_SHARED_LIBS) include("${rapids-cmake-dir}/export/find_package_file.cmake") rapids_export_find_package_file( - BUILD "${CUDF_SOURCE_DIR}/cmake/Modules/FindcuFile.cmake" cudf-exports + BUILD "${CUDF_SOURCE_DIR}/cmake/Modules/FindcuFile.cmake" EXPORT_SET cudf-exports ) rapids_export_find_package_file( - INSTALL "${CUDF_SOURCE_DIR}/cmake/Modules/FindcuFile.cmake" cudf-exports + INSTALL "${CUDF_SOURCE_DIR}/cmake/Modules/FindcuFile.cmake" EXPORT_SET cudf-exports ) endif() endfunction() diff --git a/cpp/cmake/thirdparty/get_gtest.cmake b/cpp/cmake/thirdparty/get_gtest.cmake index 1363f43fae2..cfb219448f1 100644 --- a/cpp/cmake/thirdparty/get_gtest.cmake +++ b/cpp/cmake/thirdparty/get_gtest.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -30,7 +30,7 @@ function(find_and_configure_gtest) include("${rapids-cmake-dir}/export/find_package_root.cmake") rapids_export_find_package_root( - BUILD GTest [=[${CMAKE_CURRENT_LIST_DIR}]=] cudf-testing-exports + BUILD GTest [=[${CMAKE_CURRENT_LIST_DIR}]=] EXPORT_SET cudf-testing-exports ) endif() diff --git a/cpp/cmake/thirdparty/get_kvikio.cmake b/cpp/cmake/thirdparty/get_kvikio.cmake index e94e024d6c9..20712beec41 100644 --- a/cpp/cmake/thirdparty/get_kvikio.cmake +++ b/cpp/cmake/thirdparty/get_kvikio.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. 
You may obtain a copy of the License at @@ -25,10 +25,12 @@ function(find_and_configure_kvikio VERSION) OPTIONS "KvikIO_BUILD_EXAMPLES OFF" ) - if(KvikIO_BINARY_DIR) - include("${rapids-cmake-dir}/export/find_package_root.cmake") - rapids_export_find_package_root(BUILD KvikIO "${KvikIO_BINARY_DIR}" cudf-exports) - endif() + include("${rapids-cmake-dir}/export/find_package_root.cmake") + rapids_export_find_package_root( + BUILD KvikIO "${KvikIO_BINARY_DIR}" + EXPORT_SET cudf-exports + CONDITION KvikIO_BINARY_DIR + ) endfunction() diff --git a/cpp/cmake/thirdparty/get_libcudacxx.cmake b/cpp/cmake/thirdparty/get_libcudacxx.cmake index 0e03352c335..285d66287f3 100644 --- a/cpp/cmake/thirdparty/get_libcudacxx.cmake +++ b/cpp/cmake/thirdparty/get_libcudacxx.cmake @@ -22,16 +22,14 @@ function(find_and_configure_libcudacxx) include(${rapids-cmake-dir}/cpm/libcudacxx.cmake) rapids_cpm_libcudacxx(BUILD_EXPORT_SET cudf-exports INSTALL_EXPORT_SET cudf-exports) - if(libcudacxx_SOURCE_DIR) - # Store where CMake can find our custom Thrust install - include("${rapids-cmake-dir}/export/find_package_root.cmake") - rapids_export_find_package_root( - INSTALL - libcudacxx - [=[${CMAKE_CURRENT_LIST_DIR}/../../../include/libcudf/lib/rapids/cmake/libcudacxx]=] - cudf-exports - ) - endif() + # Store where CMake can find our custom Thrust install + include("${rapids-cmake-dir}/export/find_package_root.cmake") + rapids_export_find_package_root( + INSTALL libcudacxx + [=[${CMAKE_CURRENT_LIST_DIR}/../../../include/libcudf/lib/rapids/cmake/libcudacxx]=] + EXPORT_SET cudf-exports + CONDITION libcudacxx_SOURCE_DIR + ) endfunction() find_and_configure_libcudacxx() diff --git a/cpp/cmake/thirdparty/get_nvbench.cmake b/cpp/cmake/thirdparty/get_nvbench.cmake index f0642145fa0..bbd22693ba4 100644 --- a/cpp/cmake/thirdparty/get_nvbench.cmake +++ b/cpp/cmake/thirdparty/get_nvbench.cmake @@ -21,7 +21,7 @@ function(find_and_configure_nvbench) set(cudf_patch_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/patches") rapids_cpm_package_override("${cudf_patch_dir}/nvbench_override.json") - rapids_cpm_nvbench() + rapids_cpm_nvbench(BUILD_STATIC) endfunction() diff --git a/cpp/cmake/thirdparty/get_spdlog.cmake b/cpp/cmake/thirdparty/get_spdlog.cmake index fff5b84af0d..c0e07d02d94 100644 --- a/cpp/cmake/thirdparty/get_spdlog.cmake +++ b/cpp/cmake/thirdparty/get_spdlog.cmake @@ -27,7 +27,9 @@ function(find_and_configure_spdlog) NAMESPACE spdlog:: ) include("${rapids-cmake-dir}/export/find_package_root.cmake") - rapids_export_find_package_root(BUILD spdlog [=[${CMAKE_CURRENT_LIST_DIR}]=] cudf-exports) + rapids_export_find_package_root( + BUILD spdlog [=[${CMAKE_CURRENT_LIST_DIR}]=] EXPORT_SET cudf-exports + ) endif() endfunction() diff --git a/cpp/cmake/thirdparty/get_thrust.cmake b/cpp/cmake/thirdparty/get_thrust.cmake index 39a9de15fa6..67ed4287d7b 100644 --- a/cpp/cmake/thirdparty/get_thrust.cmake +++ b/cpp/cmake/thirdparty/get_thrust.cmake @@ -33,14 +33,13 @@ function(find_and_configure_thrust) INSTALL_EXPORT_SET cudf-exports ) - if(Thrust_SOURCE_DIR) - # Store where CMake can find our custom Thrust install - include("${rapids-cmake-dir}/export/find_package_root.cmake") - rapids_export_find_package_root( - INSTALL Thrust - [=[${CMAKE_CURRENT_LIST_DIR}/../../../include/libcudf/lib/rapids/cmake/thrust]=] cudf-exports - ) - endif() + # Store where CMake can find our custom Thrust install + include("${rapids-cmake-dir}/export/find_package_root.cmake") + rapids_export_find_package_root( + INSTALL Thrust 
[=[${CMAKE_CURRENT_LIST_DIR}/../../../include/libcudf/lib/rapids/cmake/thrust]=] + EXPORT_SET cudf-exports + CONDITION Thrust_SOURCE_DIR + ) endfunction() find_and_configure_thrust() diff --git a/cpp/cmake/thirdparty/patches/nvbench_override.json b/cpp/cmake/thirdparty/patches/nvbench_override.json index 7be868081b6..f85bdb9486c 100644 --- a/cpp/cmake/thirdparty/patches/nvbench_override.json +++ b/cpp/cmake/thirdparty/patches/nvbench_override.json @@ -9,8 +9,8 @@ "fixed_in" : "" }, { - "file" : "nvbench/use_existing_fmt.diff", - "issue" : "Fix add support for using an existing fmt [https://github.com/NVIDIA/nvbench/pull/125]", + "file" : "nvbench/nvml_with_static_builds.diff", + "issue" : "Add support for nvml with static nvbench [https://github.com/NVIDIA/nvbench/pull/148]", "fixed_in" : "" } ] diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile index b072d252881..adefaaa1479 100644 --- a/cpp/doxygen/Doxyfile +++ b/cpp/doxygen/Doxyfile @@ -38,7 +38,7 @@ PROJECT_NAME = libcudf # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = 23.10.00 +PROJECT_NUMBER = 23.12.00 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a @@ -2226,7 +2226,7 @@ SKIP_FUNCTION_MACROS = YES # the path). If a tag file is not located in the directory in which doxygen is # run, you must also specify the path to the tagfile here. -TAGFILES = rmm.tag=https://docs.rapids.ai/api/librmm/23.10 +TAGFILES = rmm.tag=https://docs.rapids.ai/api/librmm/23.12 # When a file name is specified after GENERATE_TAGFILE, doxygen will create a # tag file that is based on the input files it reads. See section "Linking to diff --git a/cpp/examples/README.md b/cpp/examples/README.md index b2e8dd399d0..7f2b769f4a5 100644 --- a/cpp/examples/README.md +++ b/cpp/examples/README.md @@ -7,3 +7,4 @@ Current examples: - Basic: demonstrates a basic use case with libcudf and building a custom application with libcudf - Strings: demonstrates using libcudf for accessing and creating strings columns and for building custom kernels for strings +- Nested Types: demonstrates using libcudf for some operations on nested types diff --git a/cpp/examples/basic/CMakeLists.txt b/cpp/examples/basic/CMakeLists.txt index 1c1952c4616..759a43b5627 100644 --- a/cpp/examples/basic/CMakeLists.txt +++ b/cpp/examples/basic/CMakeLists.txt @@ -8,23 +8,7 @@ project( LANGUAGES CXX CUDA ) -set(CPM_DOWNLOAD_VERSION v0.35.3) -file( - DOWNLOAD - https://github.com/cpm-cmake/CPM.cmake/releases/download/${CPM_DOWNLOAD_VERSION}/get_cpm.cmake - ${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake -) -include(${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake) - -set(CUDF_TAG branch-23.10) -CPMFindPackage( - NAME cudf GIT_REPOSITORY https://github.com/rapidsai/cudf - GIT_TAG ${CUDF_TAG} - GIT_SHALLOW - TRUE - SOURCE_SUBDIR - cpp -) +include(../fetch_dependencies.cmake) # Configure your project here add_executable(basic_example src/process_csv.cpp) diff --git a/cpp/examples/build.sh b/cpp/examples/build.sh index 7d389cd318d..001cdeec694 100755 --- a/cpp/examples/build.sh +++ b/cpp/examples/build.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. 
# libcudf examples build script @@ -14,18 +14,17 @@ LIB_BUILD_DIR=${LIB_BUILD_DIR:-$(readlink -f "${EXAMPLES_DIR}/../build")} ################################################################################ # Add individual libcudf examples build scripts down below -# Basic example -BASIC_EXAMPLE_DIR=${EXAMPLES_DIR}/basic -BASIC_EXAMPLE_BUILD_DIR=${BASIC_EXAMPLE_DIR}/build -# Configure -cmake -S ${BASIC_EXAMPLE_DIR} -B ${BASIC_EXAMPLE_BUILD_DIR} -Dcudf_ROOT="${LIB_BUILD_DIR}" -# Build -cmake --build ${BASIC_EXAMPLE_BUILD_DIR} -j${PARALLEL_LEVEL} - -# Strings example -STRINGS_EXAMPLE_DIR=${EXAMPLES_DIR}/strings -STRINGS_EXAMPLE_BUILD_DIR=${STRINGS_EXAMPLE_DIR}/build -# Configure -cmake -S ${STRINGS_EXAMPLE_DIR} -B ${STRINGS_EXAMPLE_BUILD_DIR} -Dcudf_ROOT="${LIB_BUILD_DIR}" -# Build -cmake --build ${STRINGS_EXAMPLE_BUILD_DIR} -j${PARALLEL_LEVEL} +build_example() { + example_dir=${1} + example_dir="${EXAMPLES_DIR}/${example_dir}" + build_dir="${example_dir}/build" + + # Configure + cmake -S ${example_dir} -B ${build_dir} -Dcudf_ROOT="${LIB_BUILD_DIR}" + # Build + cmake --build ${build_dir} -j${PARALLEL_LEVEL} +} + +build_example basic +build_example strings +build_example nested_types diff --git a/cpp/examples/fetch_dependencies.cmake b/cpp/examples/fetch_dependencies.cmake new file mode 100644 index 00000000000..dc86c6a9aa5 --- /dev/null +++ b/cpp/examples/fetch_dependencies.cmake @@ -0,0 +1,30 @@ +# ============================================================================= +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= +set(CPM_DOWNLOAD_VERSION v0.35.3) +file( + DOWNLOAD + https://github.com/cpm-cmake/CPM.cmake/releases/download/${CPM_DOWNLOAD_VERSION}/get_cpm.cmake + ${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake +) +include(${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake) + +set(CUDF_TAG branch-23.12) +CPMFindPackage( + NAME cudf GIT_REPOSITORY https://github.com/rapidsai/cudf + GIT_TAG ${CUDF_TAG} + GIT_SHALLOW + TRUE + SOURCE_SUBDIR + cpp +) diff --git a/cpp/examples/nested_types/CMakeLists.txt b/cpp/examples/nested_types/CMakeLists.txt new file mode 100644 index 00000000000..cb9430db237 --- /dev/null +++ b/cpp/examples/nested_types/CMakeLists.txt @@ -0,0 +1,16 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +cmake_minimum_required(VERSION 3.26.4) + +project( + nested_types + VERSION 0.0.1 + LANGUAGES CXX CUDA +) + +include(../fetch_dependencies.cmake) + +# Configure your project here +add_executable(deduplication deduplication.cpp) +target_link_libraries(deduplication PRIVATE cudf::cudf) +target_compile_features(deduplication PRIVATE cxx_std_17) diff --git a/cpp/examples/nested_types/deduplication.cpp b/cpp/examples/nested_types/deduplication.cpp new file mode 100644 index 00000000000..5969985cc72 --- /dev/null +++ b/cpp/examples/nested_types/deduplication.cpp @@ -0,0 +1,209 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +/** + * @file deduplication.cpp + * @brief Demonstrates usage of the libcudf APIs to perform operations on nested-type tables. + * + * The algorithms chosen to be demonstrated are to showcase nested-type row operators of three + * kinds: + * 1. hashing: Used by functions `count_aggregate` and `join_count` to hash inputs of any type + * 2. equality: Used by functions `count_aggregate` and `join_count` in conjunction with hashing + * to determine equality for nested types + * 3. lexicographic: Used by function `sort_keys` to create a lexicographical order for nested-types + * so as to enable sorting + * + */ + +/** + * @brief Create memory resource for libcudf functions + * + * @param pool Whether to use a pool memory resource. + * @return Memory resource instance + */ +std::shared_ptr create_memory_resource(bool pool) +{ + auto cuda_mr = std::make_shared(); + if (pool) { return rmm::mr::make_owning_wrapper(cuda_mr); } + return cuda_mr; +} + +/** + * @brief Read JSON input from file + * + * @param filepath path to input JSON file + * @return cudf::io::table_with_metadata + */ +cudf::io::table_with_metadata read_json(std::string filepath) +{ + auto source_info = cudf::io::source_info(filepath); + auto builder = cudf::io::json_reader_options::builder(source_info).lines(true); + auto options = builder.build(); + return cudf::io::read_json(options); +} + +/** + * @brief Write JSON output to file + * + * @param input table to write + * @param metadata metadata of input table read by JSON reader + * @param filepath path to output JSON file + */ +void write_json(cudf::table_view input, cudf::io::table_metadata metadata, std::string filepath) +{ + // write the data for inspection + auto sink_info = cudf::io::sink_info(filepath); + auto builder = cudf::io::json_writer_options::builder(sink_info, input).lines(true); + builder.metadata(metadata); + auto options = builder.build(); + cudf::io::write_json(options); +} + +/** + * @brief Aggregate count of duplicate rows in nested-type column + * + * @param input table to aggregate + * @return std::unique_ptr + */ +std::unique_ptr count_aggregate(cudf::table_view input) +{ + // Get count for each key + auto keys = cudf::table_view{{input.column(0)}}; + auto val = cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT32}, keys.num_rows()); + + cudf::groupby::groupby grpby_obj(keys); + std::vector requests; + requests.emplace_back(cudf::groupby::aggregation_request()); + auto agg = cudf::make_count_aggregation(); + requests[0].aggregations.push_back(std::move(agg)); + requests[0].values = *val; + auto agg_results = grpby_obj.aggregate(requests); + auto result_key = std::move(agg_results.first); + auto result_val = std::move(agg_results.second[0].results[0]); + + auto left_cols = result_key->release(); + 
left_cols.push_back(std::move(result_val)); + + return std::make_unique(std::move(left_cols)); +} + +/** + * @brief Join each row with its duplicate counts + * + * @param left left table + * @param right right table + * @return std::unique_ptr + */ +std::unique_ptr join_count(cudf::table_view left, cudf::table_view right) +{ + auto [left_indices, right_indices] = + cudf::inner_join(cudf::table_view{{left.column(0)}}, cudf::table_view{{right.column(0)}}); + auto new_left = cudf::gather(left, cudf::device_span{*left_indices}); + auto new_right = cudf::gather(right, cudf::device_span{*right_indices}); + + auto left_cols = new_left->release(); + auto right_cols = new_right->release(); + left_cols.push_back(std::move(right_cols[1])); + + return std::make_unique(std::move(left_cols)); +} + +/** + * @brief Sort nested-type column + * + * @param input table to sort + * @return std::unique_ptr + * + * @note if stability is desired, use `cudf::stable_sorted_order` + */ +std::unique_ptr sort_keys(cudf::table_view input) +{ + auto sort_order = cudf::sorted_order(cudf::table_view{{input.column(0)}}); + return cudf::gather(input, *sort_order); +} + +/** + * @brief Main for nested_types examples + * + * Command line parameters: + * 1. JSON input file name/path (default: "example.json") + * 2. JSON output file name/path (default: "output.json") + * 3. Memory resource (optional): "pool" or "cuda" (default: "pool") + * + * Example invocation from directory `cudf/cpp/examples/nested_types`: + * ./build/deduplication example.json output.json pool + * + */ +int main(int argc, char const** argv) +{ + std::string input_filepath; + std::string output_filepath; + std::string mr_name; + if (argc != 4 && argc != 1) { + std::cout << "Either provide all command-line arguments, or none to use defaults" << std::endl; + return 1; + } + if (argc == 1) { + input_filepath = "example.json"; + output_filepath = "output.json"; + mr_name = "pool"; + } else { + input_filepath = argv[1]; + output_filepath = argv[2]; + mr_name = argv[3]; + } + + auto pool = mr_name == "pool"; + auto resource = create_memory_resource(pool); + rmm::mr::set_current_device_resource(resource.get()); + + std::cout << "Reading " << input_filepath << "..." << std::endl; + // read input file + auto [input, metadata] = read_json(input_filepath); + + auto count = count_aggregate(input->view()); + + auto combined = join_count(input->view(), count->view()); + + auto sorted = sort_keys(combined->view()); + + metadata.schema_info.emplace_back("count"); + + std::cout << "Writing " << output_filepath << "..." 
<< std::endl; + write_json(sorted->view(), metadata, output_filepath); + + return 0; +} diff --git a/cpp/examples/nested_types/example.json b/cpp/examples/nested_types/example.json new file mode 100644 index 00000000000..efaa37817d6 --- /dev/null +++ b/cpp/examples/nested_types/example.json @@ -0,0 +1,5 @@ +{"features": {"key": "a1", "values": [{"info": "message_1", "type": "device_a", "dt": 1688750001}]}, "source": "network_a", "quality": 0.7} +{"features": {"key": "a2", "values": [{"info": "message_2", "type": "device_a", "dt": 1688750002}]}, "source": "network_a", "quality": 0.7} +{"features": {"key": "a3", "values": [{"info": "message_3", "type": "device_a", "dt": 1688750003}]}, "source": "network_b", "quality": 0.8} +{"features": {"key": "a1", "values": [{"info": "message_1", "type": "device_a", "dt": 1688750001}]}, "source": "network_b", "quality": 0.9} +{"features": {"key": "a4", "values": [{"info": "message_4", "type": "device_a", "dt": 1688750004}]}, "source": "network_b", "quality": 0.9} diff --git a/cpp/examples/strings/CMakeLists.txt b/cpp/examples/strings/CMakeLists.txt index 31a6b12a4bc..c90fa9dde16 100644 --- a/cpp/examples/strings/CMakeLists.txt +++ b/cpp/examples/strings/CMakeLists.txt @@ -8,23 +8,7 @@ project( LANGUAGES CXX CUDA ) -set(CPM_DOWNLOAD_VERSION v0.35.3) -file( - DOWNLOAD - https://github.com/cpm-cmake/CPM.cmake/releases/download/${CPM_DOWNLOAD_VERSION}/get_cpm.cmake - ${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake -) -include(${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake) - -set(CUDF_TAG branch-23.10) -CPMFindPackage( - NAME cudf GIT_REPOSITORY https://github.com/rapidsai/cudf - GIT_TAG ${CUDF_TAG} - GIT_SHALLOW - TRUE - SOURCE_SUBDIR - cpp -) +include(../fetch_dependencies.cmake) list(APPEND CUDF_CUDA_FLAGS --expt-extended-lambda --expt-relaxed-constexpr) diff --git a/cpp/include/cudf/ast/detail/expression_parser.hpp b/cpp/include/cudf/ast/detail/expression_parser.hpp index db0abe435b0..a36a831a7aa 100644 --- a/cpp/include/cudf/ast/detail/expression_parser.hpp +++ b/cpp/include/cudf/ast/detail/expression_parser.hpp @@ -67,8 +67,8 @@ struct alignas(8) device_data_reference { bool operator==(device_data_reference const& rhs) const { - return std::tie(data_index, reference_type, table_source) == - std::tie(rhs.data_index, rhs.reference_type, rhs.table_source); + return std::tie(data_index, data_type, reference_type, table_source) == + std::tie(rhs.data_index, rhs.data_type, rhs.reference_type, rhs.table_source); } }; diff --git a/cpp/include/cudf/binaryop.hpp b/cpp/include/cudf/binaryop.hpp index 77d6a4d1e89..9df4b4eb00f 100644 --- a/cpp/include/cudf/binaryop.hpp +++ b/cpp/include/cudf/binaryop.hpp @@ -102,6 +102,7 @@ enum class binary_operator : int32_t { * @param rhs The right operand column * @param op The binary operator * @param output_type The desired data type of the output column + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return Output column of `output_type` type containing the result of * the binary operation @@ -115,6 +116,7 @@ std::unique_ptr binary_operation( column_view const& rhs, binary_operator op, data_type output_type, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -131,6 +133,7 @@ std::unique_ptr binary_operation( * @param rhs The right operand scalar * @param op The binary operator * @param output_type The 
desired data type of the output column + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return Output column of `output_type` type containing the result of * the binary operation @@ -144,6 +147,7 @@ std::unique_ptr binary_operation( scalar const& rhs, binary_operator op, data_type output_type, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -158,6 +162,7 @@ std::unique_ptr binary_operation( * @param rhs The right operand column * @param op The binary operator * @param output_type The desired data type of the output column + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return Output column of `output_type` type containing the result of * the binary operation @@ -172,6 +177,7 @@ std::unique_ptr binary_operation( column_view const& rhs, binary_operator op, data_type output_type, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -189,6 +195,7 @@ std::unique_ptr binary_operation( * @param output_type The desired data type of the output column. It is assumed * that output_type is compatible with the output data type * of the function in the PTX code + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return Output column of `output_type` type containing the result of * the binary operation @@ -201,6 +208,7 @@ std::unique_ptr binary_operation( column_view const& rhs, std::string const& ptx, data_type output_type, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index 35851a99822..b1ff0bbaea7 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -16,6 +16,7 @@ #pragma once #include +#include #include #include #include @@ -442,10 +443,11 @@ class alignas(16) column_device_view : public detail::column_device_view_base { __device__ T element(size_type element_index) const noexcept { size_type index = element_index + offset(); // account for this view's _offset - auto const* d_offsets = d_children[strings_column_view::offsets_column_index].data(); char const* d_strings = d_children[strings_column_view::chars_column_index].data(); - size_type offset = d_offsets[index]; - return string_view{d_strings + offset, d_offsets[index + 1] - offset}; + auto const offsets = d_children[strings_column_view::offsets_column_index]; + auto const itr = cudf::detail::input_offsetalator(offsets.head(), offsets.type()); + auto const offset = itr[index]; + return string_view{d_strings + offset, static_cast(itr[index + 1] - offset)}; } private: diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh index 955f9914632..c9975ef2199 100644 --- a/cpp/include/cudf/detail/gather.cuh +++ b/cpp/include/cudf/detail/gather.cuh @@ -673,14 +673,20 @@ std::unique_ptr gather(table_view const& source_table, mr)); } - auto const nullable = bounds_policy == out_of_bounds_policy::NULLIFY || 
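The binaryop.hpp hunks above add an explicit stream parameter (defaulting to cudf::get_default_stream()) to each binary_operation overload. A minimal usage sketch; the operand columns, output type, and caller-provided stream are illustrative assumptions:

```cpp
#include <cudf/binaryop.hpp>
#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/types.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

#include <memory>

// Add two INT32 columns on a caller-provided stream instead of the default stream.
std::unique_ptr<cudf::column> add_on_stream(cudf::column_view const& lhs,
                                            cudf::column_view const& rhs,
                                            rmm::cuda_stream_view stream)
{
  return cudf::binary_operation(lhs,
                                rhs,
                                cudf::binary_operator::ADD,
                                cudf::data_type{cudf::type_id::INT32},
                                stream,  // parameter added in this change
                                rmm::mr::get_current_device_resource());
}
```

Callers that omit both trailing optional arguments are unaffected, since the stream defaults to cudf::get_default_stream().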
- std::any_of(source_table.begin(), source_table.end(), [](auto const& col) { - return col.nullable(); - }); - if (nullable) { - auto const op = bounds_policy == out_of_bounds_policy::NULLIFY ? gather_bitmask_op::NULLIFY - : gather_bitmask_op::DONT_CHECK; - gather_bitmask(source_table, gather_map_begin, destination_columns, op, stream, mr); + auto needs_new_bitmask = bounds_policy == out_of_bounds_policy::NULLIFY || + cudf::has_nested_nullable_columns(source_table); + if (needs_new_bitmask) { + needs_new_bitmask = needs_new_bitmask || cudf::has_nested_nulls(source_table); + if (needs_new_bitmask) { + auto const op = bounds_policy == out_of_bounds_policy::NULLIFY + ? gather_bitmask_op::NULLIFY + : gather_bitmask_op::DONT_CHECK; + gather_bitmask(source_table, gather_map_begin, destination_columns, op, stream, mr); + } else { + for (size_type i = 0; i < source_table.num_columns(); ++i) { + set_all_valid_null_masks(source_table.column(i), *destination_columns[i], stream, mr); + } + } } return std::make_unique
(std::move(destination_columns)); diff --git a/cpp/include/cudf/detail/indexalator.cuh b/cpp/include/cudf/detail/indexalator.cuh index 6532dae3695..4d261c54b29 100644 --- a/cpp/include/cudf/detail/indexalator.cuh +++ b/cpp/include/cudf/detail/indexalator.cuh @@ -56,10 +56,69 @@ namespace detail { * auto result = thrust::find(thrust::device, begin, end, size_type{12} ); * @endcode */ -using input_indexalator = input_normalator; +struct input_indexalator : base_normalator { + friend struct base_normalator; // for CRTP + + using reference = cudf::size_type const; // this keeps STL and thrust happy + + input_indexalator() = default; + input_indexalator(input_indexalator const&) = default; + input_indexalator(input_indexalator&&) = default; + input_indexalator& operator=(input_indexalator const&) = default; + input_indexalator& operator=(input_indexalator&&) = default; + + /** + * @brief Indirection operator returns the value at the current iterator position + */ + __device__ inline cudf::size_type operator*() const { return operator[](0); } + + /** + * @brief Dispatch functor for resolving a Integer value from any integer type + */ + struct normalize_type { + template ())> + __device__ cudf::size_type operator()(void const* tp) + { + return static_cast(*static_cast(tp)); + } + template ())> + __device__ cudf::size_type operator()(void const*) + { + CUDF_UNREACHABLE("only integral types are supported"); + } + }; + + /** + * @brief Array subscript operator returns a value at the input + * `idx` position as a `Integer` value. + */ + __device__ inline cudf::size_type operator[](size_type idx) const + { + void const* tp = p_ + (idx * this->width_); + return type_dispatcher(this->dtype_, normalize_type{}, tp); + } + + /** + * @brief Create an input index normalizing iterator + * + * Use the indexalator_factory to create an iterator instance. + * + * @param data Pointer to an integer array in device memory. + * @param dtype Type of data in data + * @param offset Applied to the data pointer per size of the type + */ + CUDF_HOST_DEVICE input_indexalator(void const* data, data_type dtype, cudf::size_type offset = 0) + : base_normalator(dtype), p_{static_cast(data)} + { + p_ += offset * this->width_; + } + + protected: + char const* p_; /// pointer to the integer data in device memory +}; /** - * @brief The index normalizing output iterator. + * @brief The index normalizing output iterator * * This is an iterator that can be used for index types (integers) without * requiring a type-specific instance. It can be used for any iterator @@ -82,7 +141,75 @@ using input_indexalator = input_normalator; * thrust::less()); * @endcode */ -using output_indexalator = output_normalator; +struct output_indexalator : base_normalator { + friend struct base_normalator; // for CRTP + + using reference = output_indexalator const&; // required for output iterators + + output_indexalator() = default; + output_indexalator(output_indexalator const&) = default; + output_indexalator(output_indexalator&&) = default; + output_indexalator& operator=(output_indexalator const&) = default; + output_indexalator& operator=(output_indexalator&&) = default; + + /** + * @brief Indirection operator returns this iterator instance in order + * to capture the `operator=(Integer)` calls. + */ + __device__ inline reference operator*() const { return *this; } + + /** + * @brief Array subscript operator returns an iterator instance at the specified `idx` position. 
+ * + * This allows capturing the subsequent `operator=(Integer)` call in this class. + */ + __device__ inline output_indexalator const operator[](size_type idx) const + { + output_indexalator tmp{*this}; + tmp.p_ += (idx * this->width_); + return tmp; + } + + /** + * @brief Dispatch functor for setting the index value from a size_type value. + */ + struct normalize_type { + template ())> + __device__ void operator()(void* tp, cudf::size_type const value) + { + (*static_cast(tp)) = static_cast(value); + } + template ())> + __device__ void operator()(void*, cudf::size_type const) + { + CUDF_UNREACHABLE("only index types are supported"); + } + }; + + /** + * @brief Assign an Integer value to the current iterator position + */ + __device__ inline reference operator=(cudf::size_type const value) const + { + void* tp = p_; + type_dispatcher(this->dtype_, normalize_type{}, tp, value); + return *this; + } + + /** + * @brief Create an output normalizing iterator + * + * @param data Pointer to an integer array in device memory. + * @param dtype Type of data in data + */ + CUDF_HOST_DEVICE output_indexalator(void* data, data_type dtype) + : base_normalator(dtype), p_{static_cast(data)} + { + } + + protected: + char* p_; /// pointer to the integer data in device memory +}; /** * @brief Use this class to create an indexalator instance. @@ -92,14 +219,12 @@ struct indexalator_factory { * @brief A type_dispatcher functor to create an input iterator from an indices column. */ struct input_indexalator_fn { - template ()>* = nullptr> + template ())> input_indexalator operator()(column_view const& indices) { return input_indexalator(indices.data(), indices.type()); } - template ()>* = nullptr> + template ())> input_indexalator operator()(Args&&... args) { CUDF_FAIL("indices must be an index type"); @@ -110,16 +235,14 @@ struct indexalator_factory { * @brief Use this class to create an indexalator to a scalar index. */ struct input_indexalator_scalar_fn { - template ()>* = nullptr> + template ())> input_indexalator operator()(scalar const& index) { // note: using static_cast const&>(index) creates a copy auto const scalar_impl = static_cast const*>(&index); return input_indexalator(scalar_impl->data(), index.type()); } - template ()>* = nullptr> + template ())> input_indexalator operator()(Args&&... args) { CUDF_FAIL("scalar must be an index type"); @@ -130,14 +253,12 @@ struct indexalator_factory { * @brief A type_dispatcher functor to create an output iterator from an indices column. */ struct output_indexalator_fn { - template ()>* = nullptr> + template ())> output_indexalator operator()(mutable_column_view const& indices) { return output_indexalator(indices.data(), indices.type()); } - template ()>* = nullptr> + template ())> output_indexalator operator()(Args&&... args) { CUDF_FAIL("indices must be an index type"); diff --git a/cpp/include/cudf/detail/interop.hpp b/cpp/include/cudf/detail/interop.hpp index 44024333239..8124471982d 100644 --- a/cpp/include/cudf/detail/interop.hpp +++ b/cpp/include/cudf/detail/interop.hpp @@ -194,5 +194,18 @@ std::unique_ptr
from_arrow(arrow::Table const& input_table, std::unique_ptr from_arrow(arrow::Scalar const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); + +/** + * @brief Return a maximum precision for a given type. + * + * @tparam T the type to get the maximum precision for + */ +template +constexpr std::size_t max_precision() +{ + auto constexpr num_bits = sizeof(T) * 8; + return std::floor(num_bits * std::log(2) / std::log(10)); +} + } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/merge.cuh b/cpp/include/cudf/detail/merge.cuh deleted file mode 100644 index e8e9b080a92..00000000000 --- a/cpp/include/cudf/detail/merge.cuh +++ /dev/null @@ -1,166 +0,0 @@ -/* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -#include -#include -#include - -namespace cudf { -namespace detail { -/** - * @brief Source table identifier to copy data from. - */ -enum class side : bool { LEFT, RIGHT }; - -/** - * @brief Tagged index type: `thrust::get<0>` indicates left/right side, - * `thrust::get<1>` indicates the row index - */ -using index_type = thrust::pair; - -/** - * @brief Vector of `index_type` values. - */ -using index_vector = rmm::device_uvector; - -/** - * @brief tagged_element_relational_comparator uses element_relational_comparator to provide - * "tagged-index" comparison logic. - * - * Special treatment is necessary in several thrust algorithms (e.g., merge()) where - * the index affinity to the side is not guaranteed; i.e., the algorithms rely on - * binary functors (predicates) where the operands may transparently switch sides. - * - * For example, - * thrust::merge(left_container, - * right_container, - * predicate(lhs, rhs){...}); - * can create 4 different use-cases, inside predicate(...): - * - * 1. lhs refers to the left container; rhs to the right container; - * 2. vice-versa; - * 3. both lhs and rhs actually refer to the left container; - * 4. both lhs and rhs actually refer to the right container; - * - * Because of that, one cannot rely on the predicate having *fixed* references to the containers. - * Each invocation may land in a different situation (among the 4 above) than any other invocation. - * Also, one cannot just manipulate lhs, rhs (indices) alone; because, if predicate always applies - * one index to one container and the other index to the other container, - * switching the indices alone won't suffice in the cases (3) or (4), - * where the also the containers must be changed (to just one instead of two) - * independently of indices; - * - * As a result, a special comparison logic is necessary whereby the index is "tagged" with side - * information and consequently comparator functors (predicates) must operate on these tagged - * indices rather than on raw indices. 
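As a stand-alone illustration of the tagged-index idea the comment above describes (this sketch is not part of the diff; std::pair stands in for thrust::pair and the row type for cudf::size_type):

#include <cstdio>
#include <utility>

// A "tagged" merge index couples the source side with a row offset so that a
// comparison predicate can resolve which table each operand refers to,
// regardless of how thrust::merge pairs up the operands.
enum class side : bool { LEFT, RIGHT };
using index_type = std::pair<side, int>;

int main()
{
  index_type left_row{side::LEFT, 3};    // row 3 of the left table
  index_type right_row{side::RIGHT, 7};  // row 7 of the right table
  auto const row_of = [](index_type i) { return i.second; };
  std::printf("compare left row %d with right row %d\n", row_of(left_row), row_of(right_row));
  return 0;
}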
- */ -template -struct tagged_element_relational_comparator { - __host__ __device__ tagged_element_relational_comparator(column_device_view lhs, - column_device_view rhs, - null_order null_precedence) - : lhs{lhs}, rhs{rhs}, null_precedence{null_precedence} - { - } - - [[nodiscard]] __device__ weak_ordering compare(index_type lhs_tagged_index, - index_type rhs_tagged_index) const noexcept - { - auto const [l_side, l_indx] = lhs_tagged_index; - auto const [r_side, r_indx] = rhs_tagged_index; - - column_device_view const* ptr_left_dview{l_side == side::LEFT ? &lhs : &rhs}; - column_device_view const* ptr_right_dview{r_side == side::LEFT ? &lhs : &rhs}; - - auto erl_comparator = element_relational_comparator( - nullate::DYNAMIC{has_nulls}, *ptr_left_dview, *ptr_right_dview, null_precedence); - - return cudf::type_dispatcher(lhs.type(), erl_comparator, l_indx, r_indx); - } - - private: - column_device_view lhs; - column_device_view rhs; - null_order null_precedence; -}; - -/** - * @brief The equivalent of `row_lexicographic_comparator` for tagged indices. - */ -template -struct row_lexicographic_tagged_comparator { - row_lexicographic_tagged_comparator(table_device_view lhs, - table_device_view rhs, - order const* column_order = nullptr, - null_order const* null_precedence = nullptr) - : _lhs{lhs}, _rhs{rhs}, _column_order{column_order}, _null_precedence{null_precedence} - { - // Add check for types to be the same. - CUDF_EXPECTS(_lhs.num_columns() == _rhs.num_columns(), "Mismatched number of columns."); - } - - __device__ bool operator()(index_type lhs_tagged_index, - index_type rhs_tagged_index) const noexcept - { - for (size_type i = 0; i < _lhs.num_columns(); ++i) { - bool ascending = (_column_order == nullptr) or (_column_order[i] == order::ASCENDING); - - null_order null_precedence = - _null_precedence == nullptr ? null_order::BEFORE : _null_precedence[i]; - - auto comparator = tagged_element_relational_comparator{ - _lhs.column(i), _rhs.column(i), null_precedence}; - - weak_ordering state = comparator.compare(lhs_tagged_index, rhs_tagged_index); - - if (state == weak_ordering::EQUIVALENT) { continue; } - - return state == (ascending ? weak_ordering::LESS : weak_ordering::GREATER); - } - return false; - } - - private: - table_device_view _lhs; - table_device_view _rhs; - null_order const* _null_precedence{}; - order const* _column_order{}; -}; - -/** - * @copydoc std::unique_ptr merge( - * std::vector const& tables_to_merge, - * std::vector const& key_cols, - * std::vector const& column_order, - * std::vector const& null_precedence, - * rmm::mr::device_memory_resource* mr) - * - * @param stream CUDA stream used for device memory operations and kernel launches - */ -std::unique_ptr merge(std::vector const& tables_to_merge, - std::vector const& key_cols, - std::vector const& column_order, - std::vector const& null_precedence, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - -} // namespace detail -} // namespace cudf diff --git a/cpp/include/cudf/detail/merge.hpp b/cpp/include/cudf/detail/merge.hpp new file mode 100644 index 00000000000..2167a484214 --- /dev/null +++ b/cpp/include/cudf/detail/merge.hpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2018-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +namespace cudf { +namespace detail { + +/** + * @brief Source table identifier to copy data from. + */ +enum class side : bool { LEFT, RIGHT }; + +/** + * @brief Tagged index type: `thrust::get<0>` indicates left/right side, + * `thrust::get<1>` indicates the row index + */ +using index_type = thrust::pair; + +/** + * @brief Vector of `index_type` values. + */ +using index_vector = rmm::device_uvector; + +/** + * @copydoc std::unique_ptr merge( + * std::vector const& tables_to_merge, + * std::vector const& key_cols, + * std::vector const& column_order, + * std::vector const& null_precedence, + * rmm::mr::device_memory_resource* mr) + * + * @param stream CUDA stream used for device memory operations and kernel launches + */ +std::unique_ptr merge(std::vector const& tables_to_merge, + std::vector const& key_cols, + std::vector const& column_order, + std::vector const& null_precedence, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +} // namespace detail +} // namespace cudf diff --git a/cpp/include/cudf/detail/normalizing_iterator.cuh b/cpp/include/cudf/detail/normalizing_iterator.cuh index 51b3133f84f..8f90afc3e57 100644 --- a/cpp/include/cudf/detail/normalizing_iterator.cuh +++ b/cpp/include/cudf/detail/normalizing_iterator.cuh @@ -33,8 +33,8 @@ namespace detail { * @tparam Integer The type the iterator normalizes to */ template -struct base_normalator { - static_assert(std::is_integral_v); +struct alignas(16) base_normalator { + static_assert(cudf::is_index_type()); using difference_type = std::ptrdiff_t; using value_type = Integer; using pointer = Integer*; @@ -202,165 +202,43 @@ struct base_normalator { return static_cast(*this).p_ >= rhs.p_; } - protected: - /** - * @brief Constructor assigns width and type member variables for base class. - */ - explicit base_normalator(data_type dtype) : width_(size_of(dtype)), dtype_(dtype) {} - - int width_; /// integer type width = 1,2,4, or 8 - data_type dtype_; /// for type-dispatcher calls -}; - -/** - * @brief The integer normalizing input iterator - * - * This is an iterator that can be used for index types (integers) without - * requiring a type-specific instance. It can be used for any iterator - * interface for reading an array of integer values of type - * int8, int16, int32, int64, uint8, uint16, uint32, or uint64. 
- * Reading specific elements always return a type of `Integer` - * - * @tparam Integer Type returned by all read functions - */ -template -struct input_normalator : base_normalator, Integer> { - friend struct base_normalator, Integer>; // for CRTP - - using reference = Integer const; // this keeps STL and thrust happy - - input_normalator() = default; - input_normalator(input_normalator const&) = default; - input_normalator(input_normalator&&) = default; - input_normalator& operator=(input_normalator const&) = default; - input_normalator& operator=(input_normalator&&) = default; - - /** - * @brief Indirection operator returns the value at the current iterator position - */ - __device__ inline Integer operator*() const { return operator[](0); } - - /** - * @brief Dispatch functor for resolving a Integer value from any integer type - */ - struct normalize_type { - template >* = nullptr> - __device__ Integer operator()(void const* tp) - { - return static_cast(*static_cast(tp)); - } - template >* = nullptr> - __device__ Integer operator()(void const*) + private: + struct integer_sizeof_fn { + template ())> + CUDF_HOST_DEVICE constexpr std::size_t operator()() const { +#ifndef __CUDA_ARCH__ + CUDF_FAIL("only integral types are supported"); +#else CUDF_UNREACHABLE("only integral types are supported"); +#endif } - }; - - /** - * @brief Array subscript operator returns a value at the input - * `idx` position as a `Integer` value. - */ - __device__ inline Integer operator[](size_type idx) const - { - void const* tp = p_ + (idx * this->width_); - return type_dispatcher(this->dtype_, normalize_type{}, tp); - } - - /** - * @brief Create an input index normalizing iterator. - * - * Use the indexalator_factory to create an iterator instance. - * - * @param data Pointer to an integer array in device memory. - * @param data_type Type of data in data - */ - input_normalator(void const* data, data_type dtype) - : base_normalator, Integer>(dtype), p_{static_cast(data)} - { - } - - char const* p_; /// pointer to the integer data in device memory -}; - -/** - * @brief The integer normalizing output iterator - * - * This is an iterator that can be used for index types (integers) without - * requiring a type-specific instance. It can be used for any iterator - * interface for writing an array of integer values of type - * int8, int16, int32, int64, uint8, uint16, uint32, or uint64. - * Setting specific elements always accept the `Integer` type values. - * - * @tparam Integer The type used for all write functions - */ -template -struct output_normalator : base_normalator, Integer> { - friend struct base_normalator, Integer>; // for CRTP - - using reference = output_normalator const&; // required for output iterators - - output_normalator() = default; - output_normalator(output_normalator const&) = default; - output_normalator(output_normalator&&) = default; - output_normalator& operator=(output_normalator const&) = default; - output_normalator& operator=(output_normalator&&) = default; - - /** - * @brief Indirection operator returns this iterator instance in order - * to capture the `operator=(Integer)` calls. - */ - __device__ inline output_normalator const& operator*() const { return *this; } - - /** - * @brief Array subscript operator returns an iterator instance at the specified `idx` position. - * - * This allows capturing the subsequent `operator=(Integer)` call in this class. 
- */ - __device__ inline output_normalator const operator[](size_type idx) const - { - output_normalator tmp{*this}; - tmp.p_ += (idx * this->width_); - return tmp; - } - - /** - * @brief Dispatch functor for setting the index value from a size_type value. - */ - struct normalize_type { - template >* = nullptr> - __device__ void operator()(void* tp, Integer const value) - { - (*static_cast(tp)) = static_cast(value); - } - template >* = nullptr> - __device__ void operator()(void*, Integer const) + template ())> + CUDF_HOST_DEVICE constexpr std::size_t operator()() const noexcept { - CUDF_UNREACHABLE("only index types are supported"); + return sizeof(T); } }; + protected: /** - * @brief Assign an Integer value to the current iterator position + * @brief Constructor assigns width and type member variables for base class. */ - __device__ inline output_normalator const& operator=(Integer const value) const + explicit CUDF_HOST_DEVICE base_normalator(data_type dtype) : dtype_(dtype) { - void* tp = p_; - type_dispatcher(this->dtype_, normalize_type{}, tp, value); - return *this; + width_ = static_cast(type_dispatcher(dtype, integer_sizeof_fn{})); } /** - * @brief Create an output normalizing iterator - * - * @param data Pointer to an integer array in device memory. - * @param data_type Type of data in data + * @brief Constructor assigns width and type member variables for base class. */ - output_normalator(void* data, data_type dtype) - : base_normalator, Integer>(dtype), p_{static_cast(data)} + explicit CUDF_HOST_DEVICE base_normalator(data_type dtype, int32_t width) + : width_(width), dtype_(dtype) { } - char* p_; /// pointer to the integer data in device memory + int32_t width_; /// integer type width = 1,2,4, or 8 + data_type dtype_; /// for type-dispatcher calls }; } // namespace detail diff --git a/cpp/include/cudf/detail/null_mask.hpp b/cpp/include/cudf/detail/null_mask.hpp index 8c10bbe416f..74e2ccd2ea1 100644 --- a/cpp/include/cudf/detail/null_mask.hpp +++ b/cpp/include/cudf/detail/null_mask.hpp @@ -15,6 +15,7 @@ */ #pragma once +#include #include #include #include @@ -259,6 +260,22 @@ cudf::size_type inplace_bitmask_and(device_span dest_mask, size_type mask_size_bits, rmm::cuda_stream_view stream); +/** + * @brief Recursively set valid null masks for all children. + * + * This function applies all valid null masks to the output column if input column satisfies + * `nullable() == true` condition + * + * @param input input column to check for nullability + * @param output output column to mirror nullability of input + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + */ +void set_all_valid_null_masks(column_view const& input, + column& output, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/offsets_iterator.cuh b/cpp/include/cudf/detail/offsets_iterator.cuh new file mode 100644 index 00000000000..3eb77b32353 --- /dev/null +++ b/cpp/include/cudf/detail/offsets_iterator.cuh @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace cudf { +namespace detail { + +/** + * @brief The offsets normalizing input iterator + * + * This is an iterator that can be used for offsets where the underlying + * type may be int32_t or int64_t. + * + * Use the offsetalator_factory to create an appropriate input iterator + * from an offsets column_view. + */ +struct input_offsetalator : base_normalator { + friend struct base_normalator; // for CRTP + + using reference = int64_t const; // this keeps STL and thrust happy + + input_offsetalator() = default; + input_offsetalator(input_offsetalator const&) = default; + input_offsetalator(input_offsetalator&&) = default; + input_offsetalator& operator=(input_offsetalator const&) = default; + input_offsetalator& operator=(input_offsetalator&&) = default; + + /** + * @brief Indirection operator returns the value at the current iterator position + */ + __device__ inline int64_t operator*() const { return operator[](0); } + + /** + * @brief Array subscript operator returns a value at the input + * `idx` position as a int64_t value. + */ + __device__ inline int64_t operator[](size_type idx) const + { + void const* tp = p_ + (idx * this->width_); + return this->width_ == sizeof(int32_t) ? static_cast(*static_cast(tp)) + : *static_cast(tp); + } + + /** + * @brief Create an input index normalizing iterator. + * + * Use the indexalator_factory to create an iterator instance. + * + * @param data Pointer to an integer array in device memory. + * @param dtype Type of data in data + */ + CUDF_HOST_DEVICE input_offsetalator(void const* data, data_type dtype) + : base_normalator( + dtype, dtype.id() == type_id::INT32 ? sizeof(int32_t) : sizeof(int64_t)), + p_{static_cast(data)} + { +#ifndef __CUDA_ARCH__ + CUDF_EXPECTS(dtype.id() == type_id::INT32 || dtype.id() == type_id::INT64, + "Unexpected offsets type"); +#else + cudf_assert((dtype.id() == type_id::INT32 || dtype.id() == type_id::INT64) && + "Unexpected offsets type"); +#endif + } + + protected: + char const* p_; /// pointer to the integer data in device memory +}; + +/** + * @brief The offsets normalizing output iterator + * + * This is an iterator that can be used for storing offsets values + * where the underlying type may be either int32_t or int64_t. + * + * Use the offsetalator_factory to create an appropriate output iterator + * from a mutable_column_view. + * + */ +struct output_offsetalator : base_normalator { + friend struct base_normalator; // for CRTP + + using reference = output_offsetalator const&; // required for output iterators + + output_offsetalator() = default; + output_offsetalator(output_offsetalator const&) = default; + output_offsetalator(output_offsetalator&&) = default; + output_offsetalator& operator=(output_offsetalator const&) = default; + output_offsetalator& operator=(output_offsetalator&&) = default; + + /** + * @brief Indirection operator returns this iterator instance in order + * to capture the `operator=(int64)` calls. 
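The width-based dispatch both offsetalator structs rely on can be seen in isolation in the following stand-alone sketch (plain host C++, not the cudf code): the element width chosen at construction decides whether each position is read as int32_t or int64_t.

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Core trick of the offsetalator: walk a type-erased buffer with a char
// pointer plus a runtime element width, then reinterpret each element as
// either int32_t or int64_t.
int64_t read_offset(void const* data, std::size_t width, std::ptrdiff_t idx)
{
  auto const* p = static_cast<char const*>(data) + idx * static_cast<std::ptrdiff_t>(width);
  return width == sizeof(int32_t) ? static_cast<int64_t>(*reinterpret_cast<int32_t const*>(p))
                                  : *reinterpret_cast<int64_t const*>(p);
}

int main()
{
  int32_t narrow[] = {0, 5, 9};
  int64_t wide[]   = {0, 5, 9};
  std::printf("%lld %lld\n",
              static_cast<long long>(read_offset(narrow, sizeof(int32_t), 2)),
              static_cast<long long>(read_offset(wide, sizeof(int64_t), 2)));
  return 0;
}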
+ */ + __device__ inline output_offsetalator const& operator*() const { return *this; } + + /** + * @brief Array subscript operator returns an iterator instance at the specified `idx` position. + * + * This allows capturing the subsequent `operator=(int64)` call in this class. + */ + __device__ inline output_offsetalator const operator[](size_type idx) const + { + output_offsetalator tmp{*this}; + tmp.p_ += (idx * this->width_); + return tmp; + } + + /** + * @brief Assign an offset value to the current iterator position + */ + __device__ inline output_offsetalator const& operator=(int64_t const value) const + { + void* tp = p_; + if (this->width_ == sizeof(int32_t)) { + (*static_cast(tp)) = static_cast(value); + } else { + (*static_cast(tp)) = value; + } + return *this; + } + + /** + * @brief Create an output offsets iterator + * + * @param data Pointer to an integer array in device memory. + * @param dtype Type of data in data + */ + CUDF_HOST_DEVICE output_offsetalator(void* data, data_type dtype) + : base_normalator( + dtype, dtype.id() == type_id::INT32 ? sizeof(int32_t) : sizeof(int64_t)), + p_{static_cast(data)} + { +#ifndef __CUDA_ARCH__ + CUDF_EXPECTS(dtype.id() == type_id::INT32 || dtype.id() == type_id::INT64, + "Unexpected offsets type"); +#else + cudf_assert((dtype.id() == type_id::INT32 || dtype.id() == type_id::INT64) && + "Unexpected offsets type"); +#endif + } + + protected: + char* p_; /// pointer to the integer data in device memory +}; + +} // namespace detail +} // namespace cudf diff --git a/cpp/include/cudf/detail/offsets_iterator_factory.cuh b/cpp/include/cudf/detail/offsets_iterator_factory.cuh new file mode 100644 index 00000000000..5b4c6b825d2 --- /dev/null +++ b/cpp/include/cudf/detail/offsets_iterator_factory.cuh @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace cudf { +namespace detail { + +/** + * @brief Use this class to create an offsetalator instance. + */ +struct offsetalator_factory { + /** + * @brief Create an input offsetalator instance from an offsets column + */ + static input_offsetalator make_input_iterator(column_view const& offsets) + { + return input_offsetalator(offsets.head(), offsets.type()); + } + + /** + * @brief Create an output offsetalator instance from an offsets column + */ + static output_offsetalator make_output_iterator(mutable_column_view const& offsets) + { + return output_offsetalator(offsets.head(), offsets.type()); + } +}; + +} // namespace detail +} // namespace cudf diff --git a/cpp/include/cudf/detail/unary.hpp b/cpp/include/cudf/detail/unary.hpp index 3fbdf4a5a8f..12f864de572 100644 --- a/cpp/include/cudf/detail/unary.hpp +++ b/cpp/include/cudf/detail/unary.hpp @@ -64,8 +64,6 @@ std::unique_ptr true_if(InputIterator begin, /** * @copydoc cudf::unary_operation - * - * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ std::unique_ptr unary_operation(cudf::column_view const& input, cudf::unary_operator op, @@ -74,8 +72,6 @@ std::unique_ptr unary_operation(cudf::column_view const& input, /** * @copydoc cudf::is_valid - * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr is_valid(cudf::column_view const& input, rmm::cuda_stream_view stream, @@ -83,8 +79,6 @@ std::unique_ptr is_valid(cudf::column_view const& input, /** * @copydoc cudf::cast - * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr cast(column_view const& input, data_type type, @@ -93,8 +87,6 @@ std::unique_ptr cast(column_view const& input, /** * @copydoc cudf::is_nan - * - * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr is_nan(cudf::column_view const& input, rmm::cuda_stream_view stream, @@ -102,8 +94,6 @@ std::unique_ptr is_nan(cudf::column_view const& input, /** * @copydoc cudf::is_not_nan - * - * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr is_not_nan(cudf::column_view const& input, rmm::cuda_stream_view stream, diff --git a/cpp/include/cudf/detail/utilities/pinned_host_vector.hpp b/cpp/include/cudf/detail/utilities/pinned_host_vector.hpp index 9e2b85ea129..eee974c8399 100644 --- a/cpp/include/cudf/detail/utilities/pinned_host_vector.hpp +++ b/cpp/include/cudf/detail/utilities/pinned_host_vector.hpp @@ -169,7 +169,12 @@ class pinned_allocator { * It is the responsibility of the caller to destroy * the objects stored at \p p. */ - __host__ inline void deallocate(pointer p, size_type /*cnt*/) { CUDF_CUDA_TRY(cudaFreeHost(p)); } + __host__ inline void deallocate(pointer p, size_type /*cnt*/) + { + auto dealloc_worked = cudaFreeHost(p); + (void)dealloc_worked; + assert(dealloc_worked == cudaSuccess); + } /** * @brief This method returns the maximum size of the \c cnt parameter diff --git a/cpp/include/cudf/dictionary/detail/merge.hpp b/cpp/include/cudf/dictionary/detail/merge.hpp index e7ea53c740a..cad495d0097 100644 --- a/cpp/include/cudf/dictionary/detail/merge.hpp +++ b/cpp/include/cudf/dictionary/detail/merge.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,7 +16,7 @@ #pragma once #include -#include +#include #include #include diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp index 865cc004107..2ee6f19614d 100644 --- a/cpp/include/cudf/interop.hpp +++ b/cpp/include/cudf/interop.hpp @@ -129,6 +129,12 @@ struct column_metadata { * @param stream CUDA stream used for device memory operations and kernel launches * @param ar_mr arrow memory pool to allocate memory for arrow Table * @return arrow Table generated from `input` + * + * @note For decimals, since the precision is not stored for them in libcudf, + * it will be converted to an Arrow decimal128 that has the widest-precision the cudf decimal type + * supports. For example, numeric::decimal32 will be converted to Arrow decimal128 of the precision + * 9 which is the maximum precision for 32-bit types. Similarly, numeric::decimal128 will be + * converted to Arrow decimal128 of the precision 38. 
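The precisions quoted in the to_arrow notes below come from the same formula as the max_precision() helper added to cudf/detail/interop.hpp earlier in this diff; the numbers can be checked with a stand-alone sketch (not part of the diff):

#include <cmath>
#include <cstdio>

// Evaluate the formula max_precision<T>() uses: floor(bits * log10(2)).
int precision_for_bits(int num_bits)
{
  return static_cast<int>(std::floor(num_bits * std::log(2) / std::log(10)));
}

int main()
{
  std::printf("decimal32  -> Arrow decimal128(precision %d)\n", precision_for_bits(32));   // 9
  std::printf("decimal64  -> Arrow decimal128(precision %d)\n", precision_for_bits(64));   // 19
  std::printf("decimal128 -> Arrow decimal128(precision %d)\n", precision_for_bits(128));  // 38
  return 0;
}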
*/ std::shared_ptr to_arrow(table_view input, std::vector const& metadata = {}, @@ -145,6 +151,12 @@ std::shared_ptr to_arrow(table_view input, * @param stream CUDA stream used for device memory operations and kernel launches * @param ar_mr arrow memory pool to allocate memory for arrow Scalar * @return arrow Scalar generated from `input` + * + * @note For decimals, since the precision is not stored for them in libcudf, + * it will be converted to an Arrow decimal128 that has the widest-precision the cudf decimal type + * supports. For example, numeric::decimal32 will be converted to Arrow decimal128 of the precision + * 9 which is the maximum precision for 32-bit types. Similarly, numeric::decimal128 will be + * converted to Arrow decimal128 of the precision 38. */ std::shared_ptr to_arrow(cudf::scalar const& input, column_metadata const& metadata = {}, diff --git a/cpp/include/cudf/io/avro.hpp b/cpp/include/cudf/io/avro.hpp index 17c168f38d4..89207302850 100644 --- a/cpp/include/cudf/io/avro.hpp +++ b/cpp/include/cudf/io/avro.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -56,7 +56,7 @@ class avro_reader_options { * * @param src source information used to read avro file */ - explicit avro_reader_options(source_info const& src) : _source(src) {} + explicit avro_reader_options(source_info src) : _source{std::move(src)} {} friend avro_reader_options_builder; @@ -123,7 +123,7 @@ class avro_reader_options { * @param src source information used to read avro file * @returns builder to build reader options */ - static avro_reader_options_builder builder(source_info const& src); + static avro_reader_options_builder builder(source_info src); }; /** @@ -145,7 +145,7 @@ class avro_reader_options_builder { * * @param src The source information used to read avro file */ - explicit avro_reader_options_builder(source_info const& src) : options(src) {} + explicit avro_reader_options_builder(source_info src) : options{std::move(src)} {} /** * @brief Set names of the column to be read. diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp index b49a13a8ea9..435583e805d 100644 --- a/cpp/include/cudf/io/csv.hpp +++ b/cpp/include/cudf/io/csv.hpp @@ -138,7 +138,7 @@ class csv_reader_options { * * @param src source information used to read csv file */ - explicit csv_reader_options(source_info const& src) : _source(src) {} + explicit csv_reader_options(source_info src) : _source{std::move(src)} {} friend csv_reader_options_builder; @@ -156,7 +156,7 @@ class csv_reader_options { * @param src Source information to read csv file * @return Builder to build reader options */ - static csv_reader_options_builder builder(source_info const& src); + static csv_reader_options_builder builder(source_info src); /** * @brief Returns source info. @@ -835,7 +835,7 @@ class csv_reader_options_builder { * * @param src The source information used to read csv file */ - csv_reader_options_builder(source_info const& src) : options(src) {} + csv_reader_options_builder(source_info src) : options{std::move(src)} {} /** * @brief Sets compression format of the source. 
@@ -1307,6 +1307,7 @@ class csv_reader_options_builder { * @endcode * * @param options Settings for controlling reading behavior + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the table in the returned * table_with_metadata * @@ -1314,6 +1315,7 @@ class csv_reader_options_builder { */ table_with_metadata read_csv( csv_reader_options options, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group @@ -1715,9 +1717,11 @@ class csv_writer_options_builder { * @endcode * * @param options Settings for controlling writing behavior + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ void write_csv(csv_writer_options const& options, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/io/detail/csv.hpp b/cpp/include/cudf/io/detail/csv.hpp index 9fdc7a47fb9..40ddcf385b0 100644 --- a/cpp/include/cudf/io/detail/csv.hpp +++ b/cpp/include/cudf/io/detail/csv.hpp @@ -17,7 +17,6 @@ #pragma once #include -#include #include diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index 6930a4fdb25..d0a9543397d 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -17,7 +17,6 @@ #pragma once #include -#include #include diff --git a/cpp/include/cudf/io/detail/parquet.hpp b/cpp/include/cudf/io/detail/parquet.hpp index 074f690d2c7..0b8ee9676de 100644 --- a/cpp/include/cudf/io/detail/parquet.hpp +++ b/cpp/include/cudf/io/detail/parquet.hpp @@ -38,7 +38,7 @@ class parquet_reader_options; class parquet_writer_options; class chunked_parquet_writer_options; -namespace detail::parquet { +namespace parquet::detail { /** * @brief Class to read Parquet dataset data into columns. @@ -186,7 +186,7 @@ class writer { */ explicit writer(std::vector> sinks, parquet_writer_options const& options, - single_write_mode mode, + cudf::io::detail::single_write_mode mode, rmm::cuda_stream_view stream); /** @@ -201,7 +201,7 @@ class writer { */ explicit writer(std::vector> sinks, chunked_parquet_writer_options const& options, - single_write_mode mode, + cudf::io::detail::single_write_mode mode, rmm::cuda_stream_view stream); /** @@ -250,5 +250,5 @@ class writer { * metadata. */ parquet_metadata read_parquet_metadata(host_span const> sources); -} // namespace detail::parquet +} // namespace parquet::detail } // namespace cudf::io diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index d408d249a7f..472d42b1db5 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -121,7 +121,7 @@ class json_reader_options { * * @param src source information used to read parquet file */ - explicit json_reader_options(source_info const& src) : _source(src) {} + explicit json_reader_options(source_info src) : _source{std::move(src)} {} friend json_reader_options_builder; @@ -139,7 +139,7 @@ class json_reader_options { * @param src source information used to read json file * @returns builder to build the options */ - static json_reader_options_builder builder(source_info const& src); + static json_reader_options_builder builder(source_info src); /** * @brief Returns source info. 
@@ -351,7 +351,7 @@ class json_reader_options_builder { * * @param src The source information used to read avro file */ - explicit json_reader_options_builder(source_info const& src) : options(src) {} + explicit json_reader_options_builder(source_info src) : options{std::move(src)} {} /** * @brief Set data types for columns to be read. @@ -512,6 +512,7 @@ class json_reader_options_builder { * @endcode * * @param options Settings for controlling reading behavior + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the table in the returned * table_with_metadata. * @@ -519,6 +520,7 @@ class json_reader_options_builder { */ table_with_metadata read_json( json_reader_options options, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group @@ -861,9 +863,11 @@ class json_writer_options_builder { * @endcode * * @param options Settings for controlling writing behavior + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ void write_json(json_writer_options const& options, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index 024f4f23b94..c2762b05aa6 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -80,7 +80,7 @@ class orc_reader_options { * * @param src source information used to read orc file */ - explicit orc_reader_options(source_info const& src) : _source(src) {} + explicit orc_reader_options(source_info src) : _source{std::move(src)} {} public: /** @@ -96,7 +96,7 @@ class orc_reader_options { * @param src Source information to read orc file * @return Builder to build reader options */ - static orc_reader_options_builder builder(source_info const& src); + static orc_reader_options_builder builder(source_info src); /** * @brief Returns source info. @@ -269,7 +269,7 @@ class orc_reader_options_builder { * * @param src The source information used to read orc file */ - explicit orc_reader_options_builder(source_info const& src) : options{src} {}; + explicit orc_reader_options_builder(source_info src) : options{std::move(src)} {}; /** * @brief Sets names of the column to read. @@ -450,6 +450,8 @@ class orc_writer_options { std::map _user_data; // Optional compression statistics std::shared_ptr _compression_stats; + // Specify whether string dictionaries should be alphabetically sorted + bool _enable_dictionary_sort = true; friend orc_writer_options_builder; @@ -572,6 +574,13 @@ class orc_writer_options { return _compression_stats; } + /** + * @brief Returns whether string dictionaries should be sorted. + * + * @return `true` if string dictionaries should be sorted + */ + [[nodiscard]] bool get_enable_dictionary_sort() const { return _enable_dictionary_sort; } + // Setters /** @@ -670,6 +679,13 @@ class orc_writer_options { { _compression_stats = std::move(comp_stats); } + + /** + * @brief Sets whether string dictionaries should be sorted. 
+ * + * @param val Boolean value to enable/disable + */ + void set_enable_dictionary_sort(bool val) { _enable_dictionary_sort = val; } }; /** @@ -810,6 +826,18 @@ class orc_writer_options_builder { return *this; } + /** + * @brief Sets whether string dictionaries should be sorted. + * + * @param val Boolean value to enable/disable + * @return this for chaining + */ + orc_writer_options_builder& enable_dictionary_sort(bool val) + { + options._enable_dictionary_sort = val; + return *this; + } + /** * @brief move orc_writer_options member once it's built. */ @@ -866,6 +894,8 @@ class chunked_orc_writer_options { std::map _user_data; // Optional compression statistics std::shared_ptr _compression_stats; + // Specify whether string dictionaries should be alphabetically sorted + bool _enable_dictionary_sort = true; friend chunked_orc_writer_options_builder; @@ -966,6 +996,13 @@ class chunked_orc_writer_options { return _compression_stats; } + /** + * @brief Returns whether string dictionaries should be sorted. + * + * @return `true` if string dictionaries should be sorted + */ + [[nodiscard]] bool get_enable_dictionary_sort() const { return _enable_dictionary_sort; } + // Setters /** @@ -1057,6 +1094,13 @@ class chunked_orc_writer_options { { _compression_stats = std::move(comp_stats); } + + /** + * @brief Sets whether string dictionaries should be sorted. + * + * @param val Boolean value to enable/disable + */ + void set_enable_dictionary_sort(bool val) { _enable_dictionary_sort = val; } }; /** @@ -1183,6 +1227,18 @@ class chunked_orc_writer_options_builder { return *this; } + /** + * @brief Sets whether string dictionaries should be sorted. + * + * @param val Boolean value to enable/disable + * @return this for chaining + */ + chunked_orc_writer_options_builder& enable_dictionary_sort(bool val) + { + options._enable_dictionary_sort = val; + return *this; + } + /** * @brief move chunked_orc_writer_options member once it's built. */ diff --git a/cpp/include/cudf/io/orc_metadata.hpp b/cpp/include/cudf/io/orc_metadata.hpp index 82d59803c25..9531a012e49 100644 --- a/cpp/include/cudf/io/orc_metadata.hpp +++ b/cpp/include/cudf/io/orc_metadata.hpp @@ -141,10 +141,10 @@ using binary_statistics = sum_statistics; * the UNIX epoch. The `minimum_utc` and `maximum_utc` are the same values adjusted to UTC. 
*/ struct timestamp_statistics : minmax_statistics { - std::optional minimum_utc; ///< minimum in milliseconds - std::optional maximum_utc; ///< maximum in milliseconds - std::optional minimum_nanos; ///< nanoseconds part of the minimum - std::optional maximum_nanos; ///< nanoseconds part of the maximum + std::optional minimum_utc; ///< minimum in milliseconds + std::optional maximum_utc; ///< maximum in milliseconds + std::optional minimum_nanos; ///< nanoseconds part of the minimum + std::optional maximum_nanos; ///< nanoseconds part of the maximum }; namespace orc { diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index deaf23d405a..ea18da74d5a 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -80,7 +80,7 @@ class parquet_reader_options { * * @param src source information used to read parquet file */ - explicit parquet_reader_options(source_info const& src) : _source(src) {} + explicit parquet_reader_options(source_info src) : _source{std::move(src)} {} friend parquet_reader_options_builder; @@ -98,7 +98,7 @@ class parquet_reader_options { * @param src Source information to read parquet file * @return Builder to build reader options */ - static parquet_reader_options_builder builder(source_info const& src); + static parquet_reader_options_builder builder(source_info src); /** * @brief Returns source info. @@ -265,7 +265,7 @@ class parquet_reader_options_builder { * * @param src The source information used to read parquet file */ - explicit parquet_reader_options_builder(source_info const& src) : options(src) {} + explicit parquet_reader_options_builder(source_info src) : options{std::move(src)} {} /** * @brief Sets names of the columns to be read. @@ -499,7 +499,7 @@ class chunked_parquet_reader { [[nodiscard]] table_with_metadata read_chunk() const; private: - std::unique_ptr reader; + std::unique_ptr reader; }; /** @} */ // end of group @@ -532,6 +532,9 @@ class parquet_writer_options { // Parquet writer can write INT96 or TIMESTAMP_MICROS. Defaults to TIMESTAMP_MICROS. // If true then overrides any per-column setting in _metadata. bool _write_timestamps_as_int96 = false; + // Parquet writer can write timestamps as UTC + // Defaults to true because libcudf timestamps are implicitly UTC + bool _write_timestamps_as_UTC = true; // Column chunks file paths to be set in the raw output metadata. One per output file std::vector _column_chunks_file_paths; // Maximum size of each row group (unless smaller than a single page) @@ -652,6 +655,13 @@ class parquet_writer_options { */ bool is_enabled_int96_timestamps() const { return _write_timestamps_as_int96; } + /** + * @brief Returns `true` if timestamps will be written as UTC + * + * @return `true` if timestamps will be written as UTC + */ + [[nodiscard]] auto is_enabled_utc_timestamps() const { return _write_timestamps_as_UTC; } + /** * @brief Returns Column chunks file paths to be set in the raw output metadata. * @@ -789,6 +799,13 @@ class parquet_writer_options { */ void enable_int96_timestamps(bool req) { _write_timestamps_as_int96 = req; } + /** + * @brief Sets preference for writing timestamps as UTC. Write timestamps as UTC if set to `true`. + * + * @param val Boolean value to enable/disable writing of timestamps as UTC. + */ + void enable_utc_timestamps(bool val) { _write_timestamps_as_UTC = val; } + /** * @brief Sets column chunks file path to be set in the raw output metadata. 
* @@ -1100,6 +1117,18 @@ class parquet_writer_options_builder { return *this; } + /** + * @brief Set to true if timestamps are to be written as UTC. + * + * @param enabled Boolean value to enable/disable writing of timestamps as UTC. + * @return this for chaining + */ + parquet_writer_options_builder& utc_timestamps(bool enabled) + { + options._write_timestamps_as_UTC = enabled; + return *this; + } + /** * @brief Set to true if V2 page headers are to be written. * @@ -1171,6 +1200,8 @@ class chunked_parquet_writer_options { // Parquet writer can write INT96 or TIMESTAMP_MICROS. Defaults to TIMESTAMP_MICROS. // If true then overrides any per-column setting in _metadata. bool _write_timestamps_as_int96 = false; + // Parquet writer can write timestamps as UTC. Defaults to true. + bool _write_timestamps_as_UTC = true; // Maximum size of each row group (unless smaller than a single page) size_t _row_group_size_bytes = default_row_group_size_bytes; // Maximum number of rows in row group (unless smaller than a single page) @@ -1254,6 +1285,13 @@ class chunked_parquet_writer_options { */ bool is_enabled_int96_timestamps() const { return _write_timestamps_as_int96; } + /** + * @brief Returns `true` if timestamps will be written as UTC + * + * @return `true` if timestamps will be written as UTC + */ + [[nodiscard]] auto is_enabled_utc_timestamps() const { return _write_timestamps_as_UTC; } + /** * @brief Returns maximum row group size, in bytes. * @@ -1375,6 +1413,13 @@ class chunked_parquet_writer_options { */ void enable_int96_timestamps(bool req) { _write_timestamps_as_int96 = req; } + /** + * @brief Sets preference for writing timestamps as UTC. Write timestamps as UTC if set to `true`. + * + * @param val Boolean value to enable/disable writing of timestamps as UTC. + */ + void enable_utc_timestamps(bool val) { _write_timestamps_as_UTC = val; } + /** * @brief Sets the maximum row group size, in bytes. * @@ -1539,6 +1584,18 @@ class chunked_parquet_writer_options_builder { return *this; } + /** + * @brief Set to true if timestamps are to be written as UTC. + * + * @param enabled Boolean value to enable/disable writing of timestamps as UTC. + * @return this for chaining + */ + chunked_parquet_writer_options_builder& utc_timestamps(bool enabled) + { + options._write_timestamps_as_UTC = enabled; + return *this; + } + /** * @brief Set to true if V2 page headers are to be written. 
* @@ -1750,7 +1807,7 @@ class parquet_chunked_writer { std::vector const& column_chunks_file_paths = {}); /// Unique pointer to impl writer class - std::unique_ptr writer; + std::unique_ptr writer; }; /** @} */ // end of group diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp index a97f81182ac..50119e60882 100644 --- a/cpp/include/cudf/io/types.hpp +++ b/cpp/include/cudf/io/types.hpp @@ -195,9 +195,9 @@ class writer_compression_statistics { * @brief Control use of dictionary encoding for parquet writer */ enum dictionary_policy { - NEVER, ///< Never use dictionary encoding - ADAPTIVE, ///< Use dictionary when it will not impact compression - ALWAYS ///< Use dictionary reqardless of impact on compression + NEVER = 0, ///< Never use dictionary encoding + ADAPTIVE = 1, ///< Use dictionary when it will not impact compression + ALWAYS = 2 ///< Use dictionary regardless of impact on compression }; /** @@ -293,14 +293,20 @@ struct source_info { * * @param file_paths Input files paths */ - explicit source_info(std::vector const& file_paths) : _filepaths(file_paths) {} + explicit source_info(std::vector const& file_paths) + : _type(io_type::FILEPATH), _filepaths(file_paths) + { + } /** * @brief Construct a new source info object for a single file * * @param file_path Single input file */ - explicit source_info(std::string const& file_path) : _filepaths({file_path}) {} + explicit source_info(std::string const& file_path) + : _type(io_type::FILEPATH), _filepaths({file_path}) + { + } /** * @brief Construct a new source info object for multiple buffers in host memory @@ -444,7 +450,7 @@ struct source_info { [[nodiscard]] auto const& user_sources() const { return _user_sources; } private: - io_type _type = io_type::FILEPATH; + io_type _type = io_type::VOID; std::vector _filepaths; std::vector> _host_buffers; std::vector> _device_buffers; diff --git a/cpp/include/cudf/strings/json.hpp b/cpp/include/cudf/json/json.hpp similarity index 94% rename from cpp/include/cudf/strings/json.hpp rename to cpp/include/cudf/json/json.hpp index 8fabee6b9a5..944e0c26dd6 100644 --- a/cpp/include/cudf/strings/json.hpp +++ b/cpp/include/cudf/json/json.hpp @@ -16,16 +16,16 @@ #pragma once #include +#include #include #include namespace cudf { -namespace strings { /** - * @addtogroup strings_json + * @addtogroup json_object * @{ * @file */ @@ -155,20 +155,21 @@ class get_json_object_options { * https://tools.ietf.org/id/draft-goessner-dispatch-jsonpath-00.html * Implements only the operators: $ . [] * * + * @throw std::invalid_argument if provided an invalid operator or an empty name + * * @param col The input strings column. Each row must contain a valid json string * @param json_path The JSONPath string to be applied to each row * @param options Options for controlling the behavior of the function - * @param mr Resource for allocating device memory. 
+ * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Resource for allocating device memory * @return New strings column containing the retrieved json object strings - * - * @throw std::invalid_argument if provided an invalid operator or an empty name */ std::unique_ptr get_json_object( cudf::strings_column_view const& col, cudf::string_scalar const& json_path, get_json_object_options options = get_json_object_options{}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group -} // namespace strings } // namespace cudf diff --git a/cpp/include/cudf/lists/combine.hpp b/cpp/include/cudf/lists/combine.hpp index 0bc76828fc3..0d9c1c157eb 100644 --- a/cpp/include/cudf/lists/combine.hpp +++ b/cpp/include/cudf/lists/combine.hpp @@ -57,6 +57,7 @@ enum class concatenate_null_policy { IGNORE, NULLIFY_OUTPUT_ROW }; * @param input Table of lists to be concatenated. * @param null_policy The parameter to specify whether a null list element will be ignored from * concatenation, or any concatenation involving a null element will result in a null list. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * @return A new column in which each row is a list resulted from concatenating all list elements in * the corresponding row of the input table. @@ -64,6 +65,7 @@ enum class concatenate_null_policy { IGNORE, NULLIFY_OUTPUT_ROW }; std::unique_ptr concatenate_rows( table_view const& input, concatenate_null_policy null_policy = concatenate_null_policy::IGNORE, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -86,6 +88,7 @@ std::unique_ptr concatenate_rows( * @param input The lists column containing lists of list elements to concatenate. * @param null_policy The parameter to specify whether a null list element will be ignored from * concatenation, or any concatenation involving a null element will result in a null list. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * @return A new column in which each row is a list resulted from concatenating all list elements in * the corresponding row of the input lists column. @@ -93,6 +96,7 @@ std::unique_ptr concatenate_rows( std::unique_ptr concatenate_list_elements( column_view const& input, concatenate_null_policy null_policy = concatenate_null_policy::IGNORE, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/lists/contains.hpp b/cpp/include/cudf/lists/contains.hpp index 21c2ca1d64e..7cf67ec9205 100644 --- a/cpp/include/cudf/lists/contains.hpp +++ b/cpp/include/cudf/lists/contains.hpp @@ -42,12 +42,14 @@ namespace lists { * * @param lists Lists column whose `n` rows are to be searched * @param search_key The scalar key to be looked up in each list row + * @param stream CUDA stream used for device memory operations and kernel launches. 
* @param mr Device memory resource used to allocate the returned column's device memory * @return BOOL8 column of `n` rows with the result of the lookup */ std::unique_ptr contains( cudf::lists_column_view const& lists, cudf::scalar const& search_key, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -63,13 +65,15 @@ std::unique_ptr contains( * 2. The list row `lists[i]` is null * * @param lists Lists column whose `n` rows are to be searched - * @param search_keys Column of elements to be looked up in each list row + * @param search_keys Column of elements to be looked up in each list row. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory * @return BOOL8 column of `n` rows with the result of the lookup */ std::unique_ptr contains( cudf::lists_column_view const& lists, cudf::column_view const& search_keys, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -84,12 +88,14 @@ std::unique_ptr contains( * A row with an empty list will always return false. * Nulls inside non-null nested elements (such as lists or structs) are not considered. * - * @param lists Lists column whose `n` rows are to be searched + * @param lists Lists column whose `n` rows are to be searched. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory * @return BOOL8 column of `n` rows with the result of the lookup */ std::unique_ptr contains_nulls( cudf::lists_column_view const& lists, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -125,6 +131,7 @@ enum class duplicate_find_option : int32_t { * @param search_key The scalar key to be looked up in each list row * @param find_option Whether to return the position of the first match (`FIND_FIRST`) or * last (`FIND_LAST`) + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return column of `n` rows with the location of the `search_key` */ @@ -132,6 +139,7 @@ std::unique_ptr index_of( cudf::lists_column_view const& lists, cudf::scalar const& search_key, duplicate_find_option find_option = duplicate_find_option::FIND_FIRST, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -160,6 +168,7 @@ std::unique_ptr index_of( * `lists` * @param find_option Whether to return the position of the first match (`FIND_FIRST`) or * last (`FIND_LAST`) + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return column of `n` rows with the location of the `search_key` */ @@ -167,6 +176,7 @@ std::unique_ptr index_of( cudf::lists_column_view const& lists, cudf::column_view const& search_keys, duplicate_find_option find_option = duplicate_find_option::FIND_FIRST, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git 
a/cpp/include/cudf/lists/count_elements.hpp b/cpp/include/cudf/lists/count_elements.hpp index 552ba058b93..e4bd0dca9ae 100644 --- a/cpp/include/cudf/lists/count_elements.hpp +++ b/cpp/include/cudf/lists/count_elements.hpp @@ -45,11 +45,13 @@ namespace lists { * in the output column. * * @param input Input lists column + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New column with the number of elements for each row */ std::unique_ptr count_elements( lists_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of lists_elements group diff --git a/cpp/include/cudf/lists/detail/scatter.cuh b/cpp/include/cudf/lists/detail/scatter.cuh index f04b2fda2bf..ff148c59a23 100644 --- a/cpp/include/cudf/lists/detail/scatter.cuh +++ b/cpp/include/cudf/lists/detail/scatter.cuh @@ -20,9 +20,9 @@ #include #include #include +#include #include #include -#include #include #include #include @@ -130,8 +130,8 @@ std::unique_ptr scatter_impl(rmm::device_uvector cons std::vector> children; children.emplace_back(std::move(offsets_column)); children.emplace_back(std::move(child_column)); - auto null_mask = - target.has_nulls() ? copy_bitmask(target, stream, mr) : rmm::device_buffer{0, stream, mr}; + auto null_mask = target.has_nulls() ? cudf::detail::copy_bitmask(target, stream, mr) + : rmm::device_buffer{0, stream, mr}; // The output column from this function only has null masks copied from the target columns. // That is still not a correct final null mask for the scatter result. diff --git a/cpp/include/cudf/lists/extract.hpp b/cpp/include/cudf/lists/extract.hpp index e92354134e8..14c0f59e17d 100644 --- a/cpp/include/cudf/lists/extract.hpp +++ b/cpp/include/cudf/lists/extract.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -59,12 +59,14 @@ namespace lists { * * @param lists_column Column to extract elements from. * @param index The row within each sublist to retrieve. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * @return Column of extracted elements. */ std::unique_ptr extract_list_element( lists_column_view const& lists_column, size_type index, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -97,6 +99,7 @@ std::unique_ptr extract_list_element( * @param lists_column Column to extract elements from. * @param indices The column whose rows indicate the element index to be retrieved from each list * row. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * @return Column of extracted elements. * @throws cudf::logic_error If the sizes of `lists_column` and `indices` do not match. 
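For context, a rough sketch of how the stream-accepting lists APIs above might be called once this change lands; the helper name, the INT32 key, and the locally created stream are illustrative assumptions, not part of this patch.

#include <cudf/column/column.hpp>
#include <cudf/lists/contains.hpp>
#include <cudf/lists/count_elements.hpp>
#include <cudf/lists/lists_column_view.hpp>
#include <cudf/scalar/scalar.hpp>
#include <rmm/cuda_stream.hpp>

// Hypothetical helper: search INT32 list rows for a key, with all work ordered on one stream.
std::unique_ptr<cudf::column> search_lists_on_stream(cudf::lists_column_view const& lists_col)
{
  rmm::cuda_stream stream;                 // non-default stream owned by the caller
  cudf::numeric_scalar<int32_t> key{42};   // assumes the list elements are INT32
  [[maybe_unused]] auto counts = cudf::lists::count_elements(lists_col, stream.view());
  auto found = cudf::lists::contains(lists_col, key, stream.view());
  stream.synchronize();                    // make results safe to read on other streams
  return found;
}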
@@ -104,6 +107,7 @@ std::unique_ptr extract_list_element( std::unique_ptr extract_list_element( lists_column_view const& lists_column, column_view const& indices, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/lists/filling.hpp b/cpp/include/cudf/lists/filling.hpp index 059ed5ffd33..3730e16482d 100644 --- a/cpp/include/cudf/lists/filling.hpp +++ b/cpp/include/cudf/lists/filling.hpp @@ -17,7 +17,9 @@ #pragma once #include +#include +#include #include #include @@ -57,12 +59,14 @@ namespace cudf::lists { * * @param starts First values in the result sequences. * @param sizes Numbers of values in the result sequences. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * @return The result column containing generated sequences. */ std::unique_ptr sequences( column_view const& starts, column_view const& sizes, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -96,6 +100,7 @@ std::unique_ptr sequences( * @param starts First values in the result sequences. * @param steps Increment values for the result sequences. * @param sizes Numbers of values in the result sequences. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * @return The result column containing generated sequences. */ @@ -103,6 +108,7 @@ std::unique_ptr sequences( column_view const& starts, column_view const& steps, column_view const& sizes, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/lists/gather.hpp b/cpp/include/cudf/lists/gather.hpp index 38bed9ede43..5e6ab6816e6 100644 --- a/cpp/include/cudf/lists/gather.hpp +++ b/cpp/include/cudf/lists/gather.hpp @@ -65,6 +65,7 @@ namespace lists { * @param bounds_policy Can be `DONT_CHECK` or `NULLIFY`. Selects whether or not to nullify the * output list row's element, when the gather index falls outside the range `[-n, n)`, * where `n` is the number of elements in list row corresponding to the gather-map row. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource to allocate any returned objects * @return column with elements in list of rows gathered based on `gather_map_list` * @@ -73,6 +74,7 @@ std::unique_ptr segmented_gather( lists_column_view const& source_column, lists_column_view const& gather_map_list, out_of_bounds_policy bounds_policy = out_of_bounds_policy::DONT_CHECK, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/lists/reverse.hpp b/cpp/include/cudf/lists/reverse.hpp index 226d417c53a..864cd796f72 100644 --- a/cpp/include/cudf/lists/reverse.hpp +++ b/cpp/include/cudf/lists/reverse.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -42,11 +42,13 @@ namespace cudf::lists { * @endcode * * @param input Lists column for this operation + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New lists column with reversed lists */ std::unique_ptr reverse( lists_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/lists/set_operations.hpp b/cpp/include/cudf/lists/set_operations.hpp index 9d58d0f5b98..6fb8989f0bb 100644 --- a/cpp/include/cudf/lists/set_operations.hpp +++ b/cpp/include/cudf/lists/set_operations.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -53,6 +53,7 @@ namespace cudf::lists { * to be `UNEQUAL` which means only non-null elements are checked for overlapping * @param nans_equal Flag to specify whether floating-point NaNs should be considered as equal + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned object * @return A column of type BOOL containing the check results */ std::unique_ptr have_overlap( @@ -60,6 +61,7 @@ std::unique_ptr have_overlap( lists_column_view const& rhs, null_equality nulls_equal = null_equality::EQUAL, nan_equality nans_equal = nan_equality::ALL_EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -87,6 +89,7 @@ std::unique_ptr have_overlap( * @param rhs The input lists column for the other side * @param nulls_equal Flag to specify whether null elements should be considered as equal * @param nans_equal Flag to specify whether floating-point NaNs should be considered as equal + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned object * @return A lists column containing the intersection results */ @@ -95,6 +98,7 @@ std::unique_ptr intersect_distinct( lists_column_view const& rhs, null_equality nulls_equal = null_equality::EQUAL, nan_equality nans_equal = nan_equality::ALL_EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -122,6 +126,7 @@ std::unique_ptr intersect_distinct( * @param rhs The input lists column for the other side * @param nulls_equal Flag to specify whether null elements should be considered as equal * @param nans_equal Flag to specify whether floating-point NaNs should be considered as equal + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned object * @return A lists column containing the union results */ @@ -130,6 +135,7 @@ std::unique_ptr union_distinct( lists_column_view const& rhs, null_equality nulls_equal = null_equality::EQUAL, nan_equality nans_equal = nan_equality::ALL_EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -157,6 +163,7 @@ std::unique_ptr union_distinct( *
@param rhs The input lists column of elements to exclude * @param nulls_equal Flag to specify whether null elements should be considered as equal * @param nans_equal Flag to specify whether floating-point NaNs should be considered as equal + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned object * @return A lists column containing the difference results */ @@ -165,6 +172,7 @@ std::unique_ptr difference_distinct( lists_column_view const& rhs, null_equality nulls_equal = null_equality::EQUAL, nan_equality nans_equal = nan_equality::ALL_EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/lists/sorting.hpp b/cpp/include/cudf/lists/sorting.hpp index c203c452b0d..39a52c75a98 100644 --- a/cpp/include/cudf/lists/sorting.hpp +++ b/cpp/include/cudf/lists/sorting.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -46,6 +46,7 @@ namespace lists { * @param source_column View of the list column of numeric types to sort * @param column_order The desired sort order * @param null_precedence The desired order of null compared to other elements in the list + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to allocate any returned objects * @return list column with elements in each list sorted. * @@ -54,6 +55,7 @@ std::unique_ptr sort_lists( lists_column_view const& source_column, order column_order, null_order null_precedence, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -66,6 +68,7 @@ std::unique_ptr stable_sort_lists( lists_column_view const& source_column, order column_order, null_order null_precedence, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/lists/stream_compaction.hpp b/cpp/include/cudf/lists/stream_compaction.hpp index 5ddaa992184..3ac4f6861ec 100644 --- a/cpp/include/cudf/lists/stream_compaction.hpp +++ b/cpp/include/cudf/lists/stream_compaction.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -54,12 +54,14 @@ namespace cudf::lists { * * @param input The input list column view to be filtered * @param boolean_mask A nullable list of bools column used to filter `input` elements + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory * @return List column of the same type as `input`, containing filtered list rows */ std::unique_ptr apply_boolean_mask( lists_column_view const& input, lists_column_view const& boolean_mask, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -78,6 +80,7 @@ std::unique_ptr apply_boolean_mask( * @param input The input lists column * @param nulls_equal Flag to specify whether null elements should be considered as equal * @param nans_equal Flag to specify whether floating-point NaNs should be considered as equal + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned object * @return The resulting lists column containing lists without duplicates */ @@ -85,6 +88,7 @@ std::unique_ptr distinct( lists_column_view const& input, null_equality nulls_equal = null_equality::EQUAL, nan_equality nans_equal = nan_equality::ALL_EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/merge.hpp b/cpp/include/cudf/merge.hpp index 3d09550209d..8886ec24bfe 100644 --- a/cpp/include/cudf/merge.hpp +++ b/cpp/include/cudf/merge.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -34,7 +34,10 @@ namespace cudf { * @brief Merge a set of sorted tables. * * Merges sorted tables into one sorted table - * containing data from all tables. + * containing data from all tables. The key columns + * of each table must be sorted according to the + * parameters (cudf::column_order and cudf::null_order) + * specified for that column. 
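To illustrate the sorted-key requirement spelled out above, a minimal sketch; the call shape follows the cudf::merge API as generally documented (it is not shown in this diff), and the table names and key layout are hypothetical.

#include <cudf/merge.hpp>
#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>

// Hypothetical helper: merge two tables that are already sorted on key column 0.
std::unique_ptr<cudf::table> merge_two_sorted(cudf::table_view const& tbl_a,
                                              cudf::table_view const& tbl_b)
{
  // The declared order and null ordering must match how the key column was actually
  // sorted in both inputs, otherwise the merged result is not guaranteed to be sorted.
  return cudf::merge({tbl_a, tbl_b},
                     {0},                          // key column index
                     {cudf::order::ASCENDING},     // sort order of that key
                     {cudf::null_order::BEFORE});  // where nulls were placed
}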
* * ``` * Example 1: diff --git a/cpp/include/cudf/null_mask.hpp b/cpp/include/cudf/null_mask.hpp index 672f479ad53..524296e60ca 100644 --- a/cpp/include/cudf/null_mask.hpp +++ b/cpp/include/cudf/null_mask.hpp @@ -16,6 +16,7 @@ #pragma once #include +#include #include #include @@ -80,6 +81,7 @@ size_type num_bitmask_words(size_type number_of_bits); * * @param size The number of elements to be represented by the mask * @param state The desired state of the mask + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned device_buffer * @return A `device_buffer` for use as a null bitmask * satisfying the desired size and state @@ -87,6 +89,7 @@ size_type num_bitmask_words(size_type number_of_bits); rmm::device_buffer create_null_mask( size_type size, mask_state state, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -100,8 +103,13 @@ rmm::device_buffer create_null_mask( * @param begin_bit Index of the first bit to set (inclusive) * @param end_bit Index of the last bit to set (exclusive) * @param valid If true set all entries to valid; otherwise, set all to null + * @param stream CUDA stream used for device memory operations and kernel launches */ -void set_null_mask(bitmask_type* bitmask, size_type begin_bit, size_type end_bit, bool valid); +void set_null_mask(bitmask_type* bitmask, + size_type begin_bit, + size_type end_bit, + bool valid, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Creates a `device_buffer` from a slice of bitmask defined by a range @@ -115,6 +123,7 @@ void set_null_mask(bitmask_type* bitmask, size_type begin_bit, size_type end_bit * @param mask Bitmask residing in device memory whose bits will be copied * @param begin_bit Index of the first bit to be copied (inclusive) * @param end_bit Index of the last bit to be copied (exclusive) + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned device_buffer * @return A `device_buffer` containing the bits * `[begin_bit, end_bit)` from `mask`. @@ -123,6 +132,7 @@ rmm::device_buffer copy_bitmask( bitmask_type const* mask, size_type begin_bit, size_type end_bit, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -132,12 +142,14 @@ rmm::device_buffer copy_bitmask( * Returns empty `device_buffer` if the column is not nullable * * @param view Column view whose bitmask needs to be copied + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned device_buffer * @return A `device_buffer` containing the bits * `[view.offset(), view.offset() + view.size())` from `view`'s bitmask. */ rmm::device_buffer copy_bitmask( column_view const& view, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -148,11 +160,13 @@ rmm::device_buffer copy_bitmask( * If no column in the table is nullable, an empty bitmask is returned. 
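A small sketch of the new stream parameters on the bitmask utilities above; the helper, the row count, and the choice to null out the first ten rows are assumptions for illustration only.

#include <algorithm>
#include <cudf/null_mask.hpp>
#include <cudf/types.hpp>
#include <rmm/cuda_stream.hpp>
#include <rmm/device_buffer.hpp>

// Hypothetical helper: allocate an all-valid mask, then mark the first rows null on one stream.
rmm::device_buffer make_mask_on_stream(cudf::size_type num_rows)
{
  rmm::cuda_stream stream;
  auto mask = cudf::create_null_mask(num_rows, cudf::mask_state::ALL_VALID, stream.view());
  cudf::set_null_mask(static_cast<cudf::bitmask_type*>(mask.data()),
                      0,
                      std::min<cudf::size_type>(10, num_rows),
                      false,            // false == mark these entries null
                      stream.view());
  stream.synchronize();
  return mask;
}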
* * @param view The table of columns + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned device_buffer * @return A pair of resulting bitmask and count of unset bits */ std::pair bitmask_and( table_view const& view, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -163,11 +177,13 @@ std::pair bitmask_and( * If no column in the table is nullable, an empty bitmask is returned. * * @param view The table of columns + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned device_buffer * @return A pair of resulting bitmask and count of unset bits */ std::pair bitmask_or( table_view const& view, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -183,8 +199,12 @@ std::pair bitmask_or( * @param bitmask Validity bitmask residing in device memory. * @param start Index of the first bit to count (inclusive). * @param stop Index of the last bit to count (exclusive). + * @param stream CUDA stream used for device memory operations and kernel launches * @return The number of null elements in the specified range. */ -cudf::size_type null_count(bitmask_type const* bitmask, size_type start, size_type stop); +cudf::size_type null_count(bitmask_type const* bitmask, + size_type start, + size_type stop, + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/strings/char_types/char_types.hpp b/cpp/include/cudf/strings/char_types/char_types.hpp index 8b6c434719a..c6db5dab08a 100644 --- a/cpp/include/cudf/strings/char_types/char_types.hpp +++ b/cpp/include/cudf/strings/char_types/char_types.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -53,18 +53,20 @@ namespace strings { * * Any null row results in a null entry for that row in the output column. * - * @param strings Strings instance for this operation. - * @param types The character types to check in each string. + * @param input Strings instance for this operation + * @param types The character types to check in each string * @param verify_types Only verify against these character types. * Default `ALL_TYPES` means return `true` * iff all characters match `types`. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column of boolean results for each string. 
+ * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column of boolean results for each string */ std::unique_ptr all_characters_of_type( - strings_column_view const& strings, + strings_column_view const& input, string_character_types types, string_character_types verify_types = string_character_types::ALL_TYPES, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -96,20 +98,22 @@ std::unique_ptr all_characters_of_type( * @throw cudf::logic_error if neither or both `types_to_remove` and * `types_to_keep` are set to `ALL_TYPES`. * - * @param strings Strings instance for this operation. + * @param input Strings instance for this operation * @param types_to_remove The character types to check in each string. * Use `ALL_TYPES` here to specify `types_to_keep` instead. - * @param replacement The replacement character to use when removing characters. + * @param replacement The replacement character to use when removing characters * @param types_to_keep Default `ALL_TYPES` means all characters of * `types_to_remove` will be filtered. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column of boolean results for each string. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column with the specified character types filtered out */ std::unique_ptr filter_characters_of_type( - strings_column_view const& strings, + strings_column_view const& input, string_character_types types_to_remove, string_scalar const& replacement = string_scalar(""), string_character_types types_to_keep = string_character_types::ALL_TYPES, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/combine.hpp b/cpp/include/cudf/strings/combine.hpp index 71f65ac9080..568e8ac50ec 100644 --- a/cpp/include/cudf/strings/combine.hpp +++ b/cpp/include/cudf/strings/combine.hpp @@ -66,18 +66,20 @@ enum class output_if_empty_list { * * @throw cudf::logic_error if separator is not valid. * - * @param strings Strings for this operation. + * @param input Strings for this operation * @param separator String that should inserted between each string. * Default is an empty string. - * @param narep String that should represent any null strings found. + * @param narep String to replace any null strings found. * Default of invalid-scalar will ignore any null entries. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory. * @return New column containing one string.
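As a rough illustration of the renamed `input` parameter and the new stream argument on `all_characters_of_type` above; the helper name and the character-type choice are placeholders.

#include <cudf/column/column.hpp>
#include <cudf/strings/char_types/char_types.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <rmm/cuda_stream.hpp>

// Hypothetical helper: flag rows whose characters are all decimal digits.
std::unique_ptr<cudf::column> rows_all_decimal(cudf::strings_column_view const& input)
{
  rmm::cuda_stream stream;
  return cudf::strings::all_characters_of_type(
    input,
    cudf::strings::string_character_types::DECIMAL,
    cudf::strings::string_character_types::ALL_TYPES,  // every character must match
    stream.view());
}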
*/ std::unique_ptr join_strings( - strings_column_view const& strings, + strings_column_view const& input, string_scalar const& separator = string_scalar(""), string_scalar const& narep = string_scalar("", false), + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -127,18 +129,17 @@ std::unique_ptr join_strings( * @throw cudf::logic_error if the number of rows from @p separators and @p strings_columns * do not match * - * @param strings_columns List of strings columns to concatenate. + * @param strings_columns List of strings columns to concatenate * @param separators Strings column that provides the separator for a given row - * @param separator_narep String that should be used in place of a null separator for a given - * row. Default of invalid-scalar means no row separator value replacements. - * Default is an invalid string. - * @param col_narep String that should be used in place of any null strings - * found in any column. Default of invalid-scalar means no null column value replacements. - * Default is an invalid string. + * @param separator_narep String to replace a null separator for a given row. + * Default of invalid-scalar means no row separator value replacements. + * @param col_narep String that should be used in place of any null strings found in any column. + * Default of invalid-scalar means no null column value replacements. * @param separate_nulls If YES, then the separator is included for null rows * if `col_narep` is valid. - * @param mr Resource for allocating device memory. - * @return New column with concatenated results. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Resource for allocating device memory + * @return New column with concatenated results */ std::unique_ptr concatenate( table_view const& strings_columns, @@ -146,6 +147,7 @@ std::unique_ptr concatenate( string_scalar const& separator_narep = string_scalar("", false), string_scalar const& col_narep = string_scalar("", false), separator_on_nulls separate_nulls = separator_on_nulls::YES, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -184,21 +186,23 @@ std::unique_ptr concatenate( * @throw cudf::logic_error if separator is not valid. * @throw cudf::logic_error if only one column is specified * - * @param strings_columns List of string columns to concatenate. + * @param strings_columns List of string columns to concatenate * @param separator String that should inserted between each string from each row. * Default is an empty string. - * @param narep String that should be used in place of any null strings - * found in any column. Default of invalid-scalar means any null entry in any column will + * @param narep String to replace any null strings found in any column. + * Default of invalid-scalar means any null entry in any column will * produces a null result for that row. - * @param separate_nulls If YES, then the separator is included for null rows if `narep` is valid. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column with concatenated results. 
+ * @param separate_nulls If YES, then the separator is included for null rows if `narep` is valid + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column with concatenated results */ std::unique_ptr concatenate( table_view const& strings_columns, string_scalar const& separator = string_scalar(""), string_scalar const& narep = string_scalar("", false), separator_on_nulls separate_nulls = separator_on_nulls::YES, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -243,19 +247,20 @@ std::unique_ptr concatenate( * @throw cudf::logic_error if the number of rows from `separators` and `lists_strings_column` do * not match * - * @param lists_strings_column Column containing lists of strings to concatenate. - * @param separators Strings column that provides separators for concatenation. - * @param separator_narep String that should be used to replace null separator, default is an - * invalid-scalar denoting that rows containing null separator will result in null string in - * the corresponding output rows. - * @param string_narep String that should be used to replace null strings in any non-null list row, - * default is an invalid-scalar denoting that list rows containing null strings will result - * in null string in the corresponding output rows. - * @param separate_nulls If YES, then the separator is included for null rows if `narep` is valid. - * @param empty_list_policy if set to EMPTY_STRING, any input row that is an empty list will + * @param lists_strings_column Column containing lists of strings to concatenate + * @param separators Strings column that provides separators for concatenation + * @param separator_narep String that should be used to replace a null separator. + * Default is an invalid-scalar denoting that rows containing null separator will result in + * a null string in the corresponding output rows. + * @param string_narep String to replace null strings in any non-null list row. + * Default is an invalid-scalar denoting that list rows containing null strings will result + * in a null string in the corresponding output rows. + * @param separate_nulls If YES, then the separator is included for null rows if `narep` is valid + * @param empty_list_policy If set to EMPTY_STRING, any input row that is an empty list will * result in an empty string. Otherwise, it will result in a null. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column with concatenated results. 
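A hedged sketch of the column-wise `concatenate` overload documented above, using a scalar separator and an explicit stream; the separator, null replacement, and helper name are invented for the example.

#include <cudf/column/column.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/combine.hpp>
#include <cudf/table/table_view.hpp>
#include <rmm/cuda_stream.hpp>

// Hypothetical helper: concatenate string columns row-wise with ", " between the values.
std::unique_ptr<cudf::column> join_row_wise(cudf::table_view const& string_cols)
{
  rmm::cuda_stream stream;
  cudf::string_scalar sep{", "};
  cudf::string_scalar narep{"<null>"};  // substitute for null entries so rows stay non-null
  return cudf::strings::concatenate(string_cols,
                                    sep,
                                    narep,
                                    cudf::strings::separator_on_nulls::YES,
                                    stream.view());
}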
+ * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column with concatenated results */ std::unique_ptr join_list_elements( lists_column_view const& lists_strings_column, @@ -264,6 +269,7 @@ std::unique_ptr join_list_elements( string_scalar const& string_narep = string_scalar("", false), separator_on_nulls separate_nulls = separator_on_nulls::YES, output_if_empty_list empty_list_policy = output_if_empty_list::EMPTY_STRING, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -303,17 +309,18 @@ std::unique_ptr join_list_elements( * @throw cudf::logic_error if input column is not lists of strings column. * @throw cudf::logic_error if separator is not valid. * - * @param lists_strings_column Column containing lists of strings to concatenate. - * @param separator String that should inserted between strings of each list row, default is an - * empty string. - * @param narep String that should be used to replace null strings in any non-null list row, default - * is an invalid-scalar denoting that list rows containing null strings will result in null - * string in the corresponding output rows. - * @param separate_nulls If YES, then the separator is included for null rows if `narep` is valid. - * @param empty_list_policy if set to EMPTY_STRING, any input row that is an empty list will result + * @param lists_strings_column Column containing lists of strings to concatenate + * @param separator String to insert between strings of each list row. + * Default is an empty string. + * @param narep String to replace null strings in any non-null list row. + * Default is an invalid-scalar denoting that list rows containing null strings will result + * in a null string in the corresponding output rows. + * @param separate_nulls If YES, then the separator is included for null rows if `narep` is valid + * @param empty_list_policy If set to EMPTY_STRING, any input row that is an empty list will result * in an empty string. Otherwise, it will result in a null. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column with concatenated results. 
+ * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column with concatenated results */ std::unique_ptr join_list_elements( lists_column_view const& lists_strings_column, @@ -321,6 +328,7 @@ std::unique_ptr join_list_elements( string_scalar const& narep = string_scalar("", false), separator_on_nulls separate_nulls = separator_on_nulls::YES, output_if_empty_list empty_list_policy = output_if_empty_list::EMPTY_STRING, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/contains.hpp b/cpp/include/cudf/strings/contains.hpp index 23c77cb60da..341c146df92 100644 --- a/cpp/include/cudf/strings/contains.hpp +++ b/cpp/include/cudf/strings/contains.hpp @@ -31,7 +31,7 @@ struct regex_program; * @addtogroup strings_contains * @{ * @file strings/contains.hpp - * @brief Strings APIs for regex contains, count, matches + * @brief Strings APIs for regex contains, count, matches, like */ /** @@ -50,14 +50,16 @@ struct regex_program; * * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. * - * @param strings Strings instance for this operation + * @param input Strings instance for this operation * @param prog Regex program instance + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New column of boolean results for each string */ std::unique_ptr contains_re( - strings_column_view const& strings, + strings_column_view const& input, regex_program const& prog, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -76,14 +78,16 @@ std::unique_ptr contains_re( * * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. * - * @param strings Strings instance for this operation + * @param input Strings instance for this operation * @param prog Regex program instance + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New column of boolean results for each string */ std::unique_ptr matches_re( - strings_column_view const& strings, + strings_column_view const& input, regex_program const& prog, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -102,14 +106,16 @@ std::unique_ptr matches_re( * * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. 
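For the regex-based APIs above, a brief sketch pairing a compiled `regex_program` with the new stream argument; the pattern and helper name are examples only, and `regex_program::create` is assumed from the existing cudf strings regex API rather than from this diff.

#include <cudf/column/column.hpp>
#include <cudf/strings/contains.hpp>
#include <cudf/strings/regex/regex_program.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <rmm/cuda_stream.hpp>

// Hypothetical helper: mark rows that contain at least one run of digits.
std::unique_ptr<cudf::column> rows_with_digits(cudf::strings_column_view const& input)
{
  rmm::cuda_stream stream;
  auto prog = cudf::strings::regex_program::create("\\d+");  // compile once, reuse as needed
  return cudf::strings::contains_re(input, *prog, stream.view());
}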
* - * @param strings Strings instance for this operation + * @param input Strings instance for this operation * @param prog Regex program instance + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New column of match counts for each string */ std::unique_ptr count_re( - strings_column_view const& strings, + strings_column_view const& input, regex_program const& prog, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -146,8 +152,9 @@ std::unique_ptr count_re( * * @param input Strings instance for this operation * @param pattern Like pattern to match within each string - * @param escape_character Optional character specifies the escape prefix; - * default is no escape character + * @param escape_character Optional character specifies the escape prefix. + * Default is no escape character. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New boolean column */ @@ -155,6 +162,7 @@ std::unique_ptr like( strings_column_view const& input, string_scalar const& pattern, string_scalar const& escape_character = string_scalar(""), + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -185,8 +193,9 @@ std::unique_ptr like( * * @param input Strings instance for this operation * @param patterns Like patterns to match within each corresponding string - * @param escape_character Optional character specifies the escape prefix; - * default is no escape character + * @param escape_character Optional character specifies the escape prefix. + * Default is no escape character. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New boolean column */ @@ -194,6 +203,7 @@ std::unique_ptr like( strings_column_view const& input, strings_column_view const& patterns, string_scalar const& escape_character = string_scalar(""), + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/convert/convert_booleans.hpp b/cpp/include/cudf/strings/convert/convert_booleans.hpp index ab63503f166..9e9f25e800a 100644 --- a/cpp/include/cudf/strings/convert/convert_booleans.hpp +++ b/cpp/include/cudf/strings/convert/convert_booleans.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -35,14 +35,16 @@ namespace strings { * * Any null entries will result in corresponding null entries in the output column. * - * @param strings Strings instance for this operation. - * @param true_string String to expect for true. Non-matching strings are false. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New BOOL8 column converted from strings. + * @param input Strings instance for this operation + * @param true_string String to expect for true. 
Non-matching strings are false + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New BOOL8 column converted from strings */ std::unique_ptr to_booleans( - strings_column_view const& strings, - string_scalar const& true_string = string_scalar("true"), + strings_column_view const& input, + string_scalar const& true_string, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -53,16 +55,18 @@ std::unique_ptr to_booleans( * * @throw cudf::logic_error if the input column is not BOOL8 type. * - * @param booleans Boolean column to convert. - * @param true_string String to use for true in the output column. - * @param false_string String to use for false in the output column. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column. + * @param booleans Boolean column to convert + * @param true_string String to use for true in the output column + * @param false_string String to use for false in the output column + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column */ std::unique_ptr from_booleans( column_view const& booleans, - string_scalar const& true_string = string_scalar("true"), - string_scalar const& false_string = string_scalar("false"), + string_scalar const& true_string, + string_scalar const& false_string, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/convert/convert_datetime.hpp b/cpp/include/cudf/strings/convert/convert_datetime.hpp index fa729d26734..81cce14b53b 100644 --- a/cpp/include/cudf/strings/convert/convert_datetime.hpp +++ b/cpp/include/cudf/strings/convert/convert_datetime.hpp @@ -77,16 +77,18 @@ namespace strings { * * @throw cudf::logic_error if timestamp_type is not a timestamp type. * - * @param strings Strings instance for this operation. - * @param timestamp_type The timestamp type used for creating the output column. - * @param format String specifying the timestamp format in strings. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New datetime column. + * @param input Strings instance for this operation + * @param timestamp_type The timestamp type used for creating the output column + * @param format String specifying the timestamp format in strings + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New datetime column */ std::unique_ptr to_timestamps( - strings_column_view const& strings, + strings_column_view const& input, data_type timestamp_type, std::string_view format, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -124,14 +126,16 @@ std::unique_ptr to_timestamps( * This will return a column of type BOOL8 where a `true` row indicates the corresponding * input string can be parsed correctly with the given format. * - * @param strings Strings instance for this operation. 
- * @param format String specifying the timestamp format in strings. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New BOOL8 column. + * @param input Strings instance for this operation + * @param format String specifying the timestamp format in strings + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New BOOL8 column */ std::unique_ptr is_timestamp( - strings_column_view const& strings, + strings_column_view const& input, std::string_view format, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -231,19 +235,21 @@ std::unique_ptr is_timestamp( * @throw cudf::logic_error if the `format` string is empty * @throw cudf::logic_error if `names.size()` is an invalid size. Must be 0 or 40 strings. * - * @param timestamps Timestamp values to convert. + * @param timestamps Timestamp values to convert * @param format The string specifying output format. * Default format is "%Y-%m-%dT%H:%M:%SZ". * @param names The string names to use for weekdays ("%a", "%A") and months ("%b", "%B") * Default is an empty `strings_column_view`. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column with formatted timestamps. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column with formatted timestamps */ std::unique_ptr from_timestamps( column_view const& timestamps, std::string_view format = "%Y-%m-%dT%H:%M:%SZ", strings_column_view const& names = strings_column_view(column_view{ data_type{type_id::STRING}, 0, nullptr, nullptr, 0}), + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/convert/convert_durations.hpp b/cpp/include/cudf/strings/convert/convert_durations.hpp index e915ec26279..a1f4e4ead1d 100644 --- a/cpp/include/cudf/strings/convert/convert_durations.hpp +++ b/cpp/include/cudf/strings/convert/convert_durations.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -65,16 +65,18 @@ namespace strings { * * @throw cudf::logic_error if duration_type is not a duration type. * - * @param strings Strings instance for this operation. - * @param duration_type The duration type used for creating the output column. - * @param format String specifying the duration format in strings. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New duration column. 
+ * @param input Strings instance for this operation + * @param duration_type The duration type used for creating the output column + * @param format String specifying the duration format in strings + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New duration column */ std::unique_ptr to_durations( - strings_column_view const& strings, + strings_column_view const& input, data_type duration_type, std::string_view format, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -115,15 +117,17 @@ std::unique_ptr to_durations( * * @throw cudf::logic_error if `durations` column parameter is not a duration type. * - * @param durations Duration values to convert. + * @param durations Duration values to convert * @param format The string specifying output format. - * Default format is ""%d days %H:%M:%S". - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column with formatted durations. + * Default format is "%D days %H:%M:%S". + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column with formatted durations */ std::unique_ptr from_durations( column_view const& durations, std::string_view format = "%D days %H:%M:%S", + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/convert/convert_fixed_point.hpp b/cpp/include/cudf/strings/convert/convert_fixed_point.hpp index 3852dc8e81a..8f37715967a 100644 --- a/cpp/include/cudf/strings/convert/convert_fixed_point.hpp +++ b/cpp/include/cudf/strings/convert/convert_fixed_point.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -53,14 +53,16 @@ namespace strings { * * @throw cudf::logic_error if `output_type` is not a fixed-point decimal type. * - * @param input Strings instance for this operation. - * @param output_type Type of fixed-point column to return including the scale value. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column of `output_type`. + * @param input Strings instance for this operation + * @param output_type Type of fixed-point column to return including the scale value + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column of `output_type` */ std::unique_ptr to_fixed_point( strings_column_view const& input, data_type output_type, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -83,12 +85,14 @@ std::unique_ptr to_fixed_point( * * @throw cudf::logic_error if the `input` column is not a fixed-point decimal type. * - * @param input Fixed-point column to convert. - * @param mr Device memory resource used to allocate the returned column's device memory.
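A small sketch round-tripping durations with the `%D days %H:%M:%S` format noted above; the DURATION_SECONDS resolution and the names are placeholders.

#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/strings/convert/convert_durations.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/types.hpp>
#include <rmm/cuda_stream.hpp>

// Hypothetical helper: render DURATION_SECONDS values as text and parse them back.
std::unique_ptr<cudf::column> durations_round_trip(cudf::column_view const& durations_s)
{
  rmm::cuda_stream stream;
  auto as_text = cudf::strings::from_durations(durations_s, "%D days %H:%M:%S", stream.view());
  return cudf::strings::to_durations(cudf::strings_column_view(as_text->view()),
                                     cudf::data_type{cudf::type_id::DURATION_SECONDS},
                                     "%D days %H:%M:%S",
                                     stream.view());
}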
- * @return New strings column. + * @param input Fixed-point column to convert + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column */ std::unique_ptr from_fixed_point( column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -111,14 +115,16 @@ std::unique_ptr from_fixed_point( * * @throw cudf::logic_error if the `decimal_type` is not a fixed-point decimal type. * - * @param input Strings instance for this operation. - * @param decimal_type Fixed-point type (with scale) used only for checking overflow. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column of boolean results for each string. + * @param input Strings instance for this operation + * @param decimal_type Fixed-point type (with scale) used only for checking overflow + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column of boolean results for each string */ std::unique_ptr is_fixed_point( strings_column_view const& input, data_type decimal_type = data_type{type_id::DECIMAL64}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/convert/convert_floats.hpp b/cpp/include/cudf/strings/convert/convert_floats.hpp index 38a84fc1548..a35cb68ef4e 100644 --- a/cpp/include/cudf/strings/convert/convert_floats.hpp +++ b/cpp/include/cudf/strings/convert/convert_floats.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -39,14 +39,16 @@ namespace strings { * * @throw cudf::logic_error if output_type is not float type. * - * @param strings Strings instance for this operation. - * @param output_type Type of float numeric column to return. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column with floats converted from strings. + * @param strings Strings instance for this operation + * @param output_type Type of float numeric column to return + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column with floats converted from strings */ std::unique_ptr to_floats( strings_column_view const& strings, data_type output_type, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -62,12 +64,14 @@ std::unique_ptr to_floats( * * @throw cudf::logic_error if floats column is not float type. * - * @param floats Numeric column to convert. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column with floats as strings. 
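To show the float conversion above with its new stream parameter, a minimal sketch; the FLOAT64 target type and helper name are arbitrary choices.

#include <cudf/column/column.hpp>
#include <cudf/strings/convert/convert_floats.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/types.hpp>
#include <rmm/cuda_stream.hpp>

// Hypothetical helper: parse number-like strings into FLOAT64 values on a given stream.
std::unique_ptr<cudf::column> parse_floats(cudf::strings_column_view const& input)
{
  rmm::cuda_stream stream;
  return cudf::strings::to_floats(input,
                                  cudf::data_type{cudf::type_id::FLOAT64},
                                  stream.view());
}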
+ * @param floats Numeric column to convert + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column with floats as strings */ std::unique_ptr from_floats( column_view const& floats, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -86,12 +90,14 @@ std::unique_ptr from_floats( * * Any null row results in a null entry for that row in the output column. * - * @param strings Strings instance for this operation. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column of boolean results for each string. + * @param input Strings instance for this operation + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column of boolean results for each string */ std::unique_ptr is_float( - strings_column_view const& strings, + strings_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/convert/convert_integers.hpp b/cpp/include/cudf/strings/convert/convert_integers.hpp index 44213b84139..74ec5d315a2 100644 --- a/cpp/include/cudf/strings/convert/convert_integers.hpp +++ b/cpp/include/cudf/strings/convert/convert_integers.hpp @@ -46,14 +46,16 @@ namespace strings { * * @throw cudf::logic_error if output_type is not integral type. * - * @param strings Strings instance for this operation. - * @param output_type Type of integer numeric column to return. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column with integers converted from strings. + * @param input Strings instance for this operation + * @param output_type Type of integer numeric column to return + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column with integers converted from strings */ std::unique_ptr to_integers( - strings_column_view const& strings, + strings_column_view const& input, data_type output_type, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -67,12 +69,14 @@ std::unique_ptr to_integers( * * @throw cudf::logic_error if integers column is not integral type. * - * @param integers Numeric column to convert. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column with integers as strings. 
+ * @param integers Numeric column to convert + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column with integers as strings */ std::unique_ptr from_integers( column_view const& integers, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -94,12 +98,14 @@ std::unique_ptr from_integers( * * Any null row results in a null entry for that row in the output column. * - * @param strings Strings instance for this operation. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column of boolean results for each string. + * @param input Strings instance for this operation + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column of boolean results for each string */ std::unique_ptr is_integer( - strings_column_view const& strings, + strings_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -124,14 +130,16 @@ std::unique_ptr is_integer( * * Any null row results in a null entry for that row in the output column. * - * @param strings Strings instance for this operation. - * @param int_type Integer type used for checking underflow and overflow. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column of boolean results for each string. + * @param input Strings instance for this operation + * @param int_type Integer type used for checking underflow and overflow + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column of boolean results for each string */ std::unique_ptr is_integer( - strings_column_view const& strings, + strings_column_view const& input, data_type int_type, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -152,14 +160,16 @@ std::unique_ptr is_integer( * * @throw cudf::logic_error if output_type is not integral type. * - * @param strings Strings instance for this operation. - * @param output_type Type of integer numeric column to return. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column with integers converted from strings. + * @param input Strings instance for this operation + * @param output_type Type of integer numeric column to return + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column with integers converted from strings */ std::unique_ptr hex_to_integers( - strings_column_view const& strings, + strings_column_view const& input, data_type output_type, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -179,12 +189,14 @@ std::unique_ptr hex_to_integers( * * Any null row results in a null entry for that row in the output column. 
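Similarly for the integer converters above, a rough sketch that validates for INT32 range before converting; all names are illustrative.

#include <cudf/column/column.hpp>
#include <cudf/strings/convert/convert_integers.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/types.hpp>
#include <rmm/cuda_stream.hpp>

// Hypothetical helper: check INT32 range first, then convert the strings.
std::unique_ptr<cudf::column> parse_int32(cudf::strings_column_view const& input)
{
  rmm::cuda_stream stream;
  [[maybe_unused]] auto fits =
    cudf::strings::is_integer(input, cudf::data_type{cudf::type_id::INT32}, stream.view());
  return cudf::strings::to_integers(input,
                                    cudf::data_type{cudf::type_id::INT32},
                                    stream.view());
}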
* - * @param strings Strings instance for this operation. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column of boolean results for each string. + * @param input Strings instance for this operation + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column of boolean results for each string */ std::unique_ptr is_hex( - strings_column_view const& strings, + strings_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -199,23 +211,25 @@ std::unique_ptr is_hex( * * @code{.pseudo} * Example: - * input = [123, -1, 0, 27, 342718233] // int32 type input column + * input = [1234, -1, 0, 27, 342718233] // int32 type input column * s = integers_to_hex(input) * s is [ '04D2', 'FFFFFFFF', '00', '1B', '146D7719'] * @endcode * * The example above shows an `INT32` type column where each integer is 4 bytes. * Leading zeros are suppressed unless filling out a complete byte as in - * `123 -> '04D2'` instead of `000004D2` or `4D2`. + * `1234 -> '04D2'` instead of `000004D2` or `4D2`. * * @throw cudf::logic_error if the input column is not integral type. * - * @param input Integer column to convert to hex. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column with hexadecimal characters. + * @param input Integer column to convert to hex + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column with hexadecimal characters */ std::unique_ptr integers_to_hex( column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/convert/convert_ipv4.hpp b/cpp/include/cudf/strings/convert/convert_ipv4.hpp index 22272af74fc..25ad7b86748 100644 --- a/cpp/include/cudf/strings/convert/convert_ipv4.hpp +++ b/cpp/include/cudf/strings/convert/convert_ipv4.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -48,12 +48,14 @@ namespace strings { * * Any null entries will result in corresponding null entries in the output column. * - * @param strings Strings instance for this operation. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New INT64 column converted from strings. 
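For orientation, a minimal usage sketch of the stream-enabled integer/hex conversion APIs declared above; it is not part of the patch, and it leans on the cudf_test column wrappers purely to keep the example self-contained.

#include <cudf/strings/convert/convert_integers.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <rmm/cuda_stream.hpp>

void hex_round_trip_example()
{
  rmm::cuda_stream stream;  // work is issued on this stream instead of the default stream

  // int32 input matching the integers_to_hex documentation example
  cudf::test::fixed_width_column_wrapper<int32_t> ints({1234, -1, 0, 27, 342718233});

  // 1234 -> "04D2", -1 -> "FFFFFFFF", 0 -> "00", ...
  auto hex = cudf::strings::integers_to_hex(ints, stream.view());

  // convert back on the same stream so the two calls are ordered with respect to each other
  auto round_trip = cudf::strings::hex_to_integers(
    cudf::strings_column_view(hex->view()), cudf::data_type{cudf::type_id::INT32}, stream.view());
}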
+ * @param input Strings instance for this operation + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New INT64 column converted from strings */ std::unique_ptr ipv4_to_integers( - strings_column_view const& strings, + strings_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -71,12 +73,14 @@ std::unique_ptr ipv4_to_integers( * * @throw cudf::logic_error if the input column is not INT64 type. * - * @param integers Integer (INT64) column to convert. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column. + * @param integers Integer (INT64) column to convert + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column */ std::unique_ptr integers_to_ipv4( column_view const& integers, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -96,12 +100,14 @@ std::unique_ptr integers_to_ipv4( * * Any null row results in a null entry for that row in the output column. * - * @param strings Strings instance for this operation. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column of boolean results for each string. + * @param input Strings instance for this operation + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column of boolean results for each string */ std::unique_ptr is_ipv4( - strings_column_view const& strings, + strings_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/convert/convert_lists.hpp b/cpp/include/cudf/strings/convert/convert_lists.hpp index 7ab1bf47b0a..dedf4e95138 100644 --- a/cpp/include/cudf/strings/convert/convert_lists.hpp +++ b/cpp/include/cudf/strings/convert/convert_lists.hpp @@ -50,17 +50,19 @@ namespace strings { * * @throw cudf::logic_error if the input column is not a LIST type with a STRING child. * - * @param input Lists column to format. - * @param na_rep Replacement string for null elements. - * @param separators Strings to use for enclosing list components and separating elements. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column. 
+ * @param input Lists column to format + * @param na_rep Replacement string for null elements + * @param separators Strings to use for enclosing list components and separating elements + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column */ std::unique_ptr format_list_column( lists_column_view const& input, - string_scalar const& na_rep = string_scalar("NULL"), + string_scalar const& na_rep = string_scalar(""), strings_column_view const& separators = strings_column_view(column_view{ data_type{type_id::STRING}, 0, nullptr, nullptr, 0}), + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/convert/convert_urls.hpp b/cpp/include/cudf/strings/convert/convert_urls.hpp index 7f29a0d2149..902835081af 100644 --- a/cpp/include/cudf/strings/convert/convert_urls.hpp +++ b/cpp/include/cudf/strings/convert/convert_urls.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -39,12 +39,14 @@ namespace strings { * * Any null entries will result in corresponding null entries in the output column. * - * @param strings Strings instance for this operation. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column. + * @param input Strings instance for this operation + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column */ std::unique_ptr url_encode( - strings_column_view const& strings, + strings_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -60,12 +62,14 @@ std::unique_ptr url_encode( * * Any null entries will result in corresponding null entries in the output column. * - * @param strings Strings instance for this operation. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column. 
+ * @param input Strings instance for this operation + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column */ std::unique_ptr url_decode( - strings_column_view const& strings, + strings_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/detail/merge.cuh b/cpp/include/cudf/strings/detail/merge.cuh index 965e89cc862..5f50faa158e 100644 --- a/cpp/include/cudf/strings/detail/merge.cuh +++ b/cpp/include/cudf/strings/detail/merge.cuh @@ -18,8 +18,8 @@ #include #include #include +#include #include -#include #include #include #include diff --git a/cpp/include/cudf/strings/detail/scan.hpp b/cpp/include/cudf/strings/detail/scan.hpp new file mode 100644 index 00000000000..611e32e28cd --- /dev/null +++ b/cpp/include/cudf/strings/detail/scan.hpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +#include + +namespace cudf { +namespace strings { +namespace detail { +/** + * @brief Scan function for strings + * + * Called by cudf::scan() with only min and max aggregates. + * + * @tparam Op Either DeviceMin or DeviceMax operations + * + * @param input Input strings column + * @param mask Mask for scan + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column + */ +template +std::unique_ptr scan_inclusive(column_view const& input, + bitmask_type const* mask, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +} // namespace detail +} // namespace strings +} // namespace cudf diff --git a/cpp/include/cudf/strings/extract.hpp b/cpp/include/cudf/strings/extract.hpp index 586cb1f3f26..a4db1ac46da 100644 --- a/cpp/include/cudf/strings/extract.hpp +++ b/cpp/include/cudf/strings/extract.hpp @@ -53,14 +53,16 @@ struct regex_program; * * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. * - * @param strings Strings instance for this operation + * @param input Strings instance for this operation * @param prog Regex program instance + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory * @return Columns of strings extracted from the input column */ std::unique_ptr
extract( - strings_column_view const& strings, + strings_column_view const& input, regex_program const& prog, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -87,14 +89,16 @@ std::unique_ptr<table>
extract( * * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. * - * @param strings Strings instance for this operation + * @param input Strings instance for this operation * @param prog Regex program instance + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate any returned device memory * @return Lists column containing strings extracted from the input column */ std::unique_ptr extract_all_record( - strings_column_view const& strings, + strings_column_view const& input, regex_program const& prog, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/padding.hpp b/cpp/include/cudf/strings/padding.hpp index 7699159fbea..f0cb351eeda 100644 --- a/cpp/include/cudf/strings/padding.hpp +++ b/cpp/include/cudf/strings/padding.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -51,6 +51,7 @@ namespace strings { * Default is pad right (left justify) * @param fill_char Single UTF-8 character to use for padding; * Default is the space character + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New column with padded strings */ @@ -59,6 +60,7 @@ std::unique_ptr pad( size_type width, side_type side = side_type::RIGHT, std::string_view fill_char = " ", + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -79,14 +81,16 @@ std::unique_ptr pad( * r is now ['001234','-09876','+00.34','-342567', '0002+2'] * @endcode * - * @param input Strings instance for this operation. - * @param width The minimum number of characters for each string. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column of strings. 
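A small sketch of how the new trailing stream argument threads through the padding APIs above; illustrative only, with arbitrary column values and the test wrapper used just for brevity.

#include <cudf/strings/padding.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <rmm/cuda_stream.hpp>

void padding_example()
{
  rmm::cuda_stream stream;
  cudf::test::strings_column_wrapper col({"1234", "-9876", "+0.34"});
  cudf::strings_column_view input(col);

  // '1234' -> '001234', '-9876' -> '-09876', '+0.34' -> '+00.34' (width 6, as in the zfill example)
  auto zero_filled = cudf::strings::zfill(input, 6, stream.view());

  // right-pad to width 8 with spaces, issued on the same stream
  auto padded = cudf::strings::pad(input, 8, cudf::strings::side_type::RIGHT, " ", stream.view());
}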
+ * @param input Strings instance for this operation + * @param width The minimum number of characters for each string + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column of strings */ std::unique_ptr zfill( strings_column_view const& input, size_type width, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/repeat_strings.hpp b/cpp/include/cudf/strings/repeat_strings.hpp index 2b6575f80d0..7dc9c33f579 100644 --- a/cpp/include/cudf/strings/repeat_strings.hpp +++ b/cpp/include/cudf/strings/repeat_strings.hpp @@ -52,12 +52,14 @@ namespace strings { * * @param input The scalar containing the string to repeat * @param repeat_times The number of times the input string is repeated + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned string scalar * @return New string scalar in which the input string is repeated */ std::unique_ptr repeat_string( string_scalar const& input, size_type repeat_times, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -81,12 +83,14 @@ std::unique_ptr repeat_string( * * @param input The column containing strings to repeat * @param repeat_times The number of times each input string is repeated + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned strings column * @return New column containing the repeated strings */ std::unique_ptr repeat_strings( strings_column_view const& input, size_type repeat_times, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -115,13 +119,15 @@ std::unique_ptr repeat_strings( * * @param input The column containing strings to repeat * @param repeat_times The column containing numbers of times that the corresponding input strings - * are repeated + * for each row are repeated + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned strings column * @return New column containing the repeated strings. */ std::unique_ptr repeat_strings( strings_column_view const& input, column_view const& repeat_times, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/replace.hpp b/cpp/include/cudf/strings/replace.hpp index 22818f7542e..2476a41e886 100644 --- a/cpp/include/cudf/strings/replace.hpp +++ b/cpp/include/cudf/strings/replace.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -54,19 +54,21 @@ namespace strings { * * @throw cudf::logic_error if target is an empty string. * - * @param strings Strings column for this operation. - * @param target String to search for within each string. 
- * @param repl Replacement string if target is found. + * @param input Strings column for this operation + * @param target String to search for within each string + * @param repl Replacement string if target is found * @param maxrepl Maximum times to replace if target appears multiple times in the input string. * Default of -1 specifies replace all occurrences of target in each string. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column */ std::unique_ptr replace( - strings_column_view const& strings, + strings_column_view const& input, string_scalar const& target, string_scalar const& repl, - int32_t maxrepl = -1, + cudf::size_type maxrepl = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -92,21 +94,23 @@ std::unique_ptr replace( * * @throw cudf::logic_error if start is greater than stop. * - * @param strings Strings column for this operation. + * @param input Strings column for this operation. * @param repl Replacement string for specified positions found. * Default is empty string. * @param start Start position where repl will be added. * Default is 0, first character position. * @param stop End position (exclusive) to use for replacement. * Default of -1 specifies the end of each string. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column */ std::unique_ptr replace_slice( - strings_column_view const& strings, + strings_column_view const& input, string_scalar const& repl = string_scalar(""), size_type start = 0, size_type stop = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -141,16 +145,18 @@ std::unique_ptr replace_slice( * if repls is a single string. * @throw cudf::logic_error if targets or repls contain null entries. * - * @param strings Strings column for this operation. - * @param targets Strings to search for in each string. - * @param repls Corresponding replacement strings for target strings. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column. 
+ * @param input Strings column for this operation + * @param targets Strings to search for in each string + * @param repls Corresponding replacement strings for target strings + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column */ std::unique_ptr replace( - strings_column_view const& strings, + strings_column_view const& input, strings_column_view const& targets, strings_column_view const& repls, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/replace_re.hpp b/cpp/include/cudf/strings/replace_re.hpp index bc6659835c3..77db2882253 100644 --- a/cpp/include/cudf/strings/replace_re.hpp +++ b/cpp/include/cudf/strings/replace_re.hpp @@ -43,20 +43,22 @@ struct regex_program; * * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. * - * @param strings Strings instance for this operation + * @param input Strings instance for this operation * @param prog Regex program instance * @param replacement The string used to replace the matched sequence in each string. * Default is an empty string. * @param max_replace_count The maximum number of times to replace the matched pattern * within each string. Default replaces every substring that is matched. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New strings column */ std::unique_ptr replace_re( - strings_column_view const& strings, + strings_column_view const& input, regex_program const& prog, string_scalar const& replacement = string_scalar(""), std::optional max_replace_count = std::nullopt, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -67,18 +69,20 @@ std::unique_ptr replace_re( * * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. * - * @param strings Strings instance for this operation. - * @param patterns The regular expression patterns to search within each string. - * @param replacements The strings used for replacement. - * @param flags Regex flags for interpreting special characters in the patterns. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column. 
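An illustrative call to the replace overloads above with an explicit stream; not part of the diff, and the sample strings are made up.

#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/replace.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <rmm/cuda_stream.hpp>

void replace_example()
{
  rmm::cuda_stream stream;
  cudf::test::strings_column_wrapper col({"hello", "hullo", "hollow"});
  cudf::strings_column_view input(col);

  cudf::string_scalar target("ll");
  cudf::string_scalar repl("LL");

  // maxrepl of -1 replaces every occurrence; the stream now precedes the memory resource
  auto replaced = cudf::strings::replace(input, target, repl, -1, stream.view());
}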
+ * @param input Strings instance for this operation + * @param patterns The regular expression patterns to search within each string + * @param replacements The strings used for replacement + * @param flags Regex flags for interpreting special characters in the patterns + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column */ std::unique_ptr replace_re( - strings_column_view const& strings, + strings_column_view const& input, std::vector const& patterns, strings_column_view const& replacements, regex_flags const flags = regex_flags::DEFAULT, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -92,16 +96,18 @@ std::unique_ptr replace_re( * @throw cudf::logic_error if capture index values in `replacement` are not in range 0-99, and also * if the index exceeds the group count specified in the pattern * - * @param strings Strings instance for this operation + * @param input Strings instance for this operation * @param prog Regex program instance * @param replacement The replacement template for creating the output string + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New strings column */ std::unique_ptr replace_with_backrefs( - strings_column_view const& strings, + strings_column_view const& input, regex_program const& prog, std::string_view replacement, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace strings diff --git a/cpp/include/cudf/strings/reverse.hpp b/cpp/include/cudf/strings/reverse.hpp index 26fb36a540e..4fc8fbf67c2 100644 --- a/cpp/include/cudf/strings/reverse.hpp +++ b/cpp/include/cudf/strings/reverse.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -42,10 +42,12 @@ namespace strings { * * @param input Strings column for this operation * @param mr Device memory resource used to allocate the returned column's device memory + * @param stream CUDA stream used for device memory operations and kernel launches * @return New strings column */ std::unique_ptr reverse( strings_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/slice.hpp b/cpp/include/cudf/strings/slice.hpp index 5f2c71725eb..f106663be9b 100644 --- a/cpp/include/cudf/strings/slice.hpp +++ b/cpp/include/cudf/strings/slice.hpp @@ -50,18 +50,20 @@ namespace strings { * r2 is now ["lo","ob"] * @endcode * - * @param strings Strings column for this operation. - * @param start First character position to begin the substring. - * @param stop Last character position (exclusive) to end the substring. - * @param step Distance between input characters retrieved. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column with sorted elements of this instance. 
+ * @param input Strings column for this operation + * @param start First character position to begin the substring + * @param stop Last character position (exclusive) to end the substring + * @param step Distance between input characters retrieved + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column with sorted elements of this instance */ std::unique_ptr slice_strings( - strings_column_view const& strings, + strings_column_view const& input, numeric_scalar const& start = numeric_scalar(0, false), numeric_scalar const& stop = numeric_scalar(0, false), numeric_scalar const& step = numeric_scalar(1), + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -95,16 +97,18 @@ std::unique_ptr slice_strings( * @throw cudf::logic_error if starts and stops are not same integer type. * @throw cudf::logic_error if starts or stops contains nulls. * - * @param strings Strings column for this operation. - * @param starts First character positions to begin the substring. - * @param stops Last character (exclusive) positions to end the substring. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column with sorted elements of this instance. + * @param input Strings column for this operation + * @param starts First character positions to begin the substring + * @param stops Last character (exclusive) positions to end the substring + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column with sorted elements of this instance */ std::unique_ptr slice_strings( - strings_column_view const& strings, + strings_column_view const& input, column_view const& starts, column_view const& stops, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/split/partition.hpp b/cpp/include/cudf/strings/split/partition.hpp index 52ffb735eb7..25eedf1e86b 100644 --- a/cpp/include/cudf/strings/split/partition.hpp +++ b/cpp/include/cudf/strings/split/partition.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -51,15 +51,17 @@ namespace strings { * r[2] is ["cd","g_h"] * @endcode * - * @param strings Strings instance for this operation. + * @param input Strings instance for this operation * @param delimiter UTF-8 encoded string indicating where to split each string. * Default of empty string indicates split on whitespace. - * @param mr Device memory resource used to allocate the returned table's device memory. - * @return New table of strings columns. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned table's device memory + * @return New table of strings columns */ std::unique_ptr
partition( - strings_column_view const& strings, + strings_column_view const& input, string_scalar const& delimiter = string_scalar(""), + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -83,15 +85,17 @@ std::unique_ptr<table>
partition( * r[2] is ["cd","h"] * @endcode * - * @param strings Strings instance for this operation. + * @param input Strings instance for this operation * @param delimiter UTF-8 encoded string indicating where to split each string. * Default of empty string indicates split on whitespace. - * @param mr Device memory resource used to allocate the returned table's device memory. - * @return New strings columns. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned table's device memory + * @return New strings columns */ std::unique_ptr
rpartition( - strings_column_view const& strings, + strings_column_view const& input, string_scalar const& delimiter = string_scalar(""), + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/split/split_re.hpp b/cpp/include/cudf/strings/split/split_re.hpp index 14fcfaecdcd..f1736cb7e0c 100644 --- a/cpp/include/cudf/strings/split/split_re.hpp +++ b/cpp/include/cudf/strings/split/split_re.hpp @@ -75,6 +75,7 @@ struct regex_program; * @param prog Regex program instance * @param maxsplit Maximum number of splits to perform. * Default of -1 indicates all possible splits on each string. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned result's device memory * @return A table of columns of strings */ @@ -82,6 +83,7 @@ std::unique_ptr
split_re( strings_column_view const& input, regex_program const& prog, size_type maxsplit = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -125,17 +127,19 @@ std::unique_ptr<table>
split_re( * * @throw cudf::logic_error if `pattern` is empty. * - * @param input A column of string elements to be split. + * @param input A column of string elements to be split * @param prog Regex program instance * @param maxsplit Maximum number of splits to perform. * Default of -1 indicates all possible splits on each string. - * @param mr Device memory resource used to allocate the returned result's device memory. - * @return A table of columns of strings. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned result's device memory + * @return A table of columns of strings */ std::unique_ptr
rsplit_re( strings_column_view const& input, regex_program const& prog, size_type maxsplit = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -185,13 +189,15 @@ std::unique_ptr<table>
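A usage sketch for the regex split APIs above, assuming a caller-managed stream; illustrative only, with an arbitrary pattern and input rows.

#include <cudf/strings/regex/regex_program.hpp>
#include <cudf/strings/split/split_re.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <rmm/cuda_stream.hpp>

void split_example()
{
  rmm::cuda_stream stream;
  cudf::test::strings_column_wrapper col({"a_bc def_g", "a__bc", "_ab cd", "ab_cd "});
  cudf::strings_column_view input(col);

  // split on runs of '_' or space characters
  auto prog    = cudf::strings::regex_program::create("[_ ]+");
  auto parts   = cudf::strings::split_re(input, *prog, -1, stream.view());         // table of columns
  auto records = cudf::strings::split_record_re(input, *prog, -1, stream.view());  // lists column
}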
rsplit_re( * @param prog Regex program instance * @param maxsplit Maximum number of splits to perform. * Default of -1 indicates all possible splits on each string. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned result's device memory - * @return Lists column of strings. + * @return Lists column of strings */ std::unique_ptr split_record_re( strings_column_view const& input, regex_program const& prog, size_type maxsplit = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -243,6 +249,7 @@ std::unique_ptr split_record_re( * @param prog Regex program instance * @param maxsplit Maximum number of splits to perform. * Default of -1 indicates all possible splits on each string. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned result's device memory * @return Lists column of strings */ @@ -250,6 +257,7 @@ std::unique_ptr rsplit_record_re( strings_column_view const& input, regex_program const& prog, size_type maxsplit = -1, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/strip.hpp b/cpp/include/cudf/strings/strip.hpp index adf3b291144..556d6805ac3 100644 --- a/cpp/include/cudf/strings/strip.hpp +++ b/cpp/include/cudf/strings/strip.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -57,6 +57,7 @@ namespace strings { * string; Default is both * @param to_strip UTF-8 encoded characters to strip from each string; * Default is empty string which indicates strip whitespace characters + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column. */ @@ -64,6 +65,7 @@ std::unique_ptr strip( strings_column_view const& input, side_type side = side_type::BOTH, string_scalar const& to_strip = string_scalar(""), + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/translate.hpp b/cpp/include/cudf/strings/translate.hpp index 0cbf6b22029..4bd09352b09 100644 --- a/cpp/include/cudf/strings/translate.hpp +++ b/cpp/include/cudf/strings/translate.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -47,14 +47,16 @@ namespace strings { * r is now ["AA", "", "cccc", "AcQ"] * @endcode * - * @param strings Strings instance for this operation. - * @param chars_table Table of UTF-8 character mappings. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column with padded strings. 
+ * @param input Strings instance for this operation + * @param chars_table Table of UTF-8 character mappings + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column with padded strings */ std::unique_ptr translate( - strings_column_view const& strings, + strings_column_view const& input, std::vector> const& chars_table, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -87,19 +89,21 @@ enum class filter_type : bool { * * @throw cudf::logic_error if `replacement` is invalid * - * @param strings Strings instance for this operation. - * @param characters_to_filter Table of character ranges to filter on. + * @param input Strings instance for this operation + * @param characters_to_filter Table of character ranges to filter on * @param keep_characters If true, the `characters_to_filter` are retained and all other characters - * are removed. - * @param replacement Optional replacement string for each character removed. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column with filtered strings. + * are removed + * @param replacement Optional replacement string for each character removed + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column with filtered strings */ std::unique_ptr filter_characters( - strings_column_view const& strings, + strings_column_view const& input, std::vector> characters_to_filter, filter_type keep_characters = filter_type::KEEP, string_scalar const& replacement = string_scalar(""), + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/wrap.hpp b/cpp/include/cudf/strings/wrap.hpp index 8d2d43c7f0f..efdc3e62aff 100644 --- a/cpp/include/cudf/strings/wrap.hpp +++ b/cpp/include/cudf/strings/wrap.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -57,14 +57,16 @@ namespace strings { * wrapped_string_tbl = ["the quick\nbrown fox\njumped over\nthe lazy\nbrown dog", "hello, world"] * ``` * - * @param[in] strings String column. - * @param[in] width Maximum character width of a line within each string. - * @param[in] mr Device memory resource used to allocate the returned column's device memory - * @return Column of wrapped strings. 
+ * @param input String column + * @param width Maximum character width of a line within each string + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return Column of wrapped strings */ std::unique_ptr wrap( - strings_column_view const& strings, + strings_column_view const& input, size_type width, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/detail/json.hpp b/cpp/include/cudf/structs/detail/scan.hpp similarity index 55% rename from cpp/include/cudf/strings/detail/json.hpp rename to cpp/include/cudf/structs/detail/scan.hpp index 0fb06d36570..531e0a6c65f 100644 --- a/cpp/include/cudf/strings/detail/json.hpp +++ b/cpp/include/cudf/structs/detail/scan.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,31 +13,33 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - #pragma once -#include -#include -#include +#include #include #include namespace cudf { -namespace strings { +namespace structs { namespace detail { - /** - * @copydoc cudf::strings::get_json_object + * @brief Scan function for struct column type + * + * Called by cudf::scan() with only min and max aggregates. + * + * @tparam Op Either DeviceMin or DeviceMax operations * + * @param input Input column * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New struct column */ -std::unique_ptr get_json_object(cudf::strings_column_view const& col, - cudf::string_scalar const& json_path, - cudf::strings::get_json_object_options options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); +template +std::unique_ptr scan_inclusive(column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail -} // namespace strings +} // namespace structs } // namespace cudf diff --git a/cpp/include/cudf/table/experimental/row_operators.cuh b/cpp/include/cudf/table/experimental/row_operators.cuh index 6b024d902a9..6946ccdb213 100644 --- a/cpp/include/cudf/table/experimental/row_operators.cuh +++ b/cpp/include/cudf/table/experimental/row_operators.cuh @@ -52,6 +52,7 @@ #include #include #include +#include #include namespace cudf { @@ -264,6 +265,7 @@ template class device_row_comparator { + public: friend class self_comparator; ///< Allow self_comparator to access private members friend class two_table_comparator; ///< Allow two_table_comparator to access private members @@ -274,6 +276,8 @@ class device_row_comparator { * @param check_nulls Indicates if any input column contains nulls. * @param lhs The first table * @param rhs The second table (may be the same table as `lhs`) + * @param l_dremel_device_views lhs table dremel device view for list type + * @param r_dremel_device_views rhs table dremel device view for list type * @param depth Optional, device array the same length as a row that contains starting depths of * columns if they're nested, and 0 otherwise. 
* @param column_order Optional, device array the same length as a row that indicates the desired @@ -305,6 +309,44 @@ class device_row_comparator { { } + /** + * @brief Construct a function object for performing a lexicographic + * comparison between the rows of two tables. + * This is a special overload to allow device-side construction of the + * comparator for cases where no preprocessing is needed, i.e. tables with + * non-nested type columns. + * + * @param check_nulls Indicates if any input column contains nulls. + * @param lhs The first table + * @param rhs The second table (may be the same table as `lhs`) + * @param column_order Optional, device array the same length as a row that indicates the desired + * ascending/descending order of each column in a row. If `nullopt`, it is assumed all columns are + * sorted in ascending order. + * @param null_precedence Optional, device array the same length as a row and indicates how null + * values compare to all other for every column. If `nullopt`, then null precedence would be + * `null_order::BEFORE` for all columns. + * @param comparator Physical element relational comparison functor. + */ + template + __device__ device_row_comparator( + Nullate check_nulls, + table_device_view lhs, + table_device_view rhs, + std::optional> column_order = std::nullopt, + std::optional> null_precedence = std::nullopt, + PhysicalElementComparator comparator = {}) noexcept + : _lhs{lhs}, + _rhs{rhs}, + _l_dremel{}, + _r_dremel{}, + _check_nulls{check_nulls}, + _depth{}, + _column_order{column_order}, + _null_precedence{null_precedence}, + _comparator{comparator} + { + } + /** * @brief Performs a relational comparison between two elements in two columns. */ @@ -323,6 +365,8 @@ class device_row_comparator { * @param depth The depth of the column if part of a nested column @see * preprocessed_table::depths * @param comparator Physical element relational comparison functor. + * @param l_dremel_device_view <> + * @param r_dremel_device_view <> */ __device__ element_comparator(Nullate check_nulls, column_device_view lhs, @@ -370,6 +414,13 @@ class device_row_comparator { std::numeric_limits::max()); } + /** + * @brief Throws run-time error when columns types cannot be compared + * or if this class is instantiated with `has_nested_columns = false` but + * passed tables with nested columns + * + * @return Ordering + */ template () and (not has_nested_columns or not cudf::is_nested()))> @@ -379,6 +430,14 @@ class device_row_comparator { CUDF_UNREACHABLE("Attempted to compare elements of uncomparable types."); } + /** + * @brief Compares two struct-type columns + * + * @param lhs_element_index The index of the first element + * @param rhs_element_index The index of the second element + * @return Indicates the relationship between the elements in the `lhs` and `rhs` columns, along + * with the depth at which a null value was encountered. + */ template )> __device__ cuda::std::pair operator()( @@ -413,6 +472,14 @@ class device_row_comparator { rhs_element_index); } + /** + * @brief Compares two list-type columns + * + * @param lhs_element_index The index of the first element + * @param rhs_element_index The index of the second element + * @return Indicates the relationship between the elements in the `lhs` and `rhs` columns, along + * with the depth at which a null value was encountered. 
+ */ template )> __device__ cuda::std::pair operator()(size_type lhs_element_index, diff --git a/cpp/include/cudf/table/table_view.hpp b/cpp/include/cudf/table/table_view.hpp index b90b2dac012..5d9c930d137 100644 --- a/cpp/include/cudf/table/table_view.hpp +++ b/cpp/include/cudf/table/table_view.hpp @@ -336,6 +336,23 @@ inline bool has_nested_nulls(table_view const& input) }); } +/** + * @brief Returns True if the table has a nullable column at any level of the column hierarchy + * + * @param input The table to check for nullable columns + * @return True if the table has nullable columns at any level of the column hierarchy, false + * otherwise + */ +inline bool has_nested_nullable_columns(table_view const& input) +{ + return std::any_of(input.begin(), input.end(), [](auto const& col) { + return col.nullable() || + std::any_of(col.child_begin(), col.child_end(), [](auto const& child_col) { + return has_nested_nullable_columns(table_view{{child_col}}); + }); + }); +} + /** * @brief The function to collect all nullable columns at all nested levels in a given table. * diff --git a/cpp/include/cudf/unary.hpp b/cpp/include/cudf/unary.hpp index 1130c41afe5..64e802d88dd 100644 --- a/cpp/include/cudf/unary.hpp +++ b/cpp/include/cudf/unary.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #pragma once #include +#include #include @@ -65,6 +66,7 @@ enum class unary_operator : int32_t { * * @param input A `column_view` as input * @param op operation to perform + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * * @returns Column of same size as `input` containing result of the operation @@ -72,6 +74,7 @@ enum class unary_operator : int32_t { std::unique_ptr unary_operation( cudf::column_view const& input, cudf::unary_operator op, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -79,6 +82,7 @@ std::unique_ptr unary_operation( * indicates the value is null and `false` indicates the value is valid. * * @param input A `column_view` as input + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * * @returns A non-nullable column of `type_id::BOOL8` elements with `true` @@ -86,6 +90,7 @@ std::unique_ptr unary_operation( */ std::unique_ptr is_null( cudf::column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -93,6 +98,7 @@ std::unique_ptr is_null( * indicates the value is valid and `false` indicates the value is null. 
* * @param input A `column_view` as input + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * * @returns A non-nullable column of `type_id::BOOL8` elements with `false` @@ -100,6 +106,7 @@ std::unique_ptr is_null( */ std::unique_ptr is_valid( cudf::column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -109,6 +116,7 @@ std::unique_ptr is_valid( * * @param input Input column * @param out_type Desired datatype of output column + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * * @returns Column of same size as `input` containing result of the cast operation @@ -117,6 +125,7 @@ std::unique_ptr is_valid( std::unique_ptr cast( column_view const& input, data_type out_type, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -127,12 +136,14 @@ std::unique_ptr cast( * @throws cudf::logic_error if `input` is a non-floating point type * * @param input A column of floating-point elements + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * * @returns A non-nullable column of `type_id::BOOL8` elements with `true` representing `NAN` values */ std::unique_ptr is_nan( cudf::column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -143,6 +154,7 @@ std::unique_ptr is_nan( * @throws cudf::logic_error if `input` is a non-floating point type * * @param input A column of floating-point elements + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * * @returns A non-nullable column of `type_id::BOOL8` elements with `false` representing `NAN` @@ -150,6 +162,7 @@ std::unique_ptr is_nan( */ std::unique_ptr is_not_nan( cudf::column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp index 51f5d9d571a..2dda0740b96 100644 --- a/cpp/include/cudf/utilities/traits.hpp +++ b/cpp/include/cudf/utilities/traits.hpp @@ -279,6 +279,30 @@ constexpr inline bool is_integral() */ bool is_integral(data_type type); +/** + * @brief Indicates whether the type `T` is an integral type but not bool type. + * + * @tparam T The type to verify + * @return true `T` is integral but not bool + * @return false `T` is not integral or is bool + */ +template +constexpr inline bool is_integral_not_bool() +{ + return cuda::std::is_integral_v and not std::is_same_v; +} + +/** + * @brief Indicates whether `type` is a integral `data_type` and not BOOL8 + * + * "Integral" types are fundamental integer types such as `INT*` and `UINT*`. 
+ * + * @param type The `data_type` to verify + * @return true `type` is integral but not bool + * @return false `type` is integral or is bool + */ +bool is_integral_not_bool(data_type type); + /** * @brief Indicates whether the type `T` is a floating point type. * diff --git a/cpp/include/cudf_test/column_utilities.hpp b/cpp/include/cudf_test/column_utilities.hpp index 059bd10eae1..f6872fcdd6d 100644 --- a/cpp/include/cudf_test/column_utilities.hpp +++ b/cpp/include/cudf_test/column_utilities.hpp @@ -140,39 +140,6 @@ void expect_equal_buffers(void const* lhs, void const* rhs, std::size_t size_byt */ void expect_column_empty(cudf::column_view const& col); -/** - * @brief Formats a column view as a string - * - * @param col The column view - * @param delimiter The delimiter to put between strings - */ -std::string to_string(cudf::column_view const& col, std::string const& delimiter); - -/** - * @brief Formats a null mask as a string - * - * @param null_mask The null mask buffer - * @param null_mask_size Size of the null mask (in rows) - */ -std::string to_string(std::vector const& null_mask, size_type null_mask_size); - -/** - * @brief Convert column values to a host vector of strings - * - * @param col The column view - */ -std::vector to_strings(cudf::column_view const& col); - -/** - * @brief Print a column view to an ostream - * - * @param os The output stream - * @param col The column view - */ -void print(cudf::column_view const& col, - std::ostream& os = std::cout, - std::string const& delimiter = ","); - /** * @brief Copy the null bitmask from a column view to a host vector * diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp index c0932b81dc3..b9f2e0d9868 100644 --- a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -802,7 +803,8 @@ class strings_column_wrapper : public detail::column_wrapper { offsets, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); auto d_bitmask = cudf::detail::make_device_uvector_sync( null_mask, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); - wrapped = cudf::make_strings_column(d_chars, d_offsets, d_bitmask, null_count); + wrapped = cudf::make_strings_column( + d_chars, d_offsets, d_bitmask, null_count, cudf::test::get_default_stream()); } /** @@ -1281,6 +1283,11 @@ class dictionary_column_wrapper : public detail::column_wrapper { template class lists_column_wrapper : public detail::column_wrapper { public: + /** + * @brief Cast to lists_column_view + */ + operator lists_column_view() const { return cudf::lists_column_view{wrapped->view()}; } + /** * @brief Construct a lists column containing a single list of fixed-width * type from an initializer list of values. 
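Referring back to the is_integral_not_bool trait introduced in traits.hpp above, a compile-time sketch of its intended behavior (not part of the patch):

#include <cudf/types.hpp>
#include <cudf/utilities/traits.hpp>

static_assert(cudf::is_integral_not_bool<int32_t>(), "plain integer types qualify");
static_assert(not cudf::is_integral_not_bool<bool>(), "bool is explicitly excluded");
static_assert(not cudf::is_integral_not_bool<float>(), "floating-point types are not integral");

// The runtime overload mirrors the trait for data_type values, e.g.
//   cudf::is_integral_not_bool(cudf::data_type{cudf::type_id::INT64})  -> true
//   cudf::is_integral_not_bool(cudf::data_type{cudf::type_id::BOOL8})  -> false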
@@ -1542,8 +1549,12 @@ class lists_column_wrapper : public detail::column_wrapper { rmm::device_buffer&& null_mask) { // construct the list column - wrapped = make_lists_column( - num_rows, std::move(offsets), std::move(values), null_count, std::move(null_mask)); + wrapped = make_lists_column(num_rows, + std::move(offsets), + std::move(values), + null_count, + std::move(null_mask), + cudf::test::get_default_stream()); } /** @@ -1618,8 +1629,12 @@ class lists_column_wrapper : public detail::column_wrapper { }(); // construct the list column - wrapped = make_lists_column( - cols.size(), std::move(offsets), std::move(data), null_count, std::move(null_mask)); + wrapped = make_lists_column(cols.size(), + std::move(offsets), + std::move(data), + null_count, + std::move(null_mask), + cudf::test::get_default_stream()); } /** @@ -1647,8 +1662,12 @@ class lists_column_wrapper : public detail::column_wrapper { depth = 0; size_type num_elements = offsets->size() == 0 ? 0 : offsets->size() - 1; - wrapped = - make_lists_column(num_elements, std::move(offsets), std::move(c), 0, rmm::device_buffer{}); + wrapped = make_lists_column(num_elements, + std::move(offsets), + std::move(c), + 0, + rmm::device_buffer{}, + cudf::test::get_default_stream()); } /** @@ -1697,12 +1716,15 @@ class lists_column_wrapper : public detail::column_wrapper { } lists_column_view lcv(col); - return make_lists_column(col.size(), - std::make_unique(lcv.offsets()), - normalize_column(lists_column_view(col).child(), - lists_column_view(expected_hierarchy).child()), - col.null_count(), - copy_bitmask(col)); + return make_lists_column( + col.size(), + std::make_unique(lcv.offsets()), + normalize_column(lists_column_view(col).child(), + lists_column_view(expected_hierarchy).child()), + col.null_count(), + cudf::detail::copy_bitmask( + col, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()), + cudf::test::get_default_stream()); } std::pair, std::vector>> preprocess_columns( @@ -1825,7 +1847,8 @@ class structs_column_wrapper : public detail::column_wrapper { child_column_wrappers.end(), std::back_inserter(child_columns), [&](auto const& column_wrapper) { - return std::make_unique(column_wrapper.get()); + return std::make_unique(column_wrapper.get(), + cudf::test::get_default_stream()); }); init(std::move(child_columns), validity); } @@ -1861,7 +1884,8 @@ class structs_column_wrapper : public detail::column_wrapper { child_column_wrappers.end(), std::back_inserter(child_columns), [&](auto const& column_wrapper) { - return std::make_unique(column_wrapper.get()); + return std::make_unique(column_wrapper.get(), + cudf::test::get_default_stream()); }); init(std::move(child_columns), validity_iter); } @@ -1885,8 +1909,11 @@ class structs_column_wrapper : public detail::column_wrapper { return cudf::test::detail::make_null_mask(validity.begin(), validity.end()); }(); - wrapped = cudf::make_structs_column( - num_rows, std::move(child_columns), null_count, std::move(null_mask)); + wrapped = cudf::make_structs_column(num_rows, + std::move(child_columns), + null_count, + std::move(null_mask), + cudf::test::get_default_stream()); } template diff --git a/cpp/include/cudf_test/debug_utilities.hpp b/cpp/include/cudf_test/debug_utilities.hpp new file mode 100644 index 00000000000..a0881490b82 --- /dev/null +++ b/cpp/include/cudf_test/debug_utilities.hpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace cudf::test { + +/** + * @brief Formats a column view as a string + * + * @param col The input column view + * @param delimiter The delimiter to put between strings + */ +std::string to_string(cudf::column_view const& col, std::string const& delimiter); + +/** + * @brief Convert column values to a host vector of strings + * + * @param col The input column view + */ +std::vector to_strings(cudf::column_view const& col); + +/** + * @brief Print a column view to an ostream + * + * @param col The input column view + * @param os The output stream + */ +void print(cudf::column_view const& col, std::ostream& os = std::cout); + +} // namespace cudf::test diff --git a/cpp/include/cudf_test/detail/column_utilities.hpp b/cpp/include/cudf_test/detail/column_utilities.hpp deleted file mode 100644 index f8270f61f10..00000000000 --- a/cpp/include/cudf_test/detail/column_utilities.hpp +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -namespace cudf { -namespace test { -namespace detail { - -/** - * @brief Formats a column view as a string - * - * @param col The column view - * @param delimiter The delimiter to put between strings - * @param indent Indentation for all output. See detail::to_strings for detailed - * explanation. - */ -std::string to_string(cudf::column_view const& col, - std::string const& delimiter, - std::string const& indent = ""); - -/** - * @brief Formats a null mask as a string - * - * @param null_mask The null mask buffer - * @param null_mask_size Size of the null mask (in rows) - * @param indent Indentation for all output. See detail::to_strings for detailed - * explanation. - */ -std::string to_string(std::vector const& null_mask, - size_type null_mask_size, - std::string const& indent = ""); - -/** - * @brief Convert column values to a host vector of strings - * - * Supports indentation of all output. For example, if the displayed output of your column - * would be - * - * @code{.pseudo} - * "1,2,3,4,5" - * @endcode - * and the `indent` parameter was " ", that indentation would be prepended to - * result in the output - * @code{.pseudo} - * " 1,2,3,4,5" - * @endcode - * - * The can be useful for displaying complex types. An example use case would be for - * displaying the nesting of a LIST type column (via recursion). 
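The to_string/to_strings/print helpers removed from column_utilities.hpp earlier in this diff now live in the new cudf_test/debug_utilities.hpp header shown above. A small usage sketch; the test wrapper type is assumed from the existing test utilities:

#include <cudf_test/column_wrapper.hpp>
#include <cudf_test/debug_utilities.hpp>

#include <iostream>
#include <string>

void dump_column()
{
  cudf::test::fixed_width_column_wrapper<int32_t> col({1, 2, 3, 4, 5});
  // Render the column as a single delimited string ...
  std::string repr = cudf::test::to_string(col, ",");
  // ... or write it directly to an output stream.
  cudf::test::print(col, std::cout);
}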
- * - * List>: - * Length : 3 - * Offsets : 0, 2, 5, 6 - * Children : - * List: - * Length : 6 - * Offsets : 0, 2, 4, 7, 8, 9, 11 - * Children : - * 1, 2, 3, 4, 5, 6, 7, 0, 8, 9, 10 - * - * @param col The column view - * @param indent Indentation for all output - */ -std::vector to_strings(cudf::column_view const& col, std::string const& indent = ""); - -} // namespace detail -} // namespace test -} // namespace cudf diff --git a/cpp/include/doxygen_groups.h b/cpp/include/doxygen_groups.h index 4da2807bbe6..8845b84613d 100644 --- a/cpp/include/doxygen_groups.h +++ b/cpp/include/doxygen_groups.h @@ -130,7 +130,6 @@ * @defgroup strings_replace Replacing * @defgroup strings_split Splitting * @defgroup strings_extract Extracting - * @defgroup strings_json JSON * @defgroup strings_regex Regex * @} * @defgroup dictionary_apis Dictionary @@ -146,6 +145,10 @@ * @defgroup io_datasources Data Sources * @defgroup io_datasinks Data Sinks * @} + * @defgroup json_apis JSON + * @{ + * @defgroup json_object JSON Path + * @} * @defgroup lists_apis Lists * @{ * @defgroup lists_combine Combining diff --git a/cpp/include/nvtext/bpe_tokenize.hpp b/cpp/include/nvtext/byte_pair_encoding.hpp similarity index 72% rename from cpp/include/nvtext/bpe_tokenize.hpp rename to cpp/include/nvtext/byte_pair_encoding.hpp index c67f4bd8b1c..632a3cc279f 100644 --- a/cpp/include/nvtext/bpe_tokenize.hpp +++ b/cpp/include/nvtext/byte_pair_encoding.hpp @@ -32,11 +32,11 @@ namespace nvtext { /** * @brief The table of merge pairs for the BPE encoder. * - * To create an instance, call @ref nvtext::load_merge_pairs_file + * To create an instance, call @ref nvtext::load_merge_pairs */ struct bpe_merge_pairs { struct bpe_merge_pairs_impl; - std::unique_ptr impl{}; ///< Implementation of the BPE merge pairs table. + bpe_merge_pairs_impl* impl{}; ///< Implementation of the BPE merge pairs table. /** * @brief Construct a new bpe merge pairs object @@ -61,11 +61,14 @@ struct bpe_merge_pairs { rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); ~bpe_merge_pairs(); + bpe_merge_pairs(); }; /** * @brief Create a nvtext::bpe_merge_pairs from an input file. * + * @deprecated Since 23.12 + * * The file should contain a pair of strings per line separated by * a single space. * @@ -94,10 +97,40 @@ struct bpe_merge_pairs { * @param mr Memory resource to allocate any returned objects. * @return A nvtext::bpe_merge_pairs object */ -std::unique_ptr load_merge_pairs_file( +[[deprecated]] std::unique_ptr load_merge_pairs_file( std::string const& filename_merges, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Create a nvtext::bpe_merge_pairs from a strings column + * + * The input column should contain a unique pair of strings per line separated by + * a single space. An incorrect format or non-unique entries will result in + * undefined behavior. + * + * Example: + * @code{.pseudo} + * merge_pairs = ["e n", "i t", "i s", "e s", "en t", "c e", "es t", "en ce", "t est", "s ent"] + * mps = load_merge_pairs(merge_pairs) + * // the mps object can be passed to the byte_pair_encoding API + * @endcode + * + * The pairs are expected to be ordered in the file by their rank + * relative to each other. A pair earlier in the file has priority over + * any pairs below it. 
+ * + * @throw cudf::logic_error if `merge_pairs` is empty or contains nulls + * + * @param merge_pairs Column containing the unique merge pairs + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Memory resource to allocate any returned objects + * @return A nvtext::bpe_merge_pairs object + */ +std::unique_ptr load_merge_pairs( + cudf::strings_column_view const& merge_pairs, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Byte pair encode the input strings. * @@ -110,7 +143,8 @@ std::unique_ptr load_merge_pairs_file( * pairs before the result is joined to make the output string. * * @code{.pseudo} - * mps = load_merges_file("merges.txt") // see doxygen for example contents + * merge_pairs = ["e n", "i t", "i s", "e s", "en t", "c e", "es t", "en ce", "t est", "s ent"] + * mps = load_merge_pairs(merge_pairs) * input = ["test sentence", "thisis test"] * result = byte_pair_encoding(input, mps) * result is now ["test sent ence", "this is test"] @@ -120,7 +154,7 @@ std::unique_ptr load_merge_pairs_file( * @throw cudf::logic_error if `separator` is invalid * * @param input Strings to encode. - * @param merges_pairs Created by a call to @ref nvtext::load_merge_pairs_file. + * @param merges_pairs Created by a call to @ref nvtext::load_merge_pairs. * @param separator String used to build the output after encoding. * Default is a space. * @param mr Memory resource to allocate any returned objects. diff --git a/cpp/include/nvtext/normalize.hpp b/cpp/include/nvtext/normalize.hpp index 1be25b4f1f4..3cbff5c744b 100644 --- a/cpp/include/nvtext/normalize.hpp +++ b/cpp/include/nvtext/normalize.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -44,12 +44,14 @@ namespace nvtext { * A null input element at row `i` produces a corresponding null entry * for row `i` in the output column. * - * @param strings Strings column to normalize. - * @param mr Device memory resource used to allocate the returned column's device memory. + * @param input Strings column to normalize + * @param mr Device memory resource used to allocate the returned column's device memory + * @param stream CUDA stream used for device memory operations and kernel launches * @return New strings columns of normalized strings. */ std::unique_ptr normalize_spaces( - cudf::strings_column_view const& strings, + cudf::strings_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -89,16 +91,18 @@ std::unique_ptr normalize_spaces( * This function requires about 16x the number of character bytes in the input * strings column as working memory. * - * @param strings The input strings to normalize. + * @param input The input strings to normalize * @param do_lower_case If true, upper-case characters are converted to * lower-case and accents are stripped from those characters. * If false, accented and upper-case characters are not transformed. - * @param mr Memory resource to allocate any returned objects. 
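Taken together with the deprecation of load_merge_pairs_file, the new column-based nvtext::load_merge_pairs above changes the typical byte-pair-encoding call sequence. A hedged sketch of the new flow, using a test wrapper to build the merge-pair column (wrapper and include names assumed):

#include <nvtext/byte_pair_encoding.hpp>

#include <cudf/strings/strings_column_view.hpp>
#include <cudf_test/column_wrapper.hpp>

void bpe_example()
{
  // Merge pairs ordered by rank; earlier pairs take priority.
  cudf::test::strings_column_wrapper merge_pairs(
    {"e n", "i t", "i s", "e s", "en t", "c e", "es t", "en ce", "t est", "s ent"});
  auto mps = nvtext::load_merge_pairs(cudf::strings_column_view{merge_pairs});

  cudf::test::strings_column_wrapper input({"test sentence", "thisis test"});
  // Per the documentation above, the result is ["test sent ence", "this is test"].
  auto result = nvtext::byte_pair_encoding(cudf::strings_column_view{input}, *mps);
}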
+ * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Memory resource to allocate any returned objects * @return Normalized strings column */ std::unique_ptr normalize_characters( - cudf::strings_column_view const& strings, + cudf::strings_column_view const& input, bool do_lower_case, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/nvtext/replace.hpp b/cpp/include/nvtext/replace.hpp index 0dde7f195b9..88cf7d41901 100644 --- a/cpp/include/nvtext/replace.hpp +++ b/cpp/include/nvtext/replace.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -73,19 +73,21 @@ namespace nvtext { * @throw cudf::logic_error if targets or replacements contain nulls * @throw cudf::logic_error if delimiter is invalid * - * @param strings Strings column to replace. - * @param targets Strings to compare against tokens found in `strings` + * @param input Strings column to replace + * @param targets Strings to compare against tokens found in `input` * @param replacements Replacement strings for each string in `targets` * @param delimiter Characters used to separate each string into tokens. * The default of empty string will identify tokens using whitespace. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings columns of with replaced strings. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings columns of with replaced strings */ std::unique_ptr replace_tokens( - cudf::strings_column_view const& strings, + cudf::strings_column_view const& input, cudf::strings_column_view const& targets, cudf::strings_column_view const& replacements, cudf::string_scalar const& delimiter = cudf::string_scalar{""}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -120,19 +122,21 @@ std::unique_ptr replace_tokens( * * @throw cudf::logic_error if `delimiter` or `replacement` is invalid * - * @param strings Strings column to replace. - * @param min_token_length The minimum number of characters to retain a token in the output string. - * @param replacement Optional replacement string to be used in place of removed tokens. + * @param input Strings column to replace + * @param min_token_length The minimum number of characters to retain a token in the output string + * @param replacement Optional replacement string to be used in place of removed tokens * @param delimiter Characters used to separate each string into tokens. * The default of empty string will identify tokens using whitespace. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings columns of with replaced strings. 
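replace_tokens (and filter_tokens just below) now take an explicit CUDA stream ahead of the memory resource. A sketch of passing a user-owned stream; the wrapper type is assumed from the test utilities:

#include <nvtext/replace.hpp>

#include <cudf/scalar/scalar.hpp>
#include <cudf_test/column_wrapper.hpp>

#include <rmm/cuda_stream.hpp>

void replace_tokens_on_stream()
{
  cudf::test::strings_column_wrapper input({"this is me", "the bird flew"});
  cudf::test::strings_column_wrapper targets({"is", "the"});
  cudf::test::strings_column_wrapper repls({"+", "-"});

  rmm::cuda_stream stream;  // user-owned, non-default stream
  auto result = nvtext::replace_tokens(cudf::strings_column_view{input},
                                       cudf::strings_column_view{targets},
                                       cudf::strings_column_view{repls},
                                       cudf::string_scalar{"", true, stream.view()},  // empty = whitespace delimiter
                                       stream.view());
}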
+ * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings columns of with replaced strings */ std::unique_ptr filter_tokens( - cudf::strings_column_view const& strings, + cudf::strings_column_view const& input, cudf::size_type min_token_length, cudf::string_scalar const& replacement = cudf::string_scalar{""}, cudf::string_scalar const& delimiter = cudf::string_scalar{""}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/nvtext/tokenize.hpp b/cpp/include/nvtext/tokenize.hpp index 44f8f44557c..107fefcc3bf 100644 --- a/cpp/include/nvtext/tokenize.hpp +++ b/cpp/include/nvtext/tokenize.hpp @@ -49,15 +49,17 @@ namespace nvtext { * * All null row entries are ignored and the output contains all valid rows. * - * @param strings Strings column tokenize. + * @param input Strings column to tokenize * @param delimiter UTF-8 characters used to separate each string into tokens. * The default of empty string will separate tokens using whitespace. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings columns of tokens. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings columns of tokens */ std::unique_ptr tokenize( - cudf::strings_column_view const& strings, + cudf::strings_column_view const& input, cudf::string_scalar const& delimiter = cudf::string_scalar{""}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -84,14 +86,16 @@ std::unique_ptr tokenize( * * @throw cudf::logic_error if the delimiters column is empty or contains nulls. * - * @param strings Strings column to tokenize. - * @param delimiters Strings used to separate individual strings into tokens. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings columns of tokens. + * @param input Strings column to tokenize + * @param delimiters Strings used to separate individual strings into tokens + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings columns of tokens */ std::unique_ptr tokenize( - cudf::strings_column_view const& strings, + cudf::strings_column_view const& input, cudf::strings_column_view const& delimiters, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -112,15 +116,17 @@ std::unique_ptr tokenize( * All null row entries are ignored and the output contains all valid rows. * The number of tokens for a null element is set to 0 in the output column. * - * @param strings Strings column to use for this operation - * @param delimiter Strings used to separate each string into tokens; + * @param input Strings column to count tokens + * @param delimiter Strings used to separate each string into tokens. * The default of empty string will separate tokens using whitespace. 
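nvtext::tokenize follows the same pattern, with the stream parameter inserted between the delimiter and the memory resource. A minimal sketch:

#include <nvtext/tokenize.hpp>

#include <cudf/utilities/default_stream.hpp>
#include <cudf_test/column_wrapper.hpp>

void tokenize_on_stream()
{
  cudf::test::strings_column_wrapper input({"the fox jumped", "over the dog"});

  // Default whitespace delimiter, explicit stream, default memory resource.
  auto tokens = nvtext::tokenize(cudf::strings_column_view{input},
                                 cudf::string_scalar{""},
                                 cudf::get_default_stream());
}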
+ * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New column of token counts */ std::unique_ptr count_tokens( - cudf::strings_column_view const& strings, + cudf::strings_column_view const& input, cudf::string_scalar const& delimiter = cudf::string_scalar{""}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -143,14 +149,16 @@ std::unique_ptr count_tokens( * * @throw cudf::logic_error if the delimiters column is empty or contains nulls * - * @param strings Strings column to use for this operation + * @param input Strings column to count tokens * @param delimiters Strings used to separate each string into tokens + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New column of token counts */ std::unique_ptr count_tokens( - cudf::strings_column_view const& strings, + cudf::strings_column_view const& input, cudf::strings_column_view const& delimiters, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -168,12 +176,14 @@ std::unique_ptr count_tokens( * * All null row entries are ignored and the output contains all valid rows. * - * @param strings Strings column to tokenize. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings columns of tokens. + * @param input Strings column to tokenize + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings columns of tokens */ std::unique_ptr character_tokenize( - cudf::strings_column_view const& strings, + cudf::strings_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -203,16 +213,18 @@ std::unique_ptr character_tokenize( * @throw cudf::logic_error if `row_indices.size() != strings.size()` * @throw cudf::logic_error if `row_indices` contains nulls * - * @param strings Strings column to detokenize. - * @param row_indices The relative output row index assigned for each token in the input column. - * @param separator String to append after concatenating each token to the proper output row. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings columns of tokens. 
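count_tokens and character_tokenize are updated the same way. A short sketch of both, again using a test wrapper for the input column:

#include <nvtext/tokenize.hpp>

#include <cudf/utilities/default_stream.hpp>
#include <cudf_test/column_wrapper.hpp>

void token_counts()
{
  cudf::test::strings_column_wrapper input({"the fox jumped", "over the dog"});
  auto view = cudf::strings_column_view{input};

  // Number of whitespace-separated tokens per row: [3, 3]
  auto counts = nvtext::count_tokens(view, cudf::string_scalar{""}, cudf::get_default_stream());

  // One output row per character of every valid input row.
  auto chars = nvtext::character_tokenize(view, cudf::get_default_stream());
}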
+ * @param input Strings column to detokenize + * @param row_indices The relative output row index assigned for each token in the input column + * @param separator String to append after concatenating each token to the proper output row + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings columns of tokens */ std::unique_ptr detokenize( - cudf::strings_column_view const& strings, + cudf::strings_column_view const& input, cudf::column_view const& row_indices, cudf::string_scalar const& separator = cudf::string_scalar(" "), + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** diff --git a/cpp/libcudf_kafka/CMakeLists.txt b/cpp/libcudf_kafka/CMakeLists.txt index 33bd04fffb3..4128afa3935 100644 --- a/cpp/libcudf_kafka/CMakeLists.txt +++ b/cpp/libcudf_kafka/CMakeLists.txt @@ -21,8 +21,8 @@ include(rapids-export) include(rapids-find) project( - CUDA_KAFKA - VERSION 23.10.00 + CUDF_KAFKA + VERSION 23.12.00 LANGUAGES CXX ) @@ -64,7 +64,7 @@ add_library(cudf_kafka SHARED src/kafka_consumer.cpp src/kafka_callback.cpp) # ################################################################################################## # * include paths --------------------------------------------------------------------------------- target_include_directories( - cudf_kafka PUBLIC "$" + cudf_kafka PUBLIC "$" "$" ) @@ -85,6 +85,8 @@ set_target_properties( CXX_STANDARD_REQUIRED ON ) +add_library(cudf_kafka::cudf_kafka ALIAS cudf_kafka) + # ################################################################################################## # * cudf_kafka Install ---------------------------------------------------------------------------- rapids_cmake_install_lib_dir(lib_dir) @@ -94,7 +96,7 @@ install( EXPORT cudf_kafka-exports ) -install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include DESTINATION include) +install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) rapids_export( INSTALL cudf_kafka diff --git a/cpp/libcudf_kafka/cmake/thirdparty/get_cudf.cmake b/cpp/libcudf_kafka/cmake/thirdparty/get_cudf.cmake index aa4c5b60e7a..20aa9873f43 100644 --- a/cpp/libcudf_kafka/cmake/thirdparty/get_cudf.cmake +++ b/cpp/libcudf_kafka/cmake/thirdparty/get_cudf.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. 
You may obtain a copy of the License at @@ -35,21 +35,21 @@ function(find_and_configure_cudf VERSION) endif() endfunction() -set(CUDA_KAFKA_MIN_VERSION_cudf - "${CUDA_KAFKA_VERSION_MAJOR}.${CUDA_KAFKA_VERSION_MINOR}.${CUDA_KAFKA_VERSION_PATCH}" +set(CUDF_KAFKA_MIN_VERSION + "${CUDF_KAFKA_VERSION_MAJOR}.${CUDF_KAFKA_VERSION_MINOR}.${CUDF_KAFKA_VERSION_PATCH}" ) -find_and_configure_cudf(${CUDA_KAFKA_MIN_VERSION_cudf}) +find_and_configure_cudf(${CUDF_KAFKA_MIN_VERSION}) if(cudf_REQUIRES_CUDA) - rapids_cuda_init_architectures(CUDA_KAFKA) + rapids_cuda_init_architectures(CUDF_KAFKA) # Since we are building cudf as part of ourselves we need to enable the CUDA language in the # top-most scope enable_language(CUDA) - # Since CUDA_KAFKA only enables CUDA optionally we need to manually include the file that + # Since CUDF_KAFKA only enables CUDA optionally we need to manually include the file that # rapids_cuda_init_architectures relies on `project` calling - if(DEFINED CMAKE_PROJECT_CUDA_KAFKA_INCLUDE) - include("${CMAKE_PROJECT_CUDA_KAFKA_INCLUDE}") + if(DEFINED CMAKE_PROJECT_CUDF_KAFKA_INCLUDE) + include("${CMAKE_PROJECT_CUDF_KAFKA_INCLUDE}") endif() endif() diff --git a/cpp/libcudf_kafka/tests/CMakeLists.txt b/cpp/libcudf_kafka/tests/CMakeLists.txt index 68a5327b455..b819cb6fc3b 100644 --- a/cpp/libcudf_kafka/tests/CMakeLists.txt +++ b/cpp/libcudf_kafka/tests/CMakeLists.txt @@ -26,7 +26,7 @@ function(ConfigureTest test_name) add_executable(${test_name} ${ARGN}) set_target_properties( ${test_name} - PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$" + PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$" INSTALL_RPATH "\$ORIGIN/../../../lib" ) target_link_libraries( diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp index ef07de8c461..53b04c4ca80 100644 --- a/cpp/src/binaryop/binaryop.cpp +++ b/cpp/src/binaryop/binaryop.cpp @@ -366,7 +366,7 @@ std::unique_ptr binary_operation(column_view const& lhs, CUDF_EXPECTS((lhs.size() == rhs.size()), "Column sizes don't match"); - auto [new_mask, null_count] = bitmask_and(table_view({lhs, rhs}), stream, mr); + auto [new_mask, null_count] = cudf::detail::bitmask_and(table_view({lhs, rhs}), stream, mr); auto out = make_fixed_width_column(output_type, lhs.size(), std::move(new_mask), null_count, stream, mr); @@ -405,38 +405,42 @@ std::unique_ptr binary_operation(scalar const& lhs, column_view const& rhs, binary_operator op, data_type output_type, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::binary_operation(lhs, rhs, op, output_type, cudf::get_default_stream(), mr); + return detail::binary_operation(lhs, rhs, op, output_type, stream, mr); } std::unique_ptr binary_operation(column_view const& lhs, scalar const& rhs, binary_operator op, data_type output_type, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::binary_operation(lhs, rhs, op, output_type, cudf::get_default_stream(), mr); + return detail::binary_operation(lhs, rhs, op, output_type, stream, mr); } std::unique_ptr binary_operation(column_view const& lhs, column_view const& rhs, binary_operator op, data_type output_type, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::binary_operation(lhs, rhs, op, output_type, cudf::get_default_stream(), mr); + return detail::binary_operation(lhs, rhs, op, output_type, stream, mr); } std::unique_ptr binary_operation(column_view const& lhs, column_view const& rhs, std::string const& ptx, 
data_type output_type, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::binary_operation(lhs, rhs, ptx, output_type, cudf::get_default_stream(), mr); + return detail::binary_operation(lhs, rhs, ptx, output_type, stream, mr); } } // namespace cudf diff --git a/cpp/src/binaryop/compiled/binary_ops.cu b/cpp/src/binaryop/compiled/binary_ops.cu index 1f7f342632a..85ab5c6d6cb 100644 --- a/cpp/src/binaryop/compiled/binary_ops.cu +++ b/cpp/src/binaryop/compiled/binary_ops.cu @@ -47,14 +47,16 @@ namespace { struct scalar_as_column_view { using return_type = typename std::pair>; template ())> - return_type operator()(scalar const& s, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) + return_type operator()(scalar const& s, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource*) { auto& h_scalar_type_view = static_cast&>(const_cast(s)); auto col_v = column_view(s.type(), 1, h_scalar_type_view.data(), reinterpret_cast(s.validity_data()), - !s.is_valid()); + !s.is_valid(stream)); return std::pair{col_v, std::unique_ptr(nullptr)}; } template ())> diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu index 5a0d3e4f120..1a1cbb17d15 100644 --- a/cpp/src/bitmask/null_mask.cu +++ b/cpp/src/bitmask/null_mask.cu @@ -157,16 +157,21 @@ void set_null_mask(bitmask_type* bitmask, // Create a device_buffer for a null mask rmm::device_buffer create_null_mask(size_type size, mask_state state, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return detail::create_null_mask(size, state, cudf::get_default_stream(), mr); + return detail::create_null_mask(size, state, stream, mr); } // Set pre-allocated null mask of given bit range [begin_bit, end_bit) to valid, if valid==true, // or null, otherwise; -void set_null_mask(bitmask_type* bitmask, size_type begin_bit, size_type end_bit, bool valid) +void set_null_mask(bitmask_type* bitmask, + size_type begin_bit, + size_type end_bit, + bool valid, + rmm::cuda_stream_view stream) { - return detail::set_null_mask(bitmask, begin_bit, end_bit, valid, cudf::get_default_stream()); + return detail::set_null_mask(bitmask, begin_bit, end_bit, valid, stream); } namespace detail { @@ -505,39 +510,67 @@ std::pair bitmask_or(table_view const& view, return std::pair(std::move(null_mask), 0); } +void set_all_valid_null_masks(column_view const& input, + column& output, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + if (input.nullable()) { + auto mask = detail::create_null_mask(output.size(), mask_state::ALL_VALID, stream, mr); + output.set_null_mask(std::move(mask), 0); + + for (size_type i = 0; i < input.num_children(); ++i) { + set_all_valid_null_masks(input.child(i), output.child(i), stream, mr); + } + } +} + } // namespace detail // Create a bitmask from a specific range rmm::device_buffer copy_bitmask(bitmask_type const* mask, size_type begin_bit, size_type end_bit, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return detail::copy_bitmask(mask, begin_bit, end_bit, cudf::get_default_stream(), mr); + CUDF_FUNC_RANGE(); + return detail::copy_bitmask(mask, begin_bit, end_bit, stream, mr); } // Create a bitmask from a column view -rmm::device_buffer copy_bitmask(column_view const& view, rmm::mr::device_memory_resource* mr) +rmm::device_buffer copy_bitmask(column_view const& view, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - return detail::copy_bitmask(view, cudf::get_default_stream(), mr); 
+ CUDF_FUNC_RANGE(); + return detail::copy_bitmask(view, stream, mr); } std::pair bitmask_and(table_view const& view, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return detail::bitmask_and(view, cudf::get_default_stream(), mr); + CUDF_FUNC_RANGE(); + return detail::bitmask_and(view, stream, mr); } std::pair bitmask_or(table_view const& view, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return detail::bitmask_or(view, cudf::get_default_stream(), mr); + CUDF_FUNC_RANGE(); + return detail::bitmask_or(view, stream, mr); } // Count non-zero bits in the specified range -cudf::size_type null_count(bitmask_type const* bitmask, size_type start, size_type stop) +cudf::size_type null_count(bitmask_type const* bitmask, + size_type start, + size_type stop, + rmm::cuda_stream_view stream) { - return detail::null_count(bitmask, start, stop, cudf::get_default_stream()); + CUDF_FUNC_RANGE(); + return detail::null_count(bitmask, start, stop, stream); } } // namespace cudf diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index d08c3025553..9b9e780965a 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -563,7 +563,7 @@ rmm::device_buffer concatenate_masks(host_span views, }); rmm::device_buffer null_mask = - create_null_mask(total_element_count, mask_state::UNINITIALIZED, mr); + cudf::detail::create_null_mask(total_element_count, mask_state::UNINITIALIZED, stream, mr); detail::concatenate_masks(views, static_cast(null_mask.data()), stream); diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu index 11c27fc86e3..879ddb5048e 100644 --- a/cpp/src/copying/scatter.cu +++ b/cpp/src/copying/scatter.cu @@ -268,8 +268,9 @@ struct column_scalar_scatterer_impl { // Compute null mask rmm::device_buffer null_mask = - target.nullable() ? copy_bitmask(target, stream, mr) - : create_null_mask(target.size(), mask_state::UNALLOCATED, stream, mr); + target.nullable() + ? detail::copy_bitmask(target, stream, mr) + : detail::create_null_mask(target.size(), mask_state::UNALLOCATED, stream, mr); column null_mask_stub(data_type{type_id::STRUCT}, target.size(), rmm::device_buffer{}, diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 506832881a9..195c8924c9a 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -410,7 +410,8 @@ void sparse_to_dense_results(table_view const& keys, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto row_bitmask = bitmask_and(keys, stream, rmm::mr::get_current_device_resource()).first; + auto row_bitmask = + cudf::detail::bitmask_and(keys, stream, rmm::mr::get_current_device_resource()).first; bool skip_key_rows_with_nulls = keys_have_nulls and include_null_keys == null_policy::EXCLUDE; bitmask_type const* row_bitmask_ptr = skip_key_rows_with_nulls ? 
static_cast(row_bitmask.data()) : nullptr; diff --git a/cpp/src/groupby/sort/group_quantiles.cu b/cpp/src/groupby/sort/group_quantiles.cu index a9edcfecbf7..a456d4b5964 100644 --- a/cpp/src/groupby/sort/group_quantiles.cu +++ b/cpp/src/groupby/sort/group_quantiles.cu @@ -49,6 +49,7 @@ struct calculate_quantile_fn { double const* d_quantiles; size_type num_quantiles; interpolation interpolation; + size_type* null_count; __device__ void operator()(size_type i) { @@ -68,11 +69,13 @@ struct calculate_quantile_fn { thrust::for_each_n(thrust::seq, thrust::make_counting_iterator(0), num_quantiles, - [d_result = d_result, segment_size, offset](size_type j) { - if (segment_size == 0) + [d_result = d_result, segment_size, offset, this](size_type j) { + if (segment_size == 0) { d_result.set_null(offset + j); - else + atomicAdd(this->null_count, 1); + } else { d_result.set_valid(offset + j); + } }); } }; @@ -104,6 +107,7 @@ struct quantiles_functor { auto values_view = column_device_view::create(values, stream); auto group_size_view = column_device_view::create(group_sizes, stream); auto result_view = mutable_column_device_view::create(result->mutable_view(), stream); + auto null_count = rmm::device_scalar(0, stream, mr); // For each group, calculate quantile if (!cudf::is_dictionary(values.type())) { @@ -118,7 +122,8 @@ struct quantiles_functor { group_offsets.data(), quantile.data(), static_cast(quantile.size()), - interpolation}); + interpolation, + null_count.data()}); } else { auto values_iter = cudf::dictionary::detail::make_dictionary_iterator(*values_view); thrust::for_each_n(rmm::exec_policy(stream), @@ -131,9 +136,11 @@ struct quantiles_functor { group_offsets.data(), quantile.data(), static_cast(quantile.size()), - interpolation}); + interpolation, + null_count.data()}); } + result->set_null_count(null_count.value(stream)); return result; } diff --git a/cpp/src/hash/unordered_multiset.cuh b/cpp/src/hash/unordered_multiset.cuh deleted file mode 100644 index 183042fc0f4..00000000000 --- a/cpp/src/hash/unordered_multiset.cuh +++ /dev/null @@ -1,159 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include - -#include - -namespace cudf { -namespace detail { -/* - * Device view of the unordered multiset - */ -template , - typename Equality = equal_to> -class unordered_multiset_device_view { - public: - unordered_multiset_device_view(size_type hash_size, - size_type const* hash_begin, - Element const* hash_data) - : hash_size{hash_size}, hash_begin{hash_begin}, hash_data{hash_data}, hasher(), equals() - { - } - - bool __device__ contains(Element e) const - { - size_type loc = hasher(e) % (2 * hash_size); - - for (size_type i = hash_begin[loc]; i < hash_begin[loc + 1]; ++i) { - if (equals(hash_data[i], e)) return true; - } - - return false; - } - - private: - Hasher hasher; - Equality equals; - size_type hash_size; - size_type const* hash_begin; - Element const* hash_data; -}; - -/* - * Fixed size set on a device. - */ -template , - typename Equality = equal_to> -class unordered_multiset { - public: - /** - * @brief Factory to construct a new unordered_multiset - */ - static unordered_multiset create(column_view const& col, rmm::cuda_stream_view stream) - { - auto d_column = column_device_view::create(col, stream); - auto d_col = *d_column; - - auto hash_bins_start = cudf::detail::make_zeroed_device_uvector_async( - 2 * d_col.size() + 1, stream, rmm::mr::get_current_device_resource()); - auto hash_bins_end = cudf::detail::make_zeroed_device_uvector_async( - 2 * d_col.size() + 1, stream, rmm::mr::get_current_device_resource()); - auto hash_data = rmm::device_uvector(d_col.size(), stream); - - Hasher hasher; - size_type* d_hash_bins_start = hash_bins_start.data(); - size_type* d_hash_bins_end = hash_bins_end.data(); - Element* d_hash_data = hash_data.data(); - - thrust::for_each( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(col.size()), - [d_hash_bins_start, d_col, hasher] __device__(size_t idx) { - if (!d_col.is_null(idx)) { - Element e = d_col.element(idx); - size_type tmp = hasher(e) % (2 * d_col.size()); - cuda::atomic_ref ref{*(d_hash_bins_start + tmp)}; - ref.fetch_add(1, cuda::std::memory_order_relaxed); - } - }); - - thrust::exclusive_scan(rmm::exec_policy(stream), - hash_bins_start.begin(), - hash_bins_start.end(), - hash_bins_end.begin()); - - thrust::copy(rmm::exec_policy(stream), - hash_bins_end.begin(), - hash_bins_end.end(), - hash_bins_start.begin()); - - thrust::for_each( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(col.size()), - [d_hash_bins_end, d_hash_data, d_col, hasher] __device__(size_t idx) { - if (!d_col.is_null(idx)) { - Element e = d_col.element(idx); - size_type tmp = hasher(e) % (2 * d_col.size()); - cuda::atomic_ref ref{*(d_hash_bins_end + tmp)}; - size_type offset = ref.fetch_add(1, cuda::std::memory_order_relaxed); - d_hash_data[offset] = e; - } - }); - - return unordered_multiset(d_col.size(), std::move(hash_bins_start), std::move(hash_data)); - } - - unordered_multiset_device_view to_device() const - { - return unordered_multiset_device_view( - size, hash_bins.data(), hash_data.data()); - } - - private: - unordered_multiset(size_type size, - rmm::device_uvector&& hash_bins, - rmm::device_uvector&& hash_data) - : size{size}, hash_bins{std::move(hash_bins)}, hash_data{std::move(hash_data)} - { - } - - size_type size; - rmm::device_uvector hash_bins; - rmm::device_uvector hash_data; -}; - -} // namespace detail -} // 
namespace cudf diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 0cd750bc947..3a9fe50d25b 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -197,7 +197,9 @@ std::shared_ptr dispatch_to_arrow::operator()( arrow::MemoryPool* ar_mr, rmm::cuda_stream_view stream) { - return unsupported_decimals_to_arrow(input, 9, ar_mr, stream); + using DeviceType = int32_t; + return unsupported_decimals_to_arrow( + input, cudf::detail::max_precision(), ar_mr, stream); } template <> @@ -208,7 +210,9 @@ std::shared_ptr dispatch_to_arrow::operator()( arrow::MemoryPool* ar_mr, rmm::cuda_stream_view stream) { - return unsupported_decimals_to_arrow(input, 18, ar_mr, stream); + using DeviceType = int64_t; + return unsupported_decimals_to_arrow( + input, cudf::detail::max_precision(), ar_mr, stream); } template <> @@ -219,7 +223,8 @@ std::shared_ptr dispatch_to_arrow::operator() arrow::MemoryPool* ar_mr, rmm::cuda_stream_view stream) { - using DeviceType = __int128_t; + using DeviceType = __int128_t; + auto const max_precision = cudf::detail::max_precision(); rmm::device_uvector buf(input.size(), stream); @@ -234,7 +239,7 @@ std::shared_ptr dispatch_to_arrow::operator() CUDF_CUDA_TRY(cudaMemcpyAsync( data_buffer->mutable_data(), buf.data(), buf_size_in_bytes, cudaMemcpyDefault, stream.value())); - auto type = arrow::decimal(18, -input.type().scale()); + auto type = arrow::decimal(max_precision, -input.type().scale()); auto mask = fetch_mask_buffer(input, ar_mr, stream); auto buffers = std::vector>{mask, std::move(data_buffer)}; auto data = std::make_shared(type, input.size(), buffers); @@ -377,10 +382,10 @@ std::shared_ptr dispatch_to_arrow::operator()( { // Arrow dictionary requires indices to be signed integer std::unique_ptr dict_indices = - cast(cudf::dictionary_column_view(input).get_indices_annotated(), - cudf::data_type{type_id::INT32}, - stream, - rmm::mr::get_current_device_resource()); + detail::cast(cudf::dictionary_column_view(input).get_indices_annotated(), + cudf::data_type{type_id::INT32}, + stream, + rmm::mr::get_current_device_resource()); auto indices = dispatch_to_arrow{}.operator()( dict_indices->view(), dict_indices->type().id(), {}, ar_mr, stream); auto dict_keys = cudf::dictionary_column_view(input).keys(); diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index 8c586306ad5..6e9c634804c 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -146,6 +146,12 @@ struct column_to_strings_fn { { } + ~column_to_strings_fn() = default; + column_to_strings_fn(column_to_strings_fn const&) = delete; + column_to_strings_fn& operator=(column_to_strings_fn const&) = delete; + column_to_strings_fn(column_to_strings_fn&&) = delete; + column_to_strings_fn& operator=(column_to_strings_fn&&) = delete; + // Note: `null` replacement with `na_rep` deferred to `concatenate()` // instead of column-wise; might be faster // @@ -160,8 +166,9 @@ struct column_to_strings_fn { std::enable_if_t, std::unique_ptr> operator()( column_view const& column) const { - return cudf::strings::detail::from_booleans( - column, options_.get_true_value(), options_.get_false_value(), stream_, mr_); + string_scalar true_string{options_.get_true_value(), true, stream_}; + string_scalar false_string{options_.get_false_value(), true, stream_}; + return cudf::strings::detail::from_booleans(column, true_string, false_string, stream_, mr_); } // strings: @@ -367,10 +374,10 @@ void write_chunked(data_sink* out_sink, 
CUDF_EXPECTS(str_column_view.size() > 0, "Unexpected empty strings column."); - cudf::string_scalar newline{options.get_line_terminator()}; + cudf::string_scalar newline{options.get_line_terminator(), true, stream}; auto p_str_col_w_nl = cudf::strings::detail::join_strings(str_column_view, newline, - string_scalar("", false), + string_scalar{"", false, stream}, stream, rmm::mr::get_current_device_resource()); strings_column_view strings_column{p_str_col_w_nl->view()}; @@ -455,12 +462,14 @@ void write_csv(data_sink* out_sink, // populate vector of string-converted columns: // - std::transform(sub_view.begin(), - sub_view.end(), - std::back_inserter(str_column_vec), - [converter](auto const& current_col) { - return cudf::type_dispatcher(current_col.type(), converter, current_col); - }); + std::transform( + sub_view.begin(), + sub_view.end(), + std::back_inserter(str_column_vec), + [&converter = std::as_const(converter)](auto const& current_col) { + return cudf::type_dispatcher( + current_col.type(), converter, current_col); + }); // create string table view from str_column_vec: // @@ -470,18 +479,19 @@ void write_csv(data_sink* out_sink, // concatenate columns in each row into one big string column // (using null representation and delimiter): // - std::string delimiter_str{options.get_inter_column_delimiter()}; auto str_concat_col = [&] { + cudf::string_scalar delimiter_str{ + std::string{options.get_inter_column_delimiter()}, true, stream}; + cudf::string_scalar options_narep{options.get_na_rep(), true, stream}; if (str_table_view.num_columns() > 1) return cudf::strings::detail::concatenate(str_table_view, delimiter_str, - options.get_na_rep(), + options_narep, strings::separator_on_nulls::YES, stream, rmm::mr::get_current_device_resource()); - cudf::string_scalar narep{options.get_na_rep()}; return cudf::strings::detail::replace_nulls( - str_table_view.column(0), narep, stream, rmm::mr::get_current_device_resource()); + str_table_view.column(0), options_narep, stream, rmm::mr::get_current_device_resource()); }(); write_chunked(out_sink, str_concat_col->view(), options, stream, mr); diff --git a/cpp/src/io/fst/logical_stack.cuh b/cpp/src/io/fst/logical_stack.cuh index c4f99736306..22385d33c7b 100644 --- a/cpp/src/io/fst/logical_stack.cuh +++ b/cpp/src/io/fst/logical_stack.cuh @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -48,6 +49,14 @@ enum class stack_op_type : int8_t { RESET = 3 ///< Operation popping all items currently on the stack }; +/** + * @brief Describes the kind of stack operations supported by the logical stack. + */ +enum class stack_op_support : bool { + NO_RESET_SUPPORT = false, ///< A stack that only supports push(x) and pop() operations + WITH_RESET_SUPPORT = true ///< A stack that supports push(x), pop(), and reset() operations +}; + namespace detail { /** @@ -130,6 +139,37 @@ struct StackSymbolToStackOp { StackSymbolToStackOpTypeT symbol_to_stack_op_type; }; +/** + * @brief Function object that maps a stack `reset` operation to `1`. + */ +template +struct NewlineToResetStackSegmentOp { + template + constexpr CUDF_HOST_DEVICE uint32_t operator()(StackSymbolT const& stack_symbol) const + { + stack_op_type stack_op = symbol_to_stack_op_type(stack_symbol); + + // Every reset operation marks the beginning of a new segment + return (stack_op == stack_op_type::RESET) ? 
1 : 0; + } + + /// Function object returning a stack operation type for a given stack symbol + StackSymbolToStackOpTypeT symbol_to_stack_op_type; +}; + +/** + * @brief Function object that wraps around for values that exceed the largest value of `TargetT` + */ +template +struct ModToTargetTypeOpT { + template + constexpr CUDF_HOST_DEVICE TargetT operator()(T const& val) const + { + return static_cast( + val % (static_cast(cuda::std::numeric_limits::max()) + static_cast(1))); + } +}; + /** * @brief Binary reduction operator to compute the absolute stack level from relative stack levels * (i.e., +1 for a PUSH, -1 for a POP operation). @@ -140,9 +180,7 @@ struct AddStackLevelFromStackOp { constexpr CUDF_HOST_DEVICE StackOp operator()( StackOp const& lhs, StackOp const& rhs) const { - StackLevelT new_level = (symbol_to_stack_op_type(rhs.value) == stack_op_type::RESET) - ? 0 - : (lhs.stack_level + rhs.stack_level); + StackLevelT new_level = lhs.stack_level + rhs.stack_level; return StackOp{new_level, rhs.value}; } @@ -230,6 +268,8 @@ struct RemapEmptyStack { * onto the stack or pop something from the stack and resolves the symbol that is on top of the * stack. * + * @tparam SupportResetOperation Whether the logical stack also supports `reset` operations that + * reset the stack to the empty stack * @tparam StackLevelT Signed integer type that must be sufficient to cover [-max_stack_level, * max_stack_level] for the given sequence of stack operations. Must be signed as it needs to cover * the stack level of any arbitrary subsequence of stack operations. @@ -261,7 +301,8 @@ struct RemapEmptyStack { * what-is-on-top-of-the-stack * @param[in] stream The cuda stream to which to dispatch the work */ -template ; + // Type used to mark *-by-key segments after `reset` operations + using StackSegmentT = uint8_t; + // The unsigned integer type that we use for radix sorting items of type StackOpT using StackOpUnsignedT = detail::UnsignedStackOpType; static_assert(!std::is_void(), "unsupported StackOpT size"); @@ -292,6 +336,8 @@ void sparse_stack_op_to_top_of_stack(StackSymbolItT d_symbols, using TransformInputItT = cub::TransformInputIterator; + constexpr bool supports_reset_op = SupportResetOperation == stack_op_support::WITH_RESET_SUPPORT; + auto const num_symbols_in = d_symbol_positions.size(); // Converting a stack symbol that may either push or pop to a stack operation: @@ -330,14 +376,44 @@ void sparse_stack_op_to_top_of_stack(StackSymbolItT d_symbols, // Getting temporary storage requirements for the prefix sum of the stack level after each // operation - CUDF_CUDA_TRY(cub::DeviceScan::InclusiveScan( - nullptr, - stack_level_scan_bytes, - stack_symbols_in, - d_kv_operations.Current(), - detail::AddStackLevelFromStackOp{symbol_to_stack_op}, - num_symbols_in, - stream)); + if constexpr (supports_reset_op) { + // Iterator that returns `1` for every symbol that corresponds to a `reset` operation + auto reset_segments_it = thrust::make_transform_iterator( + d_symbols, + detail::NewlineToResetStackSegmentOp{symbol_to_stack_op}); + + auto const fake_key_segment_it = static_cast(nullptr); + std::size_t gen_segments_scan_bytes = 0; + std::size_t scan_by_key_bytes = 0; + CUDF_CUDA_TRY(cub::DeviceScan::InclusiveSum( + nullptr, + gen_segments_scan_bytes, + reset_segments_it, + thrust::make_transform_output_iterator(fake_key_segment_it, + detail::ModToTargetTypeOpT{}), + num_symbols_in, + stream)); + CUDF_CUDA_TRY(cub::DeviceScan::InclusiveScanByKey( + nullptr, + scan_by_key_bytes, + fake_key_segment_it, + 
stack_symbols_in, + d_kv_operations.Current(), + detail::AddStackLevelFromStackOp{symbol_to_stack_op}, + num_symbols_in, + cub::Equality{}, + stream)); + stack_level_scan_bytes = std::max(gen_segments_scan_bytes, scan_by_key_bytes); + } else { + CUDF_CUDA_TRY(cub::DeviceScan::InclusiveScan( + nullptr, + stack_level_scan_bytes, + stack_symbols_in, + d_kv_operations.Current(), + detail::AddStackLevelFromStackOp{symbol_to_stack_op}, + num_symbols_in, + stream)); + } // Getting temporary storage requirements for the stable radix sort (sorting by stack level of the // operations) @@ -401,14 +477,41 @@ void sparse_stack_op_to_top_of_stack(StackSymbolItT d_symbols, d_kv_operations = cub::DoubleBuffer{d_kv_ops_current.data(), d_kv_ops_alt.data()}; // Compute prefix sum of the stack level after each operation - CUDF_CUDA_TRY(cub::DeviceScan::InclusiveScan( - temp_storage.data(), - total_temp_storage_bytes, - stack_symbols_in, - d_kv_operations.Current(), - detail::AddStackLevelFromStackOp{symbol_to_stack_op}, - num_symbols_in, - stream)); + if constexpr (supports_reset_op) { + // Iterator that returns `1` for every symbol that corresponds to a `reset` operation + auto reset_segments_it = thrust::make_transform_iterator( + d_symbols, + detail::NewlineToResetStackSegmentOp{symbol_to_stack_op}); + + rmm::device_uvector key_segments{num_symbols_in, stream}; + CUDF_CUDA_TRY(cub::DeviceScan::InclusiveSum( + temp_storage.data(), + total_temp_storage_bytes, + reset_segments_it, + thrust::make_transform_output_iterator(key_segments.data(), + detail::ModToTargetTypeOpT{}), + num_symbols_in, + stream)); + CUDF_CUDA_TRY(cub::DeviceScan::InclusiveScanByKey( + temp_storage.data(), + total_temp_storage_bytes, + key_segments.data(), + stack_symbols_in, + d_kv_operations.Current(), + detail::AddStackLevelFromStackOp{symbol_to_stack_op}, + num_symbols_in, + cub::Equality{}, + stream)); + } else { + CUDF_CUDA_TRY(cub::DeviceScan::InclusiveScan( + temp_storage.data(), + total_temp_storage_bytes, + stack_symbols_in, + d_kv_operations.Current(), + detail::AddStackLevelFromStackOp{symbol_to_stack_op}, + num_symbols_in, + stream)); + } // Stable radix sort, sorting by stack level of the operations d_kv_operations_unsigned = cub::DoubleBuffer{ diff --git a/cpp/src/io/fst/lookup_tables.cuh b/cpp/src/io/fst/lookup_tables.cuh index 37c99453361..42036b79751 100644 --- a/cpp/src/io/fst/lookup_tables.cuh +++ b/cpp/src/io/fst/lookup_tables.cuh @@ -753,7 +753,7 @@ class TranslationOp { RelativeOffsetT const relative_offset, SymbolT const read_symbol) const { - return translation_op(*this, state_id, match_id, relative_offset, read_symbol); + return translation_op(state_id, match_id, relative_offset, read_symbol); } template @@ -761,7 +761,7 @@ class TranslationOp { SymbolIndexT const match_id, SymbolT const read_symbol) const { - return translation_op(*this, state_id, match_id, read_symbol); + return translation_op(state_id, match_id, read_symbol); } }; diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 392a7850886..964e40e36cd 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -41,9 +41,9 @@ namespace cudf { namespace io { // Returns builder for csv_reader_options -csv_reader_options_builder csv_reader_options::builder(source_info const& src) +csv_reader_options_builder csv_reader_options::builder(source_info src) { - return csv_reader_options_builder{src}; + return csv_reader_options_builder{std::move(src)}; } // Returns builder for csv_writer_options @@ -54,9 +54,9 @@ 
csv_writer_options_builder csv_writer_options::builder(sink_info const& sink, } // Returns builder for orc_reader_options -orc_reader_options_builder orc_reader_options::builder(source_info const& src) +orc_reader_options_builder orc_reader_options::builder(source_info src) { - return orc_reader_options_builder{src}; + return orc_reader_options_builder{std::move(src)}; } // Returns builder for orc_writer_options @@ -73,15 +73,15 @@ chunked_orc_writer_options_builder chunked_orc_writer_options::builder(sink_info } // Returns builder for avro_reader_options -avro_reader_options_builder avro_reader_options::builder(source_info const& src) +avro_reader_options_builder avro_reader_options::builder(source_info src) { - return avro_reader_options_builder(src); + return avro_reader_options_builder(std::move(src)); } // Returns builder for json_reader_options -json_reader_options_builder json_reader_options::builder(source_info const& src) +json_reader_options_builder json_reader_options::builder(source_info src) { - return json_reader_options_builder(src); + return json_reader_options_builder(std::move(src)); } // Returns builder for orc_writer_options @@ -92,9 +92,9 @@ json_writer_options_builder json_writer_options::builder(sink_info const& sink, } // Returns builder for parquet_reader_options -parquet_reader_options_builder parquet_reader_options::builder(source_info const& src) +parquet_reader_options_builder parquet_reader_options::builder(source_info src) { - return parquet_reader_options_builder{src}; + return parquet_reader_options_builder{std::move(src)}; } // Returns builder for parquet_writer_options @@ -200,7 +200,9 @@ compression_type infer_compression_type(compression_type compression, source_inf return compression_type::NONE; } -table_with_metadata read_json(json_reader_options options, rmm::mr::device_memory_resource* mr) +table_with_metadata read_json(json_reader_options options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); @@ -210,10 +212,12 @@ table_with_metadata read_json(json_reader_options options, rmm::mr::device_memor options.get_byte_range_offset(), options.get_byte_range_size_with_padding()); - return json::detail::read_json(datasources, options, cudf::get_default_stream(), mr); + return json::detail::read_json(datasources, options, stream, mr); } -void write_json(json_writer_options const& options, rmm::mr::device_memory_resource* mr) +void write_json(json_writer_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto sinks = make_datasinks(options.get_sink()); CUDF_EXPECTS(sinks.size() == 1, "Multiple sinks not supported for JSON writing"); @@ -222,11 +226,13 @@ void write_json(json_writer_options const& options, rmm::mr::device_memory_resou sinks[0].get(), options.get_table(), options, - cudf::get_default_stream(), + stream, mr); } -table_with_metadata read_csv(csv_reader_options options, rmm::mr::device_memory_resource* mr) +table_with_metadata read_csv(csv_reader_options options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); @@ -241,12 +247,14 @@ table_with_metadata read_csv(csv_reader_options options, rmm::mr::device_memory_ return cudf::io::detail::csv::read_csv( // std::move(datasources[0]), options, - cudf::get_default_stream(), + stream, mr); } // Freeform API wraps the detail writer class API -void write_csv(csv_writer_options const& options, rmm::mr::device_memory_resource* mr) +void write_csv(csv_writer_options const& 
options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { using namespace cudf::io::detail; @@ -258,7 +266,7 @@ void write_csv(csv_writer_options const& options, rmm::mr::device_memory_resourc options.get_table(), options.get_names(), options, - cudf::get_default_stream(), + stream, mr); } @@ -470,8 +478,8 @@ void orc_chunked_writer::close() writer->close(); } -using namespace cudf::io::detail::parquet; -namespace detail_parquet = cudf::io::detail::parquet; +using namespace cudf::io::parquet::detail; +namespace detail_parquet = cudf::io::parquet::detail; table_with_metadata read_parquet(parquet_reader_options const& options, rmm::mr::device_memory_resource* mr) diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 3bbfc4b5f83..8d89f4ff927 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -20,7 +20,6 @@ #include #include #include -#include #include #include diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 06ac11485cb..496e5b25e60 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -91,6 +91,98 @@ void check_input_size(std::size_t input_size) namespace cudf::io::json { +// FST to help fixing the stack context of characters that follow the first record on each JSON line +namespace fix_stack_of_excess_chars { + +// Type used to represent the target state in the transition table +using StateT = char; + +// Type used to represent a symbol group id +using SymbolGroupT = uint8_t; + +/** + * @brief Definition of the DFA's states. + */ +enum class dfa_states : StateT { + // Before the first record on the JSON line + BEFORE, + // Within the first record on the JSON line + WITHIN, + // Excess data that follows the first record on the JSON line + EXCESS, + // Total number of states + NUM_STATES +}; + +/** + * @brief Definition of the symbol groups + */ +enum class dfa_symbol_group_id : SymbolGroupT { + ROOT, ///< Symbol for root stack context + DELIMITER, ///< Line delimiter symbol group + OTHER, ///< Symbol group that implicitly matches all other tokens + NUM_SYMBOL_GROUPS ///< Total number of symbol groups +}; + +constexpr auto TT_NUM_STATES = static_cast(dfa_states::NUM_STATES); +constexpr auto NUM_SYMBOL_GROUPS = static_cast(dfa_symbol_group_id::NUM_SYMBOL_GROUPS); + +/** + * @brief Function object to map (input_symbol,stack_context) tuples to a symbol group. + */ +struct SymbolPairToSymbolGroupId { + CUDF_HOST_DEVICE SymbolGroupT operator()(thrust::tuple symbol) const + { + auto const input_symbol = thrust::get<0>(symbol); + auto const stack_symbol = thrust::get<1>(symbol); + return static_cast( + input_symbol == '\n' + ? dfa_symbol_group_id::DELIMITER + : (stack_symbol == '_' ? dfa_symbol_group_id::ROOT : dfa_symbol_group_id::OTHER)); + } +}; + +/** + * @brief Translation function object that fixes the stack context of excess data that follows after + * the first JSON record on each line. 
+ */ +struct TransduceInputOp { + template + constexpr CUDF_HOST_DEVICE StackSymbolT operator()(StateT const state_id, + SymbolGroupT const match_id, + RelativeOffsetT const relative_offset, + SymbolT const read_symbol) const + { + if (state_id == static_cast(dfa_states::EXCESS)) { return '_'; } + return thrust::get<1>(read_symbol); + } + + template + constexpr CUDF_HOST_DEVICE int32_t operator()(StateT const state_id, + SymbolGroupT const match_id, + SymbolT const read_symbol) const + { + constexpr int32_t single_output_item = 1; + return single_output_item; + } +}; + +// Aliases for readability of the transition table +constexpr auto TT_BEFORE = dfa_states::BEFORE; +constexpr auto TT_INSIDE = dfa_states::WITHIN; +constexpr auto TT_EXCESS = dfa_states::EXCESS; + +// Transition table +std::array, TT_NUM_STATES> constexpr transition_table{ + {/* IN_STATE ROOT NEWLINE OTHER */ + /* TT_BEFORE */ {{TT_BEFORE, TT_BEFORE, TT_INSIDE}}, + /* TT_INSIDE */ {{TT_EXCESS, TT_BEFORE, TT_INSIDE}}, + /* TT_EXCESS */ {{TT_EXCESS, TT_BEFORE, TT_EXCESS}}}}; + +// The DFA's starting state +constexpr auto start_state = static_cast(dfa_states::BEFORE); +} // namespace fix_stack_of_excess_chars + // FST to prune tokens of invalid lines for recovering JSON lines format namespace token_filter { @@ -146,9 +238,8 @@ struct UnwrapTokenFromSymbolOp { * invalid lines. */ struct TransduceToken { - template - constexpr CUDF_HOST_DEVICE SymbolT operator()(TransducerTableT const&, - StateT const state_id, + template + constexpr CUDF_HOST_DEVICE SymbolT operator()(StateT const state_id, SymbolGroupT const match_id, RelativeOffsetT const relative_offset, SymbolT const read_symbol) const @@ -165,9 +256,8 @@ struct TransduceToken { } } - template - constexpr CUDF_HOST_DEVICE int32_t operator()(TransducerTableT const&, - StateT const state_id, + template + constexpr CUDF_HOST_DEVICE int32_t operator()(StateT const state_id, SymbolGroupT const match_id, SymbolT const read_symbol) const { @@ -253,27 +343,35 @@ constexpr auto NUM_SYMBOL_GROUPS = static_cast(dfa_symbol_group_id::NU std::array const symbol_groups{ {{"{"}, {"["}, {"}"}, {"]"}, {"\""}, {"\\"}, {"\n"}}}; -// Transition table +// Transition table for the default JSON and JSON lines formats std::array, TT_NUM_STATES> const transition_table{ {/* IN_STATE { [ } ] " \ \n OTHER */ /* TT_OOS */ {{TT_OOS, TT_OOS, TT_OOS, TT_OOS, TT_STR, TT_OOS, TT_OOS, TT_OOS}}, /* TT_STR */ {{TT_STR, TT_STR, TT_STR, TT_STR, TT_OOS, TT_ESC, TT_STR, TT_STR}}, /* TT_ESC */ {{TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR}}}}; -// Translation table (i.e., for each transition, what are the symbols that we output) +// Transition table for the JSON lines format that recovers from invalid JSON lines +std::array, TT_NUM_STATES> const + resetting_transition_table{ + {/* IN_STATE { [ } ] " \ \n OTHER */ + /* TT_OOS */ {{TT_OOS, TT_OOS, TT_OOS, TT_OOS, TT_STR, TT_OOS, TT_OOS, TT_OOS}}, + /* TT_STR */ {{TT_STR, TT_STR, TT_STR, TT_STR, TT_OOS, TT_ESC, TT_OOS, TT_STR}}, + /* TT_ESC */ {{TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_OOS, TT_STR}}}}; + +// Translation table for the default JSON and JSON lines formats std::array, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const translation_table{ {/* IN_STATE { [ } ] " \ \n OTHER */ /* TT_OOS */ {{{'{'}, {'['}, {'}'}, {']'}, {}, {}, {}, {}}}, /* TT_STR */ {{{}, {}, {}, {}, {}, {}, {}, {}}}, /* TT_ESC */ {{{}, {}, {}, {}, {}, {}, {}, {}}}}}; -// Translation table +// Translation table for the JSON lines format that recovers from invalid JSON lines 
std::array, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const resetting_translation_table{ {/* IN_STATE { [ } ] " \ \n OTHER */ /* TT_OOS */ {{{'{'}, {'['}, {'}'}, {']'}, {}, {}, {'\n'}, {}}}, - /* TT_STR */ {{{}, {}, {}, {}, {}, {}, {}, {}}}, - /* TT_ESC */ {{{}, {}, {}, {}, {}, {}, {}, {}}}}}; + /* TT_STR */ {{{}, {}, {}, {}, {}, {}, {'\n'}, {}}}, + /* TT_ESC */ {{{}, {}, {}, {}, {}, {}, {'\n'}, {}}}}}; // The DFA's starting state constexpr auto start_state = static_cast(TT_OOS); @@ -643,6 +741,11 @@ auto get_transition_table(json_format_cfg_t format) // PD_ANL describes the target state after a new line after encountering error state auto const PD_ANL = (format == json_format_cfg_t::JSON_LINES_RECOVER) ? PD_BOV : PD_ERR; + // Target state after having parsed the first JSON value on a JSON line + // Spark has the special need to ignore everything that comes after the first JSON object + // on a JSON line instead of marking those as invalid + auto const PD_AFS = (format == json_format_cfg_t::JSON_LINES_RECOVER) ? PD_PVL : PD_ERR; + // First row: empty stack ("root" level of the JSON) // Second row: '[' on top of stack (we're parsing a list value) // Third row: '{' on top of stack (we're parsing a struct value) @@ -660,15 +763,15 @@ auto get_transition_table(json_format_cfg_t format) PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_BOV, PD_ERR, PD_PVL, PD_BOV, PD_LON, PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_ERR, PD_BFN, PD_ERR, PD_PVL, PD_BOV, PD_LON}; pda_tt[static_cast(pda_state_t::PD_STR)] = { - PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, - PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, - PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR}; + PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_BOV, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_BOV, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_BOV, PD_STR}; pda_tt[static_cast(pda_state_t::PD_SCE)] = { - PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, - PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, - PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR}; + PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_BOV, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_BOV, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_BOV, PD_STR}; pda_tt[static_cast(pda_state_t::PD_PVL)] = { - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_BOV, PD_ERR, + PD_AFS, PD_AFS, PD_AFS, PD_AFS, PD_AFS, PD_AFS, PD_AFS, PD_AFS, PD_PVL, PD_BOV, PD_AFS, PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_BOV, PD_ERR, PD_PVL, PD_BOV, PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_ERR, PD_BFN, PD_ERR, PD_PVL, PD_BOV, PD_ERR}; pda_tt[static_cast(pda_state_t::PD_BFN)] = { @@ -680,9 +783,9 @@ auto get_transition_table(json_format_cfg_t format) PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_ERR, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_PFN, PD_FNE, PD_FLN, PD_FLN, PD_FLN, PD_BOV, PD_FLN}; pda_tt[static_cast(pda_state_t::PD_FNE)] = { - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, 
PD_ERR, PD_ERR, - PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN}; + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_ERR, + PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_BOV, PD_FLN}; pda_tt[static_cast(pda_state_t::PD_PFN)] = { PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_ERR, @@ -697,8 +800,11 @@ auto get_transition_table(json_format_cfg_t format) /** * @brief Getting the translation table + * @param recover_from_error Whether or not the tokenizer should recover from invalid lines. If + * `recover_from_error` is true, invalid JSON lines end with the token sequence (`ErrorBegin`, + * `LineEnd`) and incomplete JSON lines (e.g., `{"a":123\n`) are treated as invalid lines. */ -auto get_translation_table(bool include_line_delimiter) +auto get_translation_table(bool recover_from_error) { constexpr auto StructBegin = token_t::StructBegin; constexpr auto StructEnd = token_t::StructEnd; @@ -715,76 +821,95 @@ auto get_translation_table(bool include_line_delimiter) constexpr auto ErrorBegin = token_t::ErrorBegin; /** - * @brief Appends token_t::LineEnd token to the given token sequence, if and only if - * `include_line_delimiter` is true. + * @brief Instead of specifying the verbose translation tables twice (i.e., once when + * `recover_from_error` is true and once when it is false), we use `nl_tokens` to specialize the + * translation table where it differs depending on the `recover_from_error` option. If and only if + * `recover_from_error` is true, `recovering_tokens` are returned along with a token_t::LineEnd + * token, otherwise `regular_tokens` is returned. + */ + auto nl_tokens = [recover_from_error](std::vector regular_tokens, + std::vector recovering_tokens) { + if (recover_from_error) { + recovering_tokens.push_back(token_t::LineEnd); + return recovering_tokens; + } + return regular_tokens; + }; + + /** + * @brief Helper function that returns `recovering_tokens` if `recover_from_error` is true and + * returns `regular_tokens` otherwise. This is used to ignore excess characters after the first + * value in the case of JSON lines that recover from invalid lines, as Spark ignores any excess + * characters that follow the first record on a JSON line.
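 *
 * As a small worked example (token values chosen for illustration): nl_tokens({ValueEnd},
 * {ErrorBegin}) yields {ValueEnd} when `recover_from_error` is false and {ErrorBegin, LineEnd}
 * when it is true, while alt_tokens({ErrorBegin}, {}) yields {ErrorBegin} in the default mode and
 * an empty token sequence in the recovering mode, which is how characters that follow the first
 * value on a line end up emitting no tokens at all.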
*/ - auto nl_tokens = [include_line_delimiter](std::vector tokens) { - if (include_line_delimiter) { tokens.push_back(token_t::LineEnd); } - return tokens; + auto alt_tokens = [recover_from_error](std::vector regular_tokens, + std::vector recovering_tokens) { + if (recover_from_error) { return recovering_tokens; } + return regular_tokens; }; std::array, NUM_PDA_SGIDS>, PD_NUM_STATES> pda_tlt; - pda_tlt[static_cast(pda_state_t::PD_BOV)] = {{ /*ROOT*/ + pda_tlt[static_cast(pda_state_t::PD_BOV)] = {{ /*ROOT*/ + {StructBegin}, // OPENING_BRACE + {ListBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {StringBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {}), // LINE_BREAK + {ValueBegin}, // OTHER + /*LIST*/ {StructBegin}, // OPENING_BRACE {ListBegin}, // OPENING_BRACKET {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET + {ListEnd}, // CLOSING_BRACKET {StringBegin}, // QUOTE {ErrorBegin}, // ESCAPE {ErrorBegin}, // COMMA {ErrorBegin}, // COLON {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {ValueBegin}, // OTHER - /*LIST*/ + nl_tokens({}, {ErrorBegin}), // LINE_BREAK + {ValueBegin}, // OTHER + /*STRUCT*/ {StructBegin}, // OPENING_BRACE {ListBegin}, // OPENING_BRACKET {ErrorBegin}, // CLOSING_BRACE - {ListEnd}, // CLOSING_BRACKET + {ErrorBegin}, // CLOSING_BRACKET {StringBegin}, // QUOTE {ErrorBegin}, // ESCAPE {ErrorBegin}, // COMMA {ErrorBegin}, // COLON {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {ValueBegin}, // OTHER - /*STRUCT*/ - {StructBegin}, // OPENING_BRACE - {ListBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {StringBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {ValueBegin}}}; // OTHER + nl_tokens({}, {ErrorBegin}), // LINE_BREAK + {ValueBegin}}}; // OTHER pda_tlt[static_cast(pda_state_t::PD_BOA)] = { - { /*ROOT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - nl_tokens({ErrorBegin}), // LINE_BREAK - {ErrorBegin}, // OTHER + { /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + nl_tokens({ErrorBegin}, {ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER /*LIST*/ - {StructBegin}, // OPENING_BRACE - {ListBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ListEnd}, // CLOSING_BRACKET - {StringBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {ValueBegin}, // OTHER + {StructBegin}, // OPENING_BRACE + {ListBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ListEnd}, // CLOSING_BRACKET + {StringBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {ErrorBegin}), // LINE_BREAK + {ValueBegin}, // OTHER /*STRUCT*/ {ErrorBegin}, // OPENING_BRACE {ErrorBegin}, // OPENING_BRACKET @@ -795,33 +920,33 @@ auto get_translation_table(bool include_line_delimiter) 
{ErrorBegin}, // COMMA {ErrorBegin}, // COLON {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK + nl_tokens({}, {ErrorBegin}), // LINE_BREAK {ErrorBegin}}}; // OTHER pda_tlt[static_cast(pda_state_t::PD_LON)] = { - { /*ROOT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ValueEnd}, // WHITE_SPACE - nl_tokens({ValueEnd}), // LINE_BREAK - {}, // OTHER + { /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ValueEnd}, // WHITE_SPACE + nl_tokens({ValueEnd}, {ErrorBegin}), // LINE_BREAK + {}, // OTHER /*LIST*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ValueEnd, ListEnd}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ValueEnd}, // COMMA - {ErrorBegin}, // COLON - {ValueEnd}, // WHITE_SPACE - nl_tokens({ValueEnd}), // LINE_BREAK - {}, // OTHER + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ValueEnd, ListEnd}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ValueEnd}, // COMMA + {ErrorBegin}, // COLON + {ValueEnd}, // WHITE_SPACE + nl_tokens({ValueEnd}, {ErrorBegin}), // LINE_BREAK + {}, // OTHER /*STRUCT*/ {ErrorBegin}, // OPENING_BRACE {ErrorBegin}, // OPENING_BRACKET @@ -832,108 +957,108 @@ auto get_translation_table(bool include_line_delimiter) {ValueEnd, StructMemberEnd}, // COMMA {ErrorBegin}, // COLON {ValueEnd}, // WHITE_SPACE - nl_tokens({ValueEnd}), // LINE_BREAK + nl_tokens({ValueEnd}, {ErrorBegin}), // LINE_BREAK {}}}; // OTHER - pda_tlt[static_cast(pda_state_t::PD_STR)] = {{ /*ROOT*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {StringEnd}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {}, // OTHER + pda_tlt[static_cast(pda_state_t::PD_STR)] = {{ /*ROOT*/ + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {StringEnd}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {ErrorBegin}), // LINE_BREAK + {}, // OTHER /*LIST*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {StringEnd}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {}, // OTHER + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {StringEnd}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {ErrorBegin}), // LINE_BREAK + {}, // OTHER /*STRUCT*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {StringEnd}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {}}}; // OTHER - - pda_tlt[static_cast(pda_state_t::PD_SCE)] = {{ /*ROOT*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {}, // OTHER + {}, // OPENING_BRACE + 
{}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {StringEnd}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {ErrorBegin}), // LINE_BREAK + {}}}; // OTHER + + pda_tlt[static_cast(pda_state_t::PD_SCE)] = {{ /*ROOT*/ + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {ErrorBegin}), // LINE_BREAK + {}, // OTHER /*LIST*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {}, // OTHER + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {ErrorBegin}), // LINE_BREAK + {}, // OTHER /*STRUCT*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {}}}; // OTHER + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {ErrorBegin}), // LINE_BREAK + {}}}; // OTHER pda_tlt[static_cast(pda_state_t::PD_PVL)] = { - { /*ROOT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {ErrorBegin}, // OTHER + { /*ROOT*/ + {alt_tokens({ErrorBegin}, {})}, // OPENING_BRACE + {alt_tokens({ErrorBegin}, {})}, // OPENING_BRACKET + {alt_tokens({ErrorBegin}, {})}, // CLOSING_BRACE + {alt_tokens({ErrorBegin}, {})}, // CLOSING_BRACKET + {alt_tokens({ErrorBegin}, {})}, // QUOTE + {alt_tokens({ErrorBegin}, {})}, // ESCAPE + {alt_tokens({ErrorBegin}, {})}, // COMMA + {alt_tokens({ErrorBegin}, {})}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {}), // LINE_BREAK + {alt_tokens({ErrorBegin}, {})}, // OTHER /*LIST*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ListEnd}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {}, // COMMA - {ErrorBegin}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {ErrorBegin}, // OTHER + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ListEnd}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {}, // COMMA + {ErrorBegin}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER /*STRUCT*/ {ErrorBegin}, // OPENING_BRACE {ErrorBegin}, // OPENING_BRACKET @@ -944,34 +1069,34 @@ auto get_translation_table(bool include_line_delimiter) {StructMemberEnd}, // COMMA {ErrorBegin}, // COLON {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK + nl_tokens({}, {ErrorBegin}), // LINE_BREAK {ErrorBegin}}}; // OTHER pda_tlt[static_cast(pda_state_t::PD_BFN)] = { - { /*ROOT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // 
COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - nl_tokens({ErrorBegin}), // LINE_BREAK - {ErrorBegin}, // OTHER + { /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + nl_tokens({ErrorBegin}, {ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER /*LIST*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - nl_tokens({ErrorBegin}), // LINE_BREAK - {ErrorBegin}, // OTHER + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + nl_tokens({ErrorBegin}, {ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER /*STRUCT*/ {ErrorBegin}, // OPENING_BRACE {ErrorBegin}, // OPENING_BRACKET @@ -982,156 +1107,159 @@ auto get_translation_table(bool include_line_delimiter) {ErrorBegin}, // COMMA {ErrorBegin}, // COLON {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK + nl_tokens({}, {ErrorBegin}), // LINE_BREAK {ErrorBegin}}}; // OTHER - pda_tlt[static_cast(pda_state_t::PD_FLN)] = {{ /*ROOT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - nl_tokens({ErrorBegin}), // LINE_BREAK - {ErrorBegin}, // OTHER - /*LIST*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - nl_tokens({ErrorBegin}), // LINE_BREAK - {ErrorBegin}, // OTHER - /*STRUCT*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {FieldNameEnd}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {}}}; // OTHER - - pda_tlt[static_cast(pda_state_t::PD_FNE)] = {{ /*ROOT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - nl_tokens({ErrorBegin}), // LINE_BREAK - {ErrorBegin}, // OTHER - /*LIST*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - nl_tokens({ErrorBegin}), // LINE_BREAK - {ErrorBegin}, // OTHER - /*STRUCT*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {}}}; // OTHER - - pda_tlt[static_cast(pda_state_t::PD_PFN)] = {{ /*ROOT*/ - {ErrorBegin}, // 
OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - nl_tokens({ErrorBegin}), // LINE_BREAK - {ErrorBegin}, // OTHER - /*LIST*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - nl_tokens({ErrorBegin}), // LINE_BREAK - {ErrorBegin}, // OTHER - /*STRUCT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {ErrorBegin}}}; // OTHER - - pda_tlt[static_cast(pda_state_t::PD_ERR)] = {{ /*ROOT*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {}, // OTHER + pda_tlt[static_cast(pda_state_t::PD_FLN)] = { + { /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + nl_tokens({ErrorBegin}, {ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER + /*LIST*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + nl_tokens({ErrorBegin}, {ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER + /*STRUCT*/ + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {FieldNameEnd}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {ErrorBegin}), // LINE_BREAK + {}}}; // OTHER + + pda_tlt[static_cast(pda_state_t::PD_FNE)] = { + { /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + nl_tokens({ErrorBegin}, {ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER + /*LIST*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + nl_tokens({ErrorBegin}, {ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER + /*STRUCT*/ + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {ErrorBegin}), // LINE_BREAK + {}}}; // OTHER + + pda_tlt[static_cast(pda_state_t::PD_PFN)] = { + { /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + 
{ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + nl_tokens({ErrorBegin}, {ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER + /*LIST*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + nl_tokens({ErrorBegin}, {ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER + /*STRUCT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {ErrorBegin}), // LINE_BREAK + {ErrorBegin}}}; // OTHER + + pda_tlt[static_cast(pda_state_t::PD_ERR)] = {{ /*ROOT*/ + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {}), // LINE_BREAK + {}, // OTHER /*LIST*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {}, // OTHER + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {}), // LINE_BREAK + {}, // OTHER /*STRUCT*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - nl_tokens({}), // LINE_BREAK - {}}}; // OTHER + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}, {}), // LINE_BREAK + {}}}; // OTHER return pda_tlt; } @@ -1295,14 +1423,19 @@ void get_stack_context(device_span json_in, constexpr auto max_translation_table_size = to_stack_op::NUM_SYMBOL_GROUPS * to_stack_op::TT_NUM_STATES; - // Translation table specialized on the choice of whether to reset on newlines outside of strings + // Transition table specialized on the choice of whether to reset on newlines + const auto transition_table = (stack_behavior == stack_behavior_t::ResetOnDelimiter) + ? to_stack_op::resetting_transition_table + : to_stack_op::transition_table; + + // Translation table specialized on the choice of whether to reset on newlines const auto translation_table = (stack_behavior == stack_behavior_t::ResetOnDelimiter) ? 
to_stack_op::resetting_translation_table : to_stack_op::translation_table; auto json_to_stack_ops_fst = fst::detail::make_fst( fst::detail::make_symbol_group_lut(to_stack_op::symbol_groups), - fst::detail::make_transition_table(to_stack_op::transition_table), + fst::detail::make_transition_table(transition_table), fst::detail::make_translation_table(translation_table), stream); @@ -1321,7 +1454,7 @@ void get_stack_context(device_span json_in, // Stack operations with indices are converted to top of the stack for each character in the input if (stack_behavior == stack_behavior_t::ResetOnDelimiter) { - fst::sparse_stack_op_to_top_of_stack( + fst::sparse_stack_op_to_top_of_stack( stack_ops.data(), device_span{stack_op_indices.data(), num_stack_ops}, JSONWithRecoveryToStackOp{}, @@ -1331,7 +1464,7 @@ void get_stack_context(device_span json_in, json_in.size(), stream); } else { - fst::sparse_stack_op_to_top_of_stack( + fst::sparse_stack_op_to_top_of_stack( stack_ops.data(), device_span{stack_op_indices.data(), num_stack_ops}, JSONToStackOp{}, @@ -1433,6 +1566,26 @@ std::pair, rmm::device_uvector> ge // character. auto zip_in = thrust::make_zip_iterator(json_in.data(), stack_symbols.data()); + // Spark, as the main stakeholder in the `recover_from_error` option, has the specific need to + // ignore any characters that follow the first value on each JSON line. This is an FST that + // fixes the stack context for those excess characters. That is, that all those excess characters + // will be interpreted in the root stack context + if (recover_from_error) { + auto fix_stack_of_excess_chars = fst::detail::make_fst( + fst::detail::make_symbol_group_lookup_op( + fix_stack_of_excess_chars::SymbolPairToSymbolGroupId{}), + fst::detail::make_transition_table(fix_stack_of_excess_chars::transition_table), + fst::detail::make_translation_functor(fix_stack_of_excess_chars::TransduceInputOp{}), + stream); + fix_stack_of_excess_chars.Transduce(zip_in, + static_cast(json_in.size()), + stack_symbols.data(), + thrust::make_discard_iterator(), + thrust::make_discard_iterator(), + fix_stack_of_excess_chars::start_state, + stream); + } + constexpr auto max_translation_table_size = tokenizer_pda::NUM_PDA_SGIDS * static_cast(tokenizer_pda::pda_state_t::PD_NUM_STATES); diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu index 2d363c51fce..c211d17f13a 100644 --- a/cpp/src/io/json/write_json.cu +++ b/cpp/src/io/json/write_json.cu @@ -504,6 +504,12 @@ struct column_to_strings_fn { { } + ~column_to_strings_fn() = default; + column_to_strings_fn(column_to_strings_fn const&) = delete; + column_to_strings_fn& operator=(column_to_strings_fn const&) = delete; + column_to_strings_fn(column_to_strings_fn&&) = delete; + column_to_strings_fn& operator=(column_to_strings_fn&&) = delete; + // unsupported type of column: template std::enable_if_t(), std::unique_ptr> operator()( @@ -614,17 +620,18 @@ struct column_to_strings_fn { auto child_string_with_null = [&]() { if (child_view.type().id() == type_id::STRUCT) { - return (*this).template operator()( - child_view, - children_names.size() > child_index ? children_names[child_index].children - : std::vector{}); - } else if (child_view.type().id() == type_id::LIST) { - return (*this).template operator()(child_view, + return this->template operator()(child_view, children_names.size() > child_index ? 
children_names[child_index].children : std::vector{}); + } else if (child_view.type().id() == type_id::LIST) { + return this->template operator()(child_view, + children_names.size() > child_index + ? children_names[child_index].children + : std::vector{}); } else { - return cudf::type_dispatcher(child_view.type(), *this, child_view); + return cudf::type_dispatcher( + child_view.type(), *this, child_view); } }; auto new_offsets = cudf::lists::detail::get_normalized_offsets( @@ -679,27 +686,29 @@ struct column_to_strings_fn { // auto i_col_begin = thrust::make_zip_iterator(thrust::counting_iterator(0), column_begin); - std::transform(i_col_begin, - i_col_begin + num_columns, - std::back_inserter(str_column_vec), - [this, &children_names](auto const& i_current_col) { - auto const i = thrust::get<0>(i_current_col); - auto const& current_col = thrust::get<1>(i_current_col); - // Struct needs children's column names - if (current_col.type().id() == type_id::STRUCT) { - return (*this).template operator()( - current_col, - children_names.size() > i ? children_names[i].children - : std::vector{}); - } else if (current_col.type().id() == type_id::LIST) { - return (*this).template operator()( - current_col, - children_names.size() > i ? children_names[i].children - : std::vector{}); - } else { - return cudf::type_dispatcher(current_col.type(), *this, current_col); - } - }); + std::transform( + i_col_begin, + i_col_begin + num_columns, + std::back_inserter(str_column_vec), + [this, &children_names](auto const& i_current_col) { + auto const i = thrust::get<0>(i_current_col); + auto const& current_col = thrust::get<1>(i_current_col); + // Struct needs children's column names + if (current_col.type().id() == type_id::STRUCT) { + return this->template operator()(current_col, + children_names.size() > i + ? children_names[i].children + : std::vector{}); + } else if (current_col.type().id() == type_id::LIST) { + return this->template operator()(current_col, + children_names.size() > i + ? 
children_names[i].children + : std::vector{}); + } else { + return cudf::type_dispatcher( + current_col.type(), *this, current_col); + } + }); // create string table view from str_column_vec: // diff --git a/cpp/src/io/orc/orc.cpp b/cpp/src/io/orc/orc.cpp index bc399b75ef9..ee5fa4e8b5a 100644 --- a/cpp/src/io/orc/orc.cpp +++ b/cpp/src/io/orc/orc.cpp @@ -182,6 +182,19 @@ void ProtobufReader::read(timestamp_statistics& s, size_t maxlen) field_reader(5, s.minimum_nanos), field_reader(6, s.maximum_nanos)); function_builder(s, maxlen, op); + + // Adjust nanoseconds because they are encoded as (value + 1) + // Range [1, 1000'000] is translated here to [0, 999'999] + if (s.minimum_nanos.has_value()) { + auto& min_nanos = s.minimum_nanos.value(); + CUDF_EXPECTS(min_nanos >= 1 and min_nanos <= 1000'000, "Invalid minimum nanoseconds"); + --min_nanos; + } + if (s.maximum_nanos.has_value()) { + auto& max_nanos = s.maximum_nanos.value(); + CUDF_EXPECTS(max_nanos >= 1 and max_nanos <= 1000'000, "Invalid maximum nanoseconds"); + --max_nanos; + } } void ProtobufReader::read(column_statistics& s, size_t maxlen) diff --git a/cpp/src/io/orc/orc.hpp b/cpp/src/io/orc/orc.hpp index 6f65e384d2d..783ed4206b6 100644 --- a/cpp/src/io/orc/orc.hpp +++ b/cpp/src/io/orc/orc.hpp @@ -41,6 +41,12 @@ static constexpr uint32_t block_header_size = 3; // Seconds from January 1st, 1970 to January 1st, 2015 static constexpr int64_t orc_utc_epoch = 1420070400; +// Used for the nanosecond remainder in timestamp statistics when the actual nanoseconds of min/max +// are not included. As the timestamp statistics are stored as milliseconds + nanosecond remainder, +// the maximum nanosecond remainder is 999,999 (nanoseconds in a millisecond - 1). +static constexpr int32_t DEFAULT_MIN_NANOS = 0; +static constexpr int32_t DEFAULT_MAX_NANOS = 999'999; + struct PostScript { uint64_t footerLength = 0; // the length of the footer section in bytes CompressionKind compression = NONE; // the kind of generic compression used diff --git a/cpp/src/io/orc/orc_gpu.hpp b/cpp/src/io/orc/orc_gpu.hpp index dba7a9ffda5..243704b65d4 100644 --- a/cpp/src/io/orc/orc_gpu.hpp +++ b/cpp/src/io/orc/orc_gpu.hpp @@ -150,7 +150,8 @@ struct EncChunk { uint8_t dtype_len; // data type length int32_t scale; // scale for decimals or timestamps - uint32_t* dict_index; // dictionary index from row index + uint32_t* dict_index; // dictionary index from row index + uint32_t* dict_data_order; // map from data to sorted data indices uint32_t* decimal_offsets; orc_column_device_view const* column; }; @@ -191,11 +192,12 @@ struct stripe_dictionary { size_type num_rows = 0; // number of rows in the stripe // output - device_span data; // index of elements in the column to include in the dictionary - device_span index; // index into the dictionary for each row in the column - size_type entry_count = 0; // number of entries in the dictionary - size_type char_count = 0; // number of characters in the dictionary - bool is_enabled = false; // true if dictionary encoding is enabled for this stripe + device_span data; // index of elements in the column to include in the dictionary + device_span index; // index into the dictionary for each row in the column + device_span data_order; // map from data to sorted data indices + size_type entry_count = 0; // number of entries in the dictionary + size_type char_count = 0; // number of characters in the dictionary + bool is_enabled = false; // true if dictionary encoding is enabled for this stripe }; /** @@ -424,6 +426,20 @@ void 
rowgroup_char_counts(device_2dspan counts, device_span str_col_indexes, rmm::cuda_stream_view stream); +/** + * @brief Converts sizes of decimal elements to offsets within the rowgroup. + * + * @note The conversion is done in-place. After the conversion, the device vectors in \p elem_sizes + * hold the offsets. + * + * @param rg_bounds Ranges of rows in each rowgroup [rowgroup][column] + * @param elem_sizes Map between column indexes and decimal element sizes + * @param stream CUDA stream used for device memory operations and kernel launches + */ +void decimal_sizes_to_offsets(device_2dspan rg_bounds, + std::map>& elem_sizes, + rmm::cuda_stream_view stream); + /** * @brief Launches kernels to initialize statistics collection * diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu index 95f1db5bfd1..429fd5b929d 100644 --- a/cpp/src/io/orc/stats_enc.cu +++ b/cpp/src/io/orc/stats_enc.cu @@ -27,6 +27,10 @@ namespace cudf::io::orc::gpu { using strings::detail::fixed_point_string_size; +// Nanosecond statistics should not be enabled until the spec version is set correctly in the output +// files. See https://github.com/rapidsai/cudf/issues/14325 for more details +constexpr bool enable_nanosecond_statistics = false; + constexpr unsigned int init_threads_per_group = 32; constexpr unsigned int init_groups_per_block = 4; constexpr unsigned int init_threads_per_block = init_threads_per_group * init_groups_per_block; @@ -76,8 +80,8 @@ __global__ void __launch_bounds__(block_size, 1) { using block_scan = cub::BlockScan; __shared__ typename block_scan::TempStorage temp_storage; - volatile uint32_t stats_size = 0; - auto t = threadIdx.x; + uint32_t stats_size = 0; + auto t = threadIdx.x; __syncthreads(); for (thread_index_type start = 0; start < statistics_count; start += block_size) { uint32_t stats_len = 0, stats_pos; @@ -96,8 +100,10 @@ __global__ void __launch_bounds__(block_size, 1) stats_len = pb_fldlen_common + pb_fld_hdrlen + 2 * (pb_fld_hdrlen + pb_fldlen_int64); break; case dtype_timestamp64: - stats_len = pb_fldlen_common + pb_fld_hdrlen + 4 * (pb_fld_hdrlen + pb_fldlen_int64) + - 2 * (pb_fld_hdrlen + pb_fldlen_int32); + stats_len = pb_fldlen_common + pb_fld_hdrlen + 4 * (pb_fld_hdrlen + pb_fldlen_int64); + if constexpr (enable_nanosecond_statistics) { + stats_len += 2 * (pb_fld_hdrlen + pb_fldlen_int32); + } break; case dtype_float32: case dtype_float64: @@ -405,7 +411,8 @@ __global__ void __launch_bounds__(encode_threads_per_block) // optional sint64 minimumUtc = 3; // min,max values saved as milliseconds since UNIX epoch // optional sint64 maximumUtc = 4; // optional int32 minimumNanos = 5; // lower 6 TS digits for min/max to achieve nanosecond - // precision optional int32 maximumNanos = 6; + // precision + // optional int32 maximumNanos = 6; // } if (s->chunk.has_minmax) { cur[0] = 9 * 8 + ProtofType::FIXEDLEN; @@ -416,12 +423,22 @@ __global__ void __launch_bounds__(encode_threads_per_block) split_nanosecond_timestamp(s->chunk.max_value.i_val); // minimum/maximum are the same as minimumUtc/maximumUtc as we always write files in UTC - cur = pb_put_int(cur, 1, min_ms); // minimum - cur = pb_put_int(cur, 2, max_ms); // maximum - cur = pb_put_int(cur, 3, min_ms); // minimumUtc - cur = pb_put_int(cur, 4, max_ms); // maximumUtc - cur = pb_put_int(cur, 5, min_ns_remainder); // minimumNanos - cur = pb_put_int(cur, 6, max_ns_remainder); // maximumNanos + cur = pb_put_int(cur, 1, min_ms); // minimum + cur = pb_put_int(cur, 2, max_ms); // maximum + cur = pb_put_int(cur, 3, 
min_ms); // minimumUtc + cur = pb_put_int(cur, 4, max_ms); // maximumUtc + + if constexpr (enable_nanosecond_statistics) { + if (min_ns_remainder != DEFAULT_MIN_NANOS) { + // using uint because positive values are not zigzag encoded + cur = pb_put_uint(cur, 5, min_ns_remainder + 1); // minimumNanos + } + if (max_ns_remainder != DEFAULT_MAX_NANOS) { + // using uint because positive values are not zigzag encoded + cur = pb_put_uint(cur, 6, max_ns_remainder + 1); // maximumNanos + } + } + fld_start[1] = cur - (fld_start + 2); } break; diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index 3edcd3d83b2..0b249bbdafe 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -142,9 +142,7 @@ struct orcdec_state_s { * @param[in] base Pointer to raw byte stream data * @param[in] len Stream length in bytes */ -static __device__ void bytestream_init(volatile orc_bytestream_s* bs, - uint8_t const* base, - uint32_t len) +static __device__ void bytestream_init(orc_bytestream_s* bs, uint8_t const* base, uint32_t len) { uint32_t pos = (len > 0) ? static_cast(7 & reinterpret_cast(base)) : 0; bs->base = base - pos; @@ -160,8 +158,7 @@ static __device__ void bytestream_init(volatile orc_bytestream_s* bs, * @param[in] bs Byte stream input * @param[in] bytes_consumed Number of bytes that were consumed */ -static __device__ void bytestream_flush_bytes(volatile orc_bytestream_s* bs, - uint32_t bytes_consumed) +static __device__ void bytestream_flush_bytes(orc_bytestream_s* bs, uint32_t bytes_consumed) { uint32_t pos = bs->pos; uint32_t len = bs->len; @@ -197,7 +194,7 @@ static __device__ void bytestream_fill(orc_bytestream_s* bs, int t) * @param[in] pos Position in byte stream * @return byte */ -inline __device__ uint8_t bytestream_readbyte(volatile orc_bytestream_s* bs, int pos) +inline __device__ uint8_t bytestream_readbyte(orc_bytestream_s* bs, int pos) { return bs->buf.u8[pos & (bytestream_buffer_size - 1)]; } @@ -209,7 +206,7 @@ inline __device__ uint8_t bytestream_readbyte(volatile orc_bytestream_s* bs, int * @param[in] pos Position in byte stream * @result bits */ -inline __device__ uint32_t bytestream_readu32(volatile orc_bytestream_s* bs, int pos) +inline __device__ uint32_t bytestream_readu32(orc_bytestream_s* bs, int pos) { uint32_t a = bs->buf.u32[(pos & (bytestream_buffer_size - 1)) >> 2]; uint32_t b = bs->buf.u32[((pos + 4) & (bytestream_buffer_size - 1)) >> 2]; @@ -224,7 +221,7 @@ inline __device__ uint32_t bytestream_readu32(volatile orc_bytestream_s* bs, int * @param[in] numbits number of bits * @return bits */ -inline __device__ uint64_t bytestream_readu64(volatile orc_bytestream_s* bs, int pos) +inline __device__ uint64_t bytestream_readu64(orc_bytestream_s* bs, int pos) { uint32_t a = bs->buf.u32[(pos & (bytestream_buffer_size - 1)) >> 2]; uint32_t b = bs->buf.u32[((pos + 4) & (bytestream_buffer_size - 1)) >> 2]; @@ -245,9 +242,7 @@ inline __device__ uint64_t bytestream_readu64(volatile orc_bytestream_s* bs, int * @param[in] numbits number of bits * @return decoded value */ -inline __device__ uint32_t bytestream_readbits(volatile orc_bytestream_s* bs, - int bitpos, - uint32_t numbits) +inline __device__ uint32_t bytestream_readbits(orc_bytestream_s* bs, int bitpos, uint32_t numbits) { int idx = bitpos >> 5; uint32_t a = __byte_perm(bs->buf.u32[(idx + 0) & bytestream_buffer_mask], 0, 0x0123); @@ -263,9 +258,7 @@ inline __device__ uint32_t bytestream_readbits(volatile orc_bytestream_s* bs, * @param[in] numbits number of bits * @return decoded 
value */ -inline __device__ uint64_t bytestream_readbits64(volatile orc_bytestream_s* bs, - int bitpos, - uint32_t numbits) +inline __device__ uint64_t bytestream_readbits64(orc_bytestream_s* bs, int bitpos, uint32_t numbits) { int idx = bitpos >> 5; uint32_t a = __byte_perm(bs->buf.u32[(idx + 0) & bytestream_buffer_mask], 0, 0x0123); @@ -288,7 +281,7 @@ inline __device__ uint64_t bytestream_readbits64(volatile orc_bytestream_s* bs, * @param[in] numbits number of bits * @param[out] result decoded value */ -inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs, +inline __device__ void bytestream_readbe(orc_bytestream_s* bs, int bitpos, uint32_t numbits, uint32_t& result) @@ -304,7 +297,7 @@ inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs, * @param[in] numbits number of bits * @param[out] result decoded value */ -inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs, +inline __device__ void bytestream_readbe(orc_bytestream_s* bs, int bitpos, uint32_t numbits, int32_t& result) @@ -321,7 +314,7 @@ inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs, * @param[in] numbits number of bits * @param[out] result decoded value */ -inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs, +inline __device__ void bytestream_readbe(orc_bytestream_s* bs, int bitpos, uint32_t numbits, uint64_t& result) @@ -337,7 +330,7 @@ inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs, * @param[in] numbits number of bits * @param[out] result decoded value */ -inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs, +inline __device__ void bytestream_readbe(orc_bytestream_s* bs, int bitpos, uint32_t numbits, int64_t& result) @@ -354,7 +347,7 @@ inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs, * @return length of varint in bytes */ template -inline __device__ uint32_t varint_length(volatile orc_bytestream_s* bs, int pos) +inline __device__ uint32_t varint_length(orc_bytestream_s* bs, int pos) { if (bytestream_readbyte(bs, pos) > 0x7f) { uint32_t next32 = bytestream_readu32(bs, pos + 1); @@ -392,7 +385,7 @@ inline __device__ uint32_t varint_length(volatile orc_bytestream_s* bs, int pos) * @return new position in byte stream buffer */ template -inline __device__ int decode_base128_varint(volatile orc_bytestream_s* bs, int pos, T& result) +inline __device__ int decode_base128_varint(orc_bytestream_s* bs, int pos, T& result) { uint32_t v = bytestream_readbyte(bs, pos++); if (v > 0x7f) { @@ -446,7 +439,7 @@ inline __device__ int decode_base128_varint(volatile orc_bytestream_s* bs, int p /** * @brief Decodes a signed int128 encoded as base-128 varint (used for decimals) */ -inline __device__ __int128_t decode_varint128(volatile orc_bytestream_s* bs, int pos) +inline __device__ __int128_t decode_varint128(orc_bytestream_s* bs, int pos) { auto byte = bytestream_readbyte(bs, pos++); __int128_t const sign_mask = -(int32_t)(byte & 1); @@ -463,7 +456,7 @@ inline __device__ __int128_t decode_varint128(volatile orc_bytestream_s* bs, int /** * @brief Decodes an unsigned 32-bit varint */ -inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, uint32_t& result) +inline __device__ int decode_varint(orc_bytestream_s* bs, int pos, uint32_t& result) { uint32_t u; pos = decode_base128_varint(bs, pos, u); @@ -474,7 +467,7 @@ inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, uint /** * @brief Decodes an unsigned 64-bit varint */ -inline __device__ int 
decode_varint(volatile orc_bytestream_s* bs, int pos, uint64_t& result) +inline __device__ int decode_varint(orc_bytestream_s* bs, int pos, uint64_t& result) { uint64_t u; pos = decode_base128_varint(bs, pos, u); @@ -485,7 +478,7 @@ inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, uint /** * @brief Signed version of 32-bit decode_varint */ -inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, int32_t& result) +inline __device__ int decode_varint(orc_bytestream_s* bs, int pos, int32_t& result) { uint32_t u; pos = decode_base128_varint(bs, pos, u); @@ -496,7 +489,7 @@ inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, int3 /** * @brief Signed version of 64-bit decode_varint */ -inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, int64_t& result) +inline __device__ int decode_varint(orc_bytestream_s* bs, int pos, int64_t& result) { uint64_t u; pos = decode_base128_varint(bs, pos, u); @@ -514,7 +507,7 @@ inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, int6 * @return number of values decoded */ template -inline __device__ void lengths_to_positions(volatile T* vals, uint32_t numvals, unsigned int t) +inline __device__ void lengths_to_positions(T* vals, uint32_t numvals, unsigned int t) { for (uint32_t n = 1; n < numvals; n <<= 1) { __syncthreads(); @@ -534,8 +527,8 @@ inline __device__ void lengths_to_positions(volatile T* vals, uint32_t numvals, * @return number of values decoded */ template -static __device__ uint32_t Integer_RLEv1( - orc_bytestream_s* bs, volatile orc_rlev1_state_s* rle, volatile T* vals, uint32_t maxvals, int t) +static __device__ uint32_t +Integer_RLEv1(orc_bytestream_s* bs, orc_rlev1_state_s* rle, T* vals, uint32_t maxvals, int t) { uint32_t numvals, numruns; if (t == 0) { @@ -642,8 +635,8 @@ static const __device__ __constant__ uint8_t ClosestFixedBitsMap[65] = { */ template static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs, - volatile orc_rlev2_state_s* rle, - volatile T* vals, + orc_rlev2_state_s* rle, + T* vals, uint32_t maxvals, int t, bool has_buffered_values = false) @@ -883,7 +876,7 @@ static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs, * * @return 32-bit value */ -inline __device__ uint32_t rle8_read_bool32(volatile uint32_t* vals, uint32_t bitpos) +inline __device__ uint32_t rle8_read_bool32(uint32_t* vals, uint32_t bitpos) { uint32_t a = vals[(bitpos >> 5) + 0]; uint32_t b = vals[(bitpos >> 5) + 1]; @@ -903,11 +896,8 @@ inline __device__ uint32_t rle8_read_bool32(volatile uint32_t* vals, uint32_t bi * * @return number of values decoded */ -static __device__ uint32_t Byte_RLE(orc_bytestream_s* bs, - volatile orc_byterle_state_s* rle, - volatile uint8_t* vals, - uint32_t maxvals, - int t) +static __device__ uint32_t +Byte_RLE(orc_bytestream_s* bs, orc_byterle_state_s* rle, uint8_t* vals, uint32_t maxvals, int t) { uint32_t numvals, numruns; int r, tr; @@ -1006,8 +996,8 @@ static const __device__ __constant__ int64_t kPow5i[28] = {1, * @return number of values decoded */ static __device__ int Decode_Decimals(orc_bytestream_s* bs, - volatile orc_byterle_state_s* scratch, - volatile orcdec_state_s::values& vals, + orc_byterle_state_s* scratch, + orcdec_state_s::values& vals, int val_scale, int numvals, type_id dtype_id, @@ -1241,8 +1231,8 @@ __global__ void __launch_bounds__(block_size) } __syncthreads(); while (s->top.dict.dict_len > 0) { - uint32_t numvals = min(s->top.dict.dict_len, blockDim.x), len; - volatile 
uint32_t* vals = s->vals.u32; + uint32_t numvals = min(s->top.dict.dict_len, blockDim.x), len; + uint32_t* vals = s->vals.u32; bytestream_fill(&s->bs, t); __syncthreads(); if (is_rlev1(s->chunk.encoding_kind)) { @@ -1310,12 +1300,12 @@ static __device__ void DecodeRowPositions(orcdec_state_s* s, min((row_decoder_buffer_size - s->u.rowdec.nz_count) * 2, blockDim.x)); if (s->chunk.valid_map_base != nullptr) { // We have a present stream - uint32_t rmax = s->top.data.end_row - min((uint32_t)first_row, s->top.data.end_row); - auto r = (uint32_t)(s->top.data.cur_row + s->top.data.nrows + t - first_row); - uint32_t valid = (t < nrows && r < rmax) - ? (((uint8_t const*)s->chunk.valid_map_base)[r >> 3] >> (r & 7)) & 1 - : 0; - volatile auto* row_ofs_plus1 = (volatile uint16_t*)&s->u.rowdec.row[s->u.rowdec.nz_count]; + uint32_t rmax = s->top.data.end_row - min((uint32_t)first_row, s->top.data.end_row); + auto r = (uint32_t)(s->top.data.cur_row + s->top.data.nrows + t - first_row); + uint32_t valid = (t < nrows && r < rmax) + ? (((uint8_t const*)s->chunk.valid_map_base)[r >> 3] >> (r & 7)) & 1 + : 0; + auto* row_ofs_plus1 = (uint16_t*)&s->u.rowdec.row[s->u.rowdec.nz_count]; uint32_t nz_pos, row_plus1, nz_count = s->u.rowdec.nz_count, last_row; if (t < nrows) { row_ofs_plus1[t] = valid; } lengths_to_positions(row_ofs_plus1, nrows, t); diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 73c41e2bbcd..b99826e070e 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -24,6 +24,7 @@ #include #include +#include #include #include @@ -53,7 +54,7 @@ constexpr bool zero_pll_war = true; struct byterle_enc_state_s { uint32_t literal_run; uint32_t repeat_run; - volatile uint32_t rpt_map[(512 / 32) + 1]; + uint32_t rpt_map[(512 / 32) + 1]; }; struct intrle_enc_state_s { @@ -63,7 +64,7 @@ struct intrle_enc_state_s { uint32_t literal_w; uint32_t hdr_bytes; uint32_t pl_bytes; - volatile uint32_t delta_map[(512 / 32) + 1]; + uint32_t delta_map[(512 / 32) + 1]; }; struct strdata_enc_state_s { @@ -366,7 +367,7 @@ static __device__ uint32_t IntegerRLE( using block_reduce = cub::BlockReduce; uint8_t* dst = s->stream.data_ptrs[cid] + s->strm_pos[cid]; uint32_t out_cnt = 0; - __shared__ volatile uint64_t block_vmin; + __shared__ uint64_t block_vmin; while (numvals > 0) { T v0 = (t < numvals) ? 
inbuf[(inpos + t) & inmask] : 0; @@ -615,7 +616,7 @@ static __device__ void StoreStringData(uint8_t* dst, * @param[in] t thread id */ template -inline __device__ void lengths_to_positions(volatile T* vals, uint32_t numvals, unsigned int t) +inline __device__ void lengths_to_positions(T* vals, uint32_t numvals, unsigned int t) { for (uint32_t n = 1; n < numvals; n <<= 1) { __syncthreads(); @@ -836,6 +837,10 @@ __global__ void __launch_bounds__(block_size) if (dict_idx > 0x7fff'ffffu) { dict_idx = s->chunk.dict_index[dict_idx & 0x7fff'ffffu]; } + // translate dictionary index to sorted order, if enabled + if (s->chunk.dict_data_order != nullptr) { + dict_idx = s->chunk.dict_data_order[dict_idx]; + } s->vals.u32[nz_idx] = dict_idx; } else { string_view value = column.element(row); @@ -1143,7 +1148,7 @@ __global__ void __launch_bounds__(256) uint32_t comp_block_align) { __shared__ __align__(16) StripeStream ss; - __shared__ uint8_t* volatile uncomp_base_g; + __shared__ uint8_t* uncomp_base_g; auto const padded_block_header_size = util::round_up_unsafe(block_header_size, comp_block_align); auto const padded_comp_block_size = util::round_up_unsafe(max_comp_blk_size, comp_block_align); @@ -1196,8 +1201,8 @@ __global__ void __launch_bounds__(1024) uint32_t max_comp_blk_size) { __shared__ __align__(16) StripeStream ss; - __shared__ uint8_t const* volatile comp_src_g; - __shared__ uint32_t volatile comp_len_g; + __shared__ uint8_t const* comp_src_g; + __shared__ uint32_t comp_len_g; auto const stripe_id = blockIdx.x; auto const stream_id = blockIdx.y; @@ -1260,6 +1265,38 @@ __global__ void __launch_bounds__(1024) } } +// Holds a non-owning view of a decimal column's element sizes +struct decimal_column_element_sizes { + uint32_t col_idx; + device_span sizes; +}; + +// Converts sizes of individual decimal elements to offsets within each row group +// Conversion is done in-place +template +__global__ void decimal_sizes_to_offsets_kernel(device_2dspan rg_bounds, + device_span sizes) +{ + using block_scan = cub::BlockScan; + __shared__ typename block_scan::TempStorage scan_storage; + int const t = threadIdx.x; + + auto const& col_elem_sizes = sizes[blockIdx.x]; + auto const& row_group = rg_bounds[blockIdx.y][col_elem_sizes.col_idx]; + auto const elem_sizes = col_elem_sizes.sizes.data() + row_group.begin; + + uint32_t initial_value = 0; + // Do a series of block sums, storing results in the array as we go + for (int64_t pos = 0; pos < row_group.size(); pos += block_size) { + auto const tidx = pos + t; + auto tval = tidx < row_group.size() ? 
elem_sizes[tidx] : 0u; + uint32_t block_sum = 0; + block_scan(scan_storage).InclusiveSum(tval, tval, block_sum); + if (tidx < row_group.size()) { elem_sizes[tidx] = tval + initial_value; } + initial_value += block_sum; + } +} + void EncodeOrcColumnData(device_2dspan chunks, device_2dspan streams, rmm::cuda_stream_view stream) @@ -1368,6 +1405,30 @@ std::optional CompressOrcDataStreams( } } +void decimal_sizes_to_offsets(device_2dspan rg_bounds, + std::map>& elem_sizes, + rmm::cuda_stream_view stream) +{ + if (rg_bounds.count() == 0) return; + + // Convert map to a vector of views of the `elem_sizes` device buffers + std::vector h_sizes; + h_sizes.reserve(elem_sizes.size()); + std::transform(elem_sizes.begin(), elem_sizes.end(), std::back_inserter(h_sizes), [](auto& p) { + return decimal_column_element_sizes{p.first, p.second}; + }); + + // Copy the vector of views to the device so that we can pass it to the kernel + auto d_sizes = cudf::detail::make_device_uvector_async( + h_sizes, stream, rmm::mr::get_current_device_resource()); + + constexpr int block_size = 256; + dim3 const grid_size{static_cast(elem_sizes.size()), // num decimal columns + static_cast(rg_bounds.size().first)}; // num rowgroups + decimal_sizes_to_offsets_kernel + <<>>(rg_bounds, d_sizes); +} + } // namespace gpu } // namespace orc } // namespace io diff --git a/cpp/src/io/orc/stripe_init.cu b/cpp/src/io/orc/stripe_init.cu index 8eeca504121..b31a4a081d1 100644 --- a/cpp/src/io/orc/stripe_init.cu +++ b/cpp/src/io/orc/stripe_init.cu @@ -499,7 +499,7 @@ __global__ void __launch_bounds__(128, 8) gpuParseRowGroupIndex(RowGroup* row_gr : row_groups[(s->rowgroup_start + i) * num_columns + blockIdx.x].start_row; for (int j = t4; j < rowgroup_size4; j += 4) { ((uint32_t*)&row_groups[(s->rowgroup_start + i) * num_columns + blockIdx.x])[j] = - ((volatile uint32_t*)&s->rowgroups[i])[j]; + ((uint32_t*)&s->rowgroups[i])[j]; } row_groups[(s->rowgroup_start + i) * num_columns + blockIdx.x].num_rows = num_rows; // Updating in case of struct diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 3d8bdb4ec97..ac5993e764e 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -50,6 +51,8 @@ #include #include #include +#include +#include #include #include @@ -867,16 +870,15 @@ encoded_data encode_columns(orc_table_view const& orc_table, ck.null_mask_num_rows = aligned_rowgroups[rg_idx][column.index()].size(); ck.encoding_kind = column.orc_encoding(); ck.type_kind = column.orc_kind(); - if (ck.type_kind == TypeKind::STRING) { - ck.dict_index = (ck.encoding_kind == DICTIONARY_V2) - ? column.host_stripe_dict(stripe.id).index.data() - : nullptr; - ck.dtype_len = 1; - } else { - ck.dtype_len = column.type_width(); - } - ck.scale = column.scale(); - if (ck.type_kind == TypeKind::DECIMAL) { ck.decimal_offsets = column.decimal_offsets(); } + auto const is_str_dict = + ck.type_kind == TypeKind::STRING and ck.encoding_kind == DICTIONARY_V2; + ck.dict_index = is_str_dict ? column.host_stripe_dict(stripe.id).index.data() : nullptr; + ck.dict_data_order = + is_str_dict ? column.host_stripe_dict(stripe.id).data_order.data() : nullptr; + ck.dtype_len = (ck.type_kind == TypeKind::STRING) ? 1 : column.type_width(); + ck.scale = column.scale(); + ck.decimal_offsets = + (ck.type_kind == TypeKind::DECIMAL) ? 
column.decimal_offsets() : nullptr; } } } @@ -1882,7 +1884,7 @@ encoder_decimal_info decimal_chunk_sizes(orc_table_view& orc_table, auto& current_sizes = elem_sizes.insert({orc_col.index(), rmm::device_uvector(orc_col.size(), stream)}) .first->second; - thrust::tabulate(rmm::exec_policy(stream), + thrust::tabulate(rmm::exec_policy_nosync(stream), current_sizes.begin(), current_sizes.end(), [d_cols = device_span{orc_table.d_columns}, @@ -1908,25 +1910,14 @@ encoder_decimal_info decimal_chunk_sizes(orc_table_view& orc_table, return varint_size(zigzaged_value); }); - // Compute element offsets within each row group - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0ul), - segmentation.num_rowgroups(), - [sizes = device_span{current_sizes}, - rg_bounds = device_2dspan{segmentation.rowgroups}, - col_idx = orc_col.index()] __device__(auto rg_idx) { - auto const& range = rg_bounds[rg_idx][col_idx]; - thrust::inclusive_scan(thrust::seq, - sizes.begin() + range.begin, - sizes.begin() + range.end, - sizes.begin() + range.begin); - }); - orc_col.attach_decimal_offsets(current_sizes.data()); } } if (elem_sizes.empty()) return {}; + // Compute element offsets within each row group + gpu::decimal_sizes_to_offsets(segmentation.rowgroups, elem_sizes, stream); + // Gather the row group sizes and copy to host auto d_tmp_rowgroup_sizes = rmm::device_uvector(segmentation.num_rowgroups(), stream); std::map> rg_sizes; @@ -2023,24 +2014,41 @@ struct stripe_dictionaries { hostdevice_2dvector views; // descriptors [string_column][stripe] std::vector> data_owner; // dictionary data owner, per stripe std::vector> index_owner; // dictionary index owner, per stripe + std::vector> order_owner; // dictionary order owner, per stripe // Should be called after encoding is complete to deallocate the dictionary buffers. 
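The in-place size-to-offset conversion above is a block-wide inclusive scan whose running total is carried across chunks of `block_size` elements. A minimal sketch of that pattern (hypothetical kernel name, one block scanning one array), assuming only CUB:

#include <cub/cub.cuh>

// One thread block converts element sizes to inclusive offsets in place, walking the
// array in block_size chunks and carrying the running total across chunks, mirroring
// the decimal_sizes_to_offsets_kernel shown above.
template <int block_size>
__global__ void sizes_to_offsets_demo(uint32_t* data, int64_t n)
{
  using block_scan = cub::BlockScan<uint32_t, block_size>;
  __shared__ typename block_scan::TempStorage scan_storage;
  int const t = threadIdx.x;

  uint32_t running_total = 0;  // sum of all fully processed chunks
  for (int64_t pos = 0; pos < n; pos += block_size) {
    auto const tidx    = pos + t;
    uint32_t tval      = tidx < n ? data[tidx] : 0u;
    uint32_t chunk_sum = 0;
    block_scan(scan_storage).InclusiveSum(tval, tval, chunk_sum);
    if (tidx < n) { data[tidx] = tval + running_total; }
    running_total += chunk_sum;
    __syncthreads();  // scan_storage is reused by the next chunk
  }
}
// e.g. sizes_to_offsets_demo<256><<<1, 256>>>(d_sizes, n) turns {2, 3, 1, 5} into {2, 5, 6, 11}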
void on_encode_complete(rmm::cuda_stream_view stream) { data_owner.clear(); index_owner.clear(); + order_owner.clear(); for (auto& sd : views.host_view().flat_view()) { - sd.data = {}; - sd.index = {}; + sd.data = {}; + sd.index = {}; + sd.data_order = {}; } views.host_to_device_async(stream); } }; +/** + * @brief Compares two rows in a strings column + */ +struct string_rows_less { + device_span cols; + uint32_t col_idx; + __device__ bool operator()(size_type lhs_idx, size_type rhs_idx) const + { + auto const& col = cols[col_idx]; + return col.element(lhs_idx) < col.element(rhs_idx); + } +}; + // Build stripe dictionaries for string columns stripe_dictionaries build_dictionaries(orc_table_view& orc_table, file_segmentation const& segmentation, + bool sort_dictionaries, rmm::cuda_stream_view stream) { std::vector>> hash_maps_storage( @@ -2091,6 +2099,7 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table, // Data owners; can be cleared after encode std::vector> dict_data_owner; std::vector> dict_index_owner; + std::vector> dict_order_owner; // Make decision about which stripes to encode with dictionary encoding for (auto col_idx : orc_table.string_column_indices) { auto& str_column = orc_table.column(col_idx); @@ -2133,15 +2142,61 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table, gpu::collect_map_entries(stripe_dicts, stream); gpu::get_dictionary_indices(stripe_dicts, orc_table.d_columns, stream); - // Clear map slots; hash map storage is deallocated at the end of this function - auto device_dicts_flat = stripe_dicts.device_view().flat_view(); - thrust::for_each(rmm::exec_policy(stream), - device_dicts_flat.begin(), - device_dicts_flat.end(), - [] __device__(auto& sd) { sd.map_slots = {}; }); - stripe_dicts.device_to_host_async(stream); + // deallocate hash map storage, unused after this point + hash_maps_storage.clear(); + + // Clear map slots and attach order buffers + auto dictionaries_flat = stripe_dicts.host_view().flat_view(); + for (auto& sd : dictionaries_flat) { + if (not sd.is_enabled) { continue; } + + sd.map_slots = {}; + if (sort_dictionaries) { + dict_order_owner.emplace_back(sd.entry_count, stream); + sd.data_order = dict_order_owner.back(); + } else { + sd.data_order = {}; + } + } + stripe_dicts.host_to_device_async(stream); + + // Sort stripe dictionaries alphabetically + if (sort_dictionaries) { + auto streams = cudf::detail::fork_streams(stream, std::min(dict_order_owner.size(), 8)); + auto stream_idx = 0; + for (auto& sd : dictionaries_flat) { + if (not sd.is_enabled) { continue; } + + auto const& current_stream = streams[stream_idx]; + + // Sort the dictionary data and create a mapping from the sorted order to the original + thrust::sequence( + rmm::exec_policy_nosync(current_stream), sd.data_order.begin(), sd.data_order.end()); + thrust::sort_by_key(rmm::exec_policy_nosync(current_stream), + sd.data.begin(), + sd.data.end(), + sd.data_order.begin(), + string_rows_less{orc_table.d_columns, sd.column_idx}); + + // Create the inverse permutation - i.e. 
the mapping from the original order to the sorted + auto order_copy = cudf::detail::make_device_uvector_async( + sd.data_order, current_stream, rmm::mr::get_current_device_resource()); + thrust::scatter(rmm::exec_policy_nosync(current_stream), + thrust::counting_iterator(0), + thrust::counting_iterator(sd.data_order.size()), + order_copy.begin(), + sd.data_order.begin()); + + stream_idx = (stream_idx + 1) % streams.size(); + } + + cudf::detail::join_streams(streams, stream); + } - return {std::move(stripe_dicts), std::move(dict_data_owner), std::move(dict_index_owner)}; + return {std::move(stripe_dicts), + std::move(dict_data_owner), + std::move(dict_index_owner), + std::move(dict_order_owner)}; } /** @@ -2153,6 +2208,7 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table, * @param max_stripe_size Maximum size of stripes in the output file * @param row_index_stride The row index stride * @param enable_dictionary Whether dictionary is enabled + * @param sort_dictionaries Whether to sort the dictionaries * @param compression_kind The compression kind * @param compression_blocksize The block size used for compression * @param stats_freq Column statistics granularity type for parquet/orc writers @@ -2167,6 +2223,7 @@ auto convert_table_to_orc_data(table_view const& input, stripe_size_limits max_stripe_size, size_type row_index_stride, bool enable_dictionary, + bool sort_dictionaries, CompressionKind compression_kind, size_t compression_blocksize, statistics_freq stats_freq, @@ -2191,7 +2248,7 @@ auto convert_table_to_orc_data(table_view const& input, auto segmentation = calculate_segmentation(orc_table.columns, std::move(rowgroup_bounds), max_stripe_size); - auto stripe_dicts = build_dictionaries(orc_table, segmentation, stream); + auto stripe_dicts = build_dictionaries(orc_table, segmentation, sort_dictionaries, stream); auto dec_chunk_sizes = decimal_chunk_sizes(orc_table, segmentation, stream); auto const uncompressed_block_align = uncomp_block_alignment(compression_kind); @@ -2325,6 +2382,7 @@ writer::impl::impl(std::unique_ptr sink, _compression_blocksize(compression_block_size(_compression_kind)), _compression_statistics(options.get_compression_statistics()), _stats_freq(options.get_statistics_freq()), + _sort_dictionaries{options.get_enable_dictionary_sort()}, _single_write_mode(mode), _kv_meta(options.get_key_value_metadata()), _out_sink(std::move(sink)) @@ -2346,6 +2404,7 @@ writer::impl::impl(std::unique_ptr sink, _compression_blocksize(compression_block_size(_compression_kind)), _compression_statistics(options.get_compression_statistics()), _stats_freq(options.get_statistics_freq()), + _sort_dictionaries{options.get_enable_dictionary_sort()}, _single_write_mode(mode), _kv_meta(options.get_key_value_metadata()), _out_sink(std::move(sink)) @@ -2393,6 +2452,7 @@ void writer::impl::write(table_view const& input) _max_stripe_size, _row_index_stride, _enable_dictionary, + _sort_dictionaries, _compression_kind, _compression_blocksize, _stats_freq, diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp index 67c65eb9a37..0d1a83f3d85 100644 --- a/cpp/src/io/orc/writer_impl.hpp +++ b/cpp/src/io/orc/writer_impl.hpp @@ -346,6 +346,7 @@ class writer::impl { size_t const _compression_blocksize; std::shared_ptr _compression_statistics; // Optional output statistics_freq const _stats_freq; + bool const _sort_dictionaries; single_write_mode const _single_write_mode; // Special parameter only used by `write()` to // indicate that we are guaranteeing a single table 
// write. This enables some internal optimizations. diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index 9ff1869edde..53ff31ab0a7 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -24,10 +24,8 @@ #include -namespace cudf { -namespace io { -namespace parquet { -namespace gpu { +namespace cudf::io::parquet::detail { + namespace { constexpr int DEFAULT_BLOCK_SIZE = 256; } @@ -101,7 +99,7 @@ struct map_find_fn { template __global__ void __launch_bounds__(block_size) - populate_chunk_hash_maps_kernel(cudf::detail::device_2dspan frags) + populate_chunk_hash_maps_kernel(cudf::detail::device_2dspan frags) { auto col_idx = blockIdx.y; auto block_x = blockIdx.x; @@ -226,7 +224,7 @@ __global__ void __launch_bounds__(block_size) template __global__ void __launch_bounds__(block_size) - get_dictionary_indices_kernel(cudf::detail::device_2dspan frags) + get_dictionary_indices_kernel(cudf::detail::device_2dspan frags) { auto col_idx = blockIdx.y; auto block_x = blockIdx.x; @@ -276,7 +274,7 @@ void initialize_chunk_hash_maps(device_span chunks, rmm::cuda_st <<>>(chunks); } -void populate_chunk_hash_maps(cudf::detail::device_2dspan frags, +void populate_chunk_hash_maps(cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream) { dim3 const dim_grid(frags.size().second, frags.size().first); @@ -290,14 +288,11 @@ void collect_map_entries(device_span chunks, rmm::cuda_stream_vi collect_map_entries_kernel<<>>(chunks); } -void get_dictionary_indices(cudf::detail::device_2dspan frags, +void get_dictionary_indices(cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream) { dim3 const dim_grid(frags.size().second, frags.size().first); get_dictionary_indices_kernel <<>>(frags); } -} // namespace gpu -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index 5c7b8ca3f8c..5a2b8aa8f2a 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -21,9 +21,7 @@ #include #include -namespace cudf { -namespace io { -namespace parquet { +namespace cudf::io::parquet::detail { /** * @brief Base class for parquet field functors. @@ -341,61 +339,6 @@ struct parquet_field_struct_list : public parquet_field_list { } }; -// TODO(ets): replace current union handling (which mirrors thrift) to use std::optional fields -// in a struct -/** - * @brief Functor to read a union member from CompactProtocolReader - * - * @tparam is_empty True if tparam `T` type is empty type, else false. 
- * - * @return True if field types mismatch or if the process of reading a - * union member fails - */ -template -class ParquetFieldUnionFunctor : public parquet_field { - bool& is_set; - T& val; - - public: - ParquetFieldUnionFunctor(int f, bool& b, T& v) : parquet_field(f), is_set(b), val(v) {} - - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - if (field_type != ST_FLD_STRUCT) { - return true; - } else { - is_set = true; - return !cpr->read(&val); - } - } -}; - -template -class ParquetFieldUnionFunctor : public parquet_field { - bool& is_set; - T& val; - - public: - ParquetFieldUnionFunctor(int f, bool& b, T& v) : parquet_field(f), is_set(b), val(v) {} - - inline bool operator()(CompactProtocolReader* cpr, int field_type) - { - if (field_type != ST_FLD_STRUCT) { - return true; - } else { - is_set = true; - cpr->skip_struct_field(field_type); - return false; - } - } -}; - -template -ParquetFieldUnionFunctor> ParquetFieldUnion(int f, bool& b, T& v) -{ - return ParquetFieldUnionFunctor>(f, b, v); -} - /** * @brief Functor to read a binary from CompactProtocolReader * @@ -597,34 +540,38 @@ bool CompactProtocolReader::read(FileMetaData* f) bool CompactProtocolReader::read(SchemaElement* s) { + using optional_converted_type = + parquet_field_optional>; + using optional_logical_type = + parquet_field_optional>; auto op = std::make_tuple(parquet_field_enum(1, s->type), parquet_field_int32(2, s->type_length), parquet_field_enum(3, s->repetition_type), parquet_field_string(4, s->name), parquet_field_int32(5, s->num_children), - parquet_field_enum(6, s->converted_type), + optional_converted_type(6, s->converted_type), parquet_field_int32(7, s->decimal_scale), parquet_field_int32(8, s->decimal_precision), parquet_field_optional(9, s->field_id), - parquet_field_struct(10, s->logical_type)); + optional_logical_type(10, s->logical_type)); return function_builder(this, op); } bool CompactProtocolReader::read(LogicalType* l) { - auto op = - std::make_tuple(ParquetFieldUnion(1, l->isset.STRING, l->STRING), - ParquetFieldUnion(2, l->isset.MAP, l->MAP), - ParquetFieldUnion(3, l->isset.LIST, l->LIST), - ParquetFieldUnion(4, l->isset.ENUM, l->ENUM), - ParquetFieldUnion(5, l->isset.DECIMAL, l->DECIMAL), // read the struct - ParquetFieldUnion(6, l->isset.DATE, l->DATE), - ParquetFieldUnion(7, l->isset.TIME, l->TIME), // read the struct - ParquetFieldUnion(8, l->isset.TIMESTAMP, l->TIMESTAMP), // read the struct - ParquetFieldUnion(10, l->isset.INTEGER, l->INTEGER), // read the struct - ParquetFieldUnion(11, l->isset.UNKNOWN, l->UNKNOWN), - ParquetFieldUnion(12, l->isset.JSON, l->JSON), - ParquetFieldUnion(13, l->isset.BSON, l->BSON)); + auto op = std::make_tuple( + parquet_field_union_enumerator(1, l->type), + parquet_field_union_enumerator(2, l->type), + parquet_field_union_enumerator(3, l->type), + parquet_field_union_enumerator(4, l->type), + parquet_field_union_struct(5, l->type, l->decimal_type), + parquet_field_union_enumerator(6, l->type), + parquet_field_union_struct(7, l->type, l->time_type), + parquet_field_union_struct(8, l->type, l->timestamp_type), + parquet_field_union_struct(10, l->type, l->int_type), + parquet_field_union_enumerator(11, l->type), + parquet_field_union_enumerator(12, l->type), + parquet_field_union_enumerator(13, l->type)); return function_builder(this, op); } @@ -650,9 +597,9 @@ bool CompactProtocolReader::read(TimestampType* t) bool CompactProtocolReader::read(TimeUnit* u) { - auto op = std::make_tuple(ParquetFieldUnion(1, u->isset.MILLIS, u->MILLIS), 
- ParquetFieldUnion(2, u->isset.MICROS, u->MICROS), - ParquetFieldUnion(3, u->isset.NANOS, u->NANOS)); + auto op = std::make_tuple(parquet_field_union_enumerator(1, u->type), + parquet_field_union_enumerator(2, u->type), + parquet_field_union_enumerator(3, u->type)); return function_builder(this, op); } @@ -769,12 +716,15 @@ bool CompactProtocolReader::read(ColumnIndex* c) bool CompactProtocolReader::read(Statistics* s) { - auto op = std::make_tuple(parquet_field_binary(1, s->max), - parquet_field_binary(2, s->min), - parquet_field_int64(3, s->null_count), - parquet_field_int64(4, s->distinct_count), - parquet_field_binary(5, s->max_value), - parquet_field_binary(6, s->min_value)); + using optional_binary = parquet_field_optional, parquet_field_binary>; + using optional_int64 = parquet_field_optional; + + auto op = std::make_tuple(optional_binary(1, s->max), + optional_binary(2, s->min), + optional_int64(3, s->null_count), + optional_int64(4, s->distinct_count), + optional_binary(5, s->max_value), + optional_binary(6, s->min_value)); return function_builder(this, op); } @@ -870,6 +820,4 @@ int CompactProtocolReader::WalkSchema( } } -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/compact_protocol_reader.hpp b/cpp/src/io/parquet/compact_protocol_reader.hpp index 619815db503..cbb4161b138 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.hpp +++ b/cpp/src/io/parquet/compact_protocol_reader.hpp @@ -25,9 +25,8 @@ #include #include -namespace cudf { -namespace io { -namespace parquet { +namespace cudf::io::parquet::detail { + /** * @brief Class for parsing Parquet's Thrift Compact Protocol encoded metadata * @@ -147,6 +146,4 @@ class CompactProtocolReader { friend class parquet_field_struct_blob; }; -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/compact_protocol_writer.cpp b/cpp/src/io/parquet/compact_protocol_writer.cpp index 60bc8984d81..fbeda7f1099 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.cpp +++ b/cpp/src/io/parquet/compact_protocol_writer.cpp @@ -16,9 +16,9 @@ #include "compact_protocol_writer.hpp" -namespace cudf { -namespace io { -namespace parquet { +#include + +namespace cudf::io::parquet::detail { /** * @brief Parquet CompactProtocolWriter class @@ -48,13 +48,11 @@ size_t CompactProtocolWriter::write(DecimalType const& decimal) size_t CompactProtocolWriter::write(TimeUnit const& time_unit) { CompactProtocolFieldWriter c(*this); - auto const isset = time_unit.isset; - if (isset.MILLIS) { - c.field_struct(1, time_unit.MILLIS); - } else if (isset.MICROS) { - c.field_struct(2, time_unit.MICROS); - } else if (isset.NANOS) { - c.field_struct(3, time_unit.NANOS); + switch (time_unit.type) { + case TimeUnit::MILLIS: + case TimeUnit::MICROS: + case TimeUnit::NANOS: c.field_empty_struct(time_unit.type); break; + default: CUDF_FAIL("Trying to write an invalid TimeUnit " + std::to_string(time_unit.type)); } return c.value(); } @@ -86,31 +84,29 @@ size_t CompactProtocolWriter::write(IntType const& integer) size_t CompactProtocolWriter::write(LogicalType const& logical_type) { CompactProtocolFieldWriter c(*this); - auto const isset = logical_type.isset; - if (isset.STRING) { - c.field_struct(1, logical_type.STRING); - } else if (isset.MAP) { - c.field_struct(2, logical_type.MAP); - } else if (isset.LIST) { - c.field_struct(3, logical_type.LIST); - } else if (isset.ENUM) { - c.field_struct(4, 
logical_type.ENUM); - } else if (isset.DECIMAL) { - c.field_struct(5, logical_type.DECIMAL); - } else if (isset.DATE) { - c.field_struct(6, logical_type.DATE); - } else if (isset.TIME) { - c.field_struct(7, logical_type.TIME); - } else if (isset.TIMESTAMP) { - c.field_struct(8, logical_type.TIMESTAMP); - } else if (isset.INTEGER) { - c.field_struct(10, logical_type.INTEGER); - } else if (isset.UNKNOWN) { - c.field_struct(11, logical_type.UNKNOWN); - } else if (isset.JSON) { - c.field_struct(12, logical_type.JSON); - } else if (isset.BSON) { - c.field_struct(13, logical_type.BSON); + switch (logical_type.type) { + case LogicalType::STRING: + case LogicalType::MAP: + case LogicalType::LIST: + case LogicalType::ENUM: + case LogicalType::DATE: + case LogicalType::UNKNOWN: + case LogicalType::JSON: + case LogicalType::BSON: c.field_empty_struct(logical_type.type); break; + case LogicalType::DECIMAL: + c.field_struct(LogicalType::DECIMAL, logical_type.decimal_type.value()); + break; + case LogicalType::TIME: + c.field_struct(LogicalType::TIME, logical_type.time_type.value()); + break; + case LogicalType::TIMESTAMP: + c.field_struct(LogicalType::TIMESTAMP, logical_type.timestamp_type.value()); + break; + case LogicalType::INTEGER: + c.field_struct(LogicalType::INTEGER, logical_type.int_type.value()); + break; + default: + CUDF_FAIL("Trying to write an invalid LogicalType " + std::to_string(logical_type.type)); } return c.value(); } @@ -126,20 +122,15 @@ size_t CompactProtocolWriter::write(SchemaElement const& s) c.field_string(4, s.name); if (s.type == UNDEFINED_TYPE) { c.field_int(5, s.num_children); } - if (s.converted_type != UNKNOWN) { - c.field_int(6, s.converted_type); + if (s.converted_type.has_value()) { + c.field_int(6, s.converted_type.value()); if (s.converted_type == DECIMAL) { c.field_int(7, s.decimal_scale); c.field_int(8, s.decimal_precision); } } - if (s.field_id) { c.field_int(9, s.field_id.value()); } - auto const isset = s.logical_type.isset; - // TODO: add handling for all logical types - // if (isset.STRING or isset.MAP or isset.LIST or isset.ENUM or isset.DECIMAL or isset.DATE or - // isset.TIME or isset.TIMESTAMP or isset.INTEGER or isset.UNKNOWN or isset.JSON or isset.BSON) - // { - if (isset.TIMESTAMP or isset.TIME) { c.field_struct(10, s.logical_type); } + if (s.field_id.has_value()) { c.field_int(9, s.field_id.value()); } + if (s.logical_type.has_value()) { c.field_struct(10, s.logical_type.value()); } return c.value(); } @@ -197,12 +188,12 @@ size_t CompactProtocolWriter::write(ColumnChunkMetaData const& s) size_t CompactProtocolWriter::write(Statistics const& s) { CompactProtocolFieldWriter c(*this); - if (not s.max.empty()) { c.field_binary(1, s.max); } - if (not s.min.empty()) { c.field_binary(2, s.min); } - if (s.null_count != -1) { c.field_int(3, s.null_count); } - if (s.distinct_count != -1) { c.field_int(4, s.distinct_count); } - if (not s.max_value.empty()) { c.field_binary(5, s.max_value); } - if (not s.min_value.empty()) { c.field_binary(6, s.min_value); } + if (s.max.has_value()) { c.field_binary(1, s.max.value()); } + if (s.min.has_value()) { c.field_binary(2, s.min.value()); } + if (s.null_count.has_value()) { c.field_int(3, s.null_count.value()); } + if (s.distinct_count.has_value()) { c.field_int(4, s.distinct_count.value()); } + if (s.max_value.has_value()) { c.field_binary(5, s.max_value.value()); } + if (s.min_value.has_value()) { c.field_binary(6, s.min_value.value()); } return c.value(); } @@ -225,9 +216,9 @@ size_t 
CompactProtocolWriter::write(OffsetIndex const& s) size_t CompactProtocolWriter::write(ColumnOrder const& co) { CompactProtocolFieldWriter c(*this); - switch (co) { - case ColumnOrder::TYPE_ORDER: c.field_empty_struct(1); break; - default: break; + switch (co.type) { + case ColumnOrder::TYPE_ORDER: c.field_empty_struct(co.type); break; + default: CUDF_FAIL("Trying to write an invalid ColumnOrder " + std::to_string(co.type)); } return c.value(); } @@ -391,6 +382,4 @@ inline void CompactProtocolFieldWriter::set_current_field(int const& field) current_field_value = field; } -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/compact_protocol_writer.hpp b/cpp/src/io/parquet/compact_protocol_writer.hpp index 26d66527aa5..4849a814b14 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.hpp +++ b/cpp/src/io/parquet/compact_protocol_writer.hpp @@ -25,9 +25,7 @@ #include #include -namespace cudf { -namespace io { -namespace parquet { +namespace cudf::io::parquet::detail { /** * @brief Class for parsing Parquet's Thrift Compact Protocol encoded metadata @@ -115,6 +113,4 @@ class CompactProtocolFieldWriter { inline void set_current_field(int const& field); }; -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/decode_preprocess.cu b/cpp/src/io/parquet/decode_preprocess.cu index 8de3702bc2e..544c93ee616 100644 --- a/cpp/src/io/parquet/decode_preprocess.cu +++ b/cpp/src/io/parquet/decode_preprocess.cu @@ -23,10 +23,7 @@ #include #include -namespace cudf { -namespace io { -namespace parquet { -namespace gpu { +namespace cudf::io::parquet::detail { namespace { @@ -411,7 +408,4 @@ void ComputePageSizes(cudf::detail::hostdevice_vector& pages, } } -} // namespace gpu -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/delta_binary.cuh b/cpp/src/io/parquet/delta_binary.cuh index 2382e4aafdf..ccc28791071 100644 --- a/cpp/src/io/parquet/delta_binary.cuh +++ b/cpp/src/io/parquet/delta_binary.cuh @@ -18,7 +18,7 @@ #include "page_decode.cuh" -namespace cudf::io::parquet::gpu { +namespace cudf::io::parquet::detail { // DELTA_XXX encoding support // @@ -39,21 +39,15 @@ namespace cudf::io::parquet::gpu { // per mini-block. While encoding, the lowest delta value is subtracted from all the deltas in the // block to ensure that all encoded values are positive. The deltas for each mini-block are bit // packed using the same encoding as the RLE/Bit-Packing Hybrid encoder. -// -// DELTA_BYTE_ARRAY encoding (incremental encoding or front compression), is used for BYTE_ARRAY -// columns. For each element in a sequence of strings, a prefix length from the preceding string -// and a suffix is stored. The prefix lengths are DELTA_BINARY_PACKED encoded. The suffixes are -// encoded with DELTA_LENGTH_BYTE_ARRAY encoding, which is a DELTA_BINARY_PACKED list of suffix -// lengths, followed by the concatenated suffix data. -// TODO: The delta encodings use ULEB128 integers, but for now we're only -// using max 64 bits. Need to see what the performance impact is of using -// __int128_t rather than int64_t. -using uleb128_t = uint64_t; -using zigzag128_t = int64_t; +// The largest mini-block size we can currently support. +constexpr int max_delta_mini_block_size = 64; -// we decode one mini-block at a time. max mini-block size seen is 64. 
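For reference, a small worked example of the DELTA_BINARY_PACKED layout described above (the values and byte breakdown are assumed from the Parquet specification, not taken from this change). Encoding 7, 5, 3, 1, 2, 3, 4, 5 with block_size = 128 and 4 mini-blocks gives:

// header: 0x80 0x01  block size        = 128  (ULEB128)
//         0x04       mini-block count  = 4    (ULEB128)
//         0x08       total value count = 8    (ULEB128)
//         0x0E       first value       = 7    (zigzag ULEB128; zigzag of 7 is 14)
// block:  0x03       min delta         = -2   (deltas are -2,-2,-2,1,1,1,1; zigzag of -2 is 3)
//         0x02 0x00 0x00 0x00          per-mini-block bit widths (only the first holds data;
//                                      unused widths are commonly written as 0)
//         8 bytes    deltas 0,0,0,3,3,3,3 after subtracting the min delta, bit packed at
//                                      2 bits each and padded out to 32 values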
-constexpr int delta_rolling_buf_size = 128; +// The first pass decodes `values_per_mb` values, and then the second pass does another +// batch of size `values_per_mb`. The largest value for values_per_miniblock among the +// major writers seems to be 64, so 2 * 64 should be good. We save the first value separately +// since it is not encoded in the first mini-block. +constexpr int delta_rolling_buf_size = 2 * max_delta_mini_block_size; /** * @brief Read a ULEB128 varint integer @@ -96,7 +90,8 @@ struct delta_binary_decoder { uleb128_t mini_block_count; // usually 4, chosen such that block_size/mini_block_count is a // multiple of 32 uleb128_t value_count; // total values encoded in the block - zigzag128_t last_value; // last value decoded, initialized to first_value from header + zigzag128_t first_value; // initial value, stored in the header + zigzag128_t last_value; // last value decoded uint32_t values_per_mb; // block_size / mini_block_count, must be multiple of 32 uint32_t current_value_idx; // current value index, initialized to 0 at start of block @@ -108,6 +103,13 @@ struct delta_binary_decoder { uleb128_t value[delta_rolling_buf_size]; // circular buffer of delta values + // returns the value stored in the `value` array at index + // `rolling_index(idx)`. If `idx` is `0`, then return `first_value`. + constexpr zigzag128_t value_at(size_type idx) + { + return idx == 0 ? first_value : value[rolling_index(idx)]; + } + // returns the number of values encoded in the block data. when all_values is true, // account for the first value in the header. otherwise just count the values encoded // in the mini-block data. @@ -151,7 +153,8 @@ struct delta_binary_decoder { block_size = get_uleb128(d_start, d_end); mini_block_count = get_uleb128(d_start, d_end); value_count = get_uleb128(d_start, d_end); - last_value = get_zz128(d_start, d_end); + first_value = get_zz128(d_start, d_end); + last_value = first_value; current_value_idx = 0; values_per_mb = block_size / mini_block_count; @@ -185,6 +188,28 @@ struct delta_binary_decoder { } } + // given start/end pointers in the data, find the end of the binary encoded block. when done, + // `this` will be initialized with the correct start and end positions. returns the end, which is + // start of data/next block. should only be called from thread 0. + inline __device__ uint8_t const* find_end_of_block(uint8_t const* start, uint8_t const* end) + { + // read block header + init_binary_block(start, end); + + // test for no encoded values. a single value will be in the block header. + if (value_count <= 1) { return block_start; } + + // read mini-block headers and skip over data + while (current_value_idx < num_encoded_values(false)) { + setup_next_mini_block(false); + } + // calculate the correct end of the block + auto const* const new_end = cur_mb == 0 ? block_start : cur_mb_start; + // re-init block with correct end + init_binary_block(start, new_end); + return new_end; + } + // decode the current mini-batch of deltas, and convert to values. // called by all threads in a warp, currently only one warp supported. 
inline __device__ void calc_mini_block_values(int lane_id) @@ -192,12 +217,9 @@ struct delta_binary_decoder { using cudf::detail::warp_size; if (current_value_idx >= value_count) { return; } - // need to save first value from header on first pass + // need to account for the first value from header on first pass if (current_value_idx == 0) { - if (lane_id == 0) { - current_value_idx++; - value[0] = last_value; - } + if (lane_id == 0) { current_value_idx++; } __syncwarp(); if (current_value_idx >= value_count) { return; } } @@ -291,4 +313,4 @@ struct delta_binary_decoder { } }; -} // namespace cudf::io::parquet::gpu +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/delta_enc.cuh b/cpp/src/io/parquet/delta_enc.cuh new file mode 100644 index 00000000000..b0a7493fcab --- /dev/null +++ b/cpp/src/io/parquet/delta_enc.cuh @@ -0,0 +1,292 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "parquet_gpu.hpp" + +#include +#include + +#include + +namespace cudf::io::parquet::detail { + +namespace delta { + +inline __device__ void put_uleb128(uint8_t*& p, uleb128_t v) +{ + while (v > 0x7f) { + *(p++) = v | 0x80; + v >>= 7; + } + *(p++) = v; +} + +inline __device__ void put_zz128(uint8_t*& p, zigzag128_t v) +{ + zigzag128_t s = (v < 0); + put_uleb128(p, (v ^ -s) * 2 + s); +} + +// A block size of 128, with 4 mini-blocks of 32 values each fits nicely without consuming +// too much shared memory. +// The parquet spec requires block_size to be a multiple of 128, and values_per_mini_block +// to be a multiple of 32. +// TODO: if these are ever made configurable, be sure to fix the page size calculation in +// delta_data_len() (page_enc.cu). +constexpr int block_size = 128; +constexpr int num_mini_blocks = 4; +constexpr int values_per_mini_block = block_size / num_mini_blocks; +constexpr int buffer_size = 2 * block_size; + +// An extra sanity checks to enforce compliance with the parquet specification. +static_assert(block_size % 128 == 0); +static_assert(values_per_mini_block % 32 == 0); + +using block_reduce = cub::BlockReduce; +using warp_reduce = cub::WarpReduce; +using index_scan = cub::BlockScan; + +constexpr int rolling_idx(int index) { return rolling_index(index); } + +// Version of bit packer that can handle up to 64 bits values. +// T is the type to use for processing. if nbits <= 32 use uint32_t, otherwise unsigned long long +// (not uint64_t because of atomicOr's typing). allowing this to be selectable since there's a +// measurable impact to using the wider types. 
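A host-side sketch of the same two primitives (zigzag to fold the sign, then ULEB128 at 7 bits per byte); the helper name is illustrative only:

#include <cstdint>
#include <vector>

// Encode a signed value the way put_zz128/put_uleb128 above do, but into a std::vector.
// The shift/xor form of zigzag used here is equivalent to the (v ^ -s) * 2 + s form above.
std::vector<uint8_t> encode_zz_uleb128(int64_t v)
{
  uint64_t u = (static_cast<uint64_t>(v) << 1) ^ static_cast<uint64_t>(v >> 63);  // -1 -> 1, 1 -> 2, -2 -> 3, ...
  std::vector<uint8_t> out;
  while (u > 0x7f) {
    out.push_back(static_cast<uint8_t>(u) | 0x80);  // set the continuation bit on all but the last byte
    u >>= 7;
  }
  out.push_back(static_cast<uint8_t>(u));
  return out;
}
// e.g. encode_zz_uleb128(-1) yields {0x01} and encode_zz_uleb128(64) yields {0x80, 0x01}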
+template +inline __device__ void bitpack_mini_block( + uint8_t* dst, uleb128_t val, uint32_t count, uint8_t nbits, void* temp_space) +{ + using wide_type = + std::conditional_t, __uint128_t, uint64_t>; + using cudf::detail::warp_size; + scratch_type constexpr mask = sizeof(scratch_type) * 8 - 1; + auto constexpr div = sizeof(scratch_type) * 8; + + auto const lane_id = threadIdx.x % warp_size; + auto const warp_id = threadIdx.x / warp_size; + + auto const scratch = reinterpret_cast(temp_space) + warp_id * warp_size; + + // zero out scratch + scratch[lane_id] = 0; + __syncwarp(); + + // TODO: see if there is any savings using special packing for easy bitwidths (1,2,4,8,16...) + // like what's done for the RLE encoder. + if (nbits == div) { + if (lane_id < count) { + for (int i = 0; i < sizeof(scratch_type); i++) { + dst[lane_id * sizeof(scratch_type) + i] = val & 0xff; + val >>= 8; + } + } + return; + } + + if (lane_id <= count) { + // Shift symbol left by up to mask bits. + wide_type v2 = val; + v2 <<= (lane_id * nbits) & mask; + + // Copy N bit word into two N/2 bit words while following C++ strict aliasing rules. + scratch_type v1[2]; + memcpy(&v1, &v2, sizeof(wide_type)); + + // Atomically write result to scratch. + if (v1[0]) { atomicOr(scratch + ((lane_id * nbits) / div), v1[0]); } + if (v1[1]) { atomicOr(scratch + ((lane_id * nbits) / div) + 1, v1[1]); } + } + __syncwarp(); + + // Copy scratch data to final destination. + auto const available_bytes = util::div_rounding_up_safe(count * nbits, 8U); + auto const scratch_bytes = reinterpret_cast(scratch); + + for (uint32_t i = lane_id; i < available_bytes; i += warp_size) { + dst[i] = scratch_bytes[i]; + } + __syncwarp(); +} + +} // namespace delta + +// Object used to turn a stream of integers into a DELTA_BINARY_PACKED stream. This takes as input +// 128 values with validity at a time, saving them until there are enough values for a block +// to be written. +// T is the input data type (either zigzag128_t or uleb128_t). +template +class delta_binary_packer { + private: + uint8_t* _dst; // sink to dump encoded values to + T* _buffer; // buffer to store values to be encoded + size_type _current_idx; // index of first value in buffer + uint32_t _num_values; // total number of values to encode + size_type _values_in_buffer; // current number of values stored in _buffer + uint8_t _mb_bits[delta::num_mini_blocks]; // bitwidth for each mini-block + + // pointers to shared scratch memory for the warp and block scans/reduces + delta::index_scan::TempStorage* _scan_tmp; + delta::warp_reduce::TempStorage* _warp_tmp; + delta::block_reduce::TempStorage* _block_tmp; + + void* _bitpack_tmp; // pointer to shared scratch memory used in bitpacking + + // Write the delta binary header. Only call from thread 0. + inline __device__ void write_header() + { + delta::put_uleb128(_dst, delta::block_size); + delta::put_uleb128(_dst, delta::num_mini_blocks); + delta::put_uleb128(_dst, _num_values); + delta::put_zz128(_dst, _buffer[0]); + } + + // Write the block header. Only call from thread 0. + inline __device__ void write_block_header(zigzag128_t block_min) + { + delta::put_zz128(_dst, block_min); + memcpy(_dst, _mb_bits, 4); + _dst += 4; + } + + // Signed subtraction with defined wrapping behavior. + inline __device__ zigzag128_t subtract(zigzag128_t a, zigzag128_t b) + { + return static_cast(static_cast(a) - static_cast(b)); + } + + public: + inline __device__ auto num_values() const { return _num_values; } + + // Initialize the object. 
Only call from thread 0. + inline __device__ void init(uint8_t* dest, uint32_t num_values, T* buffer, void* temp_storage) + { + _dst = dest; + _num_values = num_values; + _buffer = buffer; + _scan_tmp = reinterpret_cast(temp_storage); + _warp_tmp = reinterpret_cast(temp_storage); + _block_tmp = reinterpret_cast(temp_storage); + _bitpack_tmp = _buffer + delta::buffer_size; + _current_idx = 0; + _values_in_buffer = 0; + } + + // Each thread calls this to add its current value. + inline __device__ void add_value(T value, bool is_valid) + { + // Figure out the correct position for the given value. + size_type const valid = is_valid; + size_type pos; + size_type num_valid; + delta::index_scan(*_scan_tmp).ExclusiveSum(valid, pos, num_valid); + + if (is_valid) { _buffer[delta::rolling_idx(pos + _current_idx + _values_in_buffer)] = value; } + __syncthreads(); + + if (threadIdx.x == 0) { + _values_in_buffer += num_valid; + // if first pass write header + if (_current_idx == 0) { + write_header(); + _current_idx = 1; + _values_in_buffer -= 1; + } + } + __syncthreads(); + + if (_values_in_buffer >= delta::block_size) { flush(); } + } + + // Called by each thread to flush data to the sink. + inline __device__ uint8_t const* flush() + { + using cudf::detail::warp_size; + __shared__ zigzag128_t block_min; + + int const t = threadIdx.x; + int const warp_id = t / warp_size; + int const lane_id = t % warp_size; + + if (_values_in_buffer <= 0) { return _dst; } + + // Calculate delta for this thread. + size_type const idx = _current_idx + t; + zigzag128_t const delta = idx < _num_values ? subtract(_buffer[delta::rolling_idx(idx)], + _buffer[delta::rolling_idx(idx - 1)]) + : std::numeric_limits::max(); + + // Find min delta for the block. + auto const min_delta = delta::block_reduce(*_block_tmp).Reduce(delta, cub::Min()); + + if (t == 0) { block_min = min_delta; } + __syncthreads(); + + // Compute frame of reference for the block. + uleb128_t const norm_delta = idx < _num_values ? subtract(delta, block_min) : 0; + + // Get max normalized delta for each warp, and use that to determine how many bits to use + // for the bitpacking of this warp. + zigzag128_t const warp_max = + delta::warp_reduce(_warp_tmp[warp_id]).Reduce(norm_delta, cub::Max()); + __syncwarp(); + + if (lane_id == 0) { _mb_bits[warp_id] = sizeof(zigzag128_t) * 8 - __clzll(warp_max); } + __syncthreads(); + + // write block header + if (t == 0) { write_block_header(block_min); } + __syncthreads(); + + // Now each warp encodes its data...can calculate starting offset with _mb_bits. + // NOTE: using a switch here rather than a loop because the compiler produces code that + // uses fewer registers. + int cumulative_bits = 0; + switch (warp_id) { + case 3: cumulative_bits += _mb_bits[2]; [[fallthrough]]; + case 2: cumulative_bits += _mb_bits[1]; [[fallthrough]]; + case 1: cumulative_bits += _mb_bits[0]; + } + uint8_t* const mb_ptr = _dst + cumulative_bits * delta::values_per_mini_block / 8; + + // encoding happens here + auto const warp_idx = _current_idx + warp_id * delta::values_per_mini_block; + if (warp_idx < _num_values) { + auto const num_enc = min(delta::values_per_mini_block, _num_values - warp_idx); + if (_mb_bits[warp_id] > 32) { + delta::bitpack_mini_block( + mb_ptr, norm_delta, num_enc, _mb_bits[warp_id], _bitpack_tmp); + } else { + delta::bitpack_mini_block( + mb_ptr, norm_delta, num_enc, _mb_bits[warp_id], _bitpack_tmp); + } + } + __syncthreads(); + + // Last warp updates global delta ptr. 
+ if (warp_id == delta::num_mini_blocks - 1 && lane_id == 0) { + _dst = mb_ptr + _mb_bits[warp_id] * delta::values_per_mini_block / 8; + _current_idx = min(warp_idx + delta::values_per_mini_block, _num_values); + _values_in_buffer = max(_values_in_buffer - delta::block_size, 0U); + } + __syncthreads(); + + return _dst; + } +}; + +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/error.hpp b/cpp/src/io/parquet/error.hpp new file mode 100644 index 00000000000..92b5eebe9fd --- /dev/null +++ b/cpp/src/io/parquet/error.hpp @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include +#include + +namespace cudf::io::parquet { + +/** + * @brief Wrapper around a `rmm::device_scalar` for use in reporting errors that occur in + * kernel calls. + * + * The `kernel_error` object is created with a `rmm::cuda_stream_view` which is used throughout + * the object's lifetime. + */ +class kernel_error { + private: + rmm::device_scalar _error_code; + + public: + /** + * @brief Construct a new `kernel_error` with an initial value of 0. + * + * Note: the initial value is set asynchronously. + * + * @throws `rmm::bad_alloc` if allocating the device memory for `initial_value` fails. + * @throws `rmm::cuda_error` if copying `initial_value` to device memory fails. + * + * @param CUDA stream to use + */ + kernel_error(rmm::cuda_stream_view stream) : _error_code{0, stream} {} + + /** + * @brief Return a pointer to the device memory for the error + */ + [[nodiscard]] auto data() { return _error_code.data(); } + + /** + * @brief Return the current value of the error + * + * This uses the stream used to create this instance. This does a synchronize on the stream + * this object was instantiated with. + */ + [[nodiscard]] auto value() const { return _error_code.value(_error_code.stream()); } + + /** + * @brief Return a hexadecimal string representation of the current error code + * + * Returned string will have "0x" prepended. 
+ */ + [[nodiscard]] std::string str() const + { + std::stringstream sstream; + sstream << std::hex << value(); + return "0x" + sstream.str(); + } +}; + +} // namespace cudf::io::parquet diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 230834632dd..0c53877f7c7 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -23,10 +23,7 @@ #include #include -namespace cudf { -namespace io { -namespace parquet { -namespace gpu { +namespace cudf::io::parquet::detail { namespace { @@ -452,8 +449,13 @@ __global__ void __launch_bounds__(decode_block_size) int out_thread0; [[maybe_unused]] null_count_back_copier _{s, t}; - if (!setupLocalPageInfo( - s, &pages[page_idx], chunks, min_row, num_rows, mask_filter{KERNEL_MASK_GENERAL}, true)) { + if (!setupLocalPageInfo(s, + &pages[page_idx], + chunks, + min_row, + num_rows, + mask_filter{decode_kernel_mask::GENERAL}, + true)) { return; } @@ -489,6 +491,7 @@ __global__ void __launch_bounds__(decode_block_size) target_pos = min(s->nz_count, src_pos + decode_block_size - out_thread0); if (out_thread0 > 32) { target_pos = min(target_pos, s->dict_pos); } } + // TODO(ets): see if this sync can be removed __syncthreads(); if (t < 32) { // decode repetition and definition levels. @@ -602,14 +605,11 @@ __global__ void __launch_bounds__(decode_block_size) } __syncthreads(); } - if (t == 0 and s->error != 0) { - cuda::atomic_ref ref{*error_code}; - ref.fetch_or(s->error, cuda::std::memory_order_relaxed); - } + if (t == 0 and s->error != 0) { set_error(s->error, error_code); } } struct mask_tform { - __device__ uint32_t operator()(PageInfo const& p) { return p.kernel_mask; } + __device__ uint32_t operator()(PageInfo const& p) { return static_cast(p.kernel_mask); } }; } // anonymous namespace @@ -624,7 +624,7 @@ uint32_t GetAggregatedDecodeKernelMask(cudf::detail::hostdevice_vector } /** - * @copydoc cudf::io::parquet::gpu::DecodePageData + * @copydoc cudf::io::parquet::detail::DecodePageData */ void __host__ DecodePageData(cudf::detail::hostdevice_vector& pages, cudf::detail::hostdevice_vector const& chunks, @@ -648,7 +648,4 @@ void __host__ DecodePageData(cudf::detail::hostdevice_vector& pages, } } -} // namespace gpu -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index cdc29197eb3..4db9bd3904b 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -24,7 +24,7 @@ #include #include -namespace cudf::io::parquet::gpu { +namespace cudf::io::parquet::detail { struct page_state_s { constexpr page_state_s() noexcept {} @@ -753,7 +753,7 @@ __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value // for nested schemas, it's more complicated. This warp will visit 32 incoming values, // however not all of them will necessarily represent a value at this nesting level. so // the validity bit for thread t might actually represent output value t-6. the correct - // position for thread t's bit is cur_value_count. for cuda 11 we could use + // position for thread t's bit is thread_value_count. for cuda 11 we could use // __reduce_or_sync(), but until then we have to do a warp reduce. WarpReduceOr32(is_valid << thread_value_count); @@ -991,8 +991,15 @@ struct all_types_filter { * @brief Functor for setupLocalPageInfo that takes a mask of allowed types. 
*/ struct mask_filter { - int mask; - __device__ inline bool operator()(PageInfo const& page) { return (page.kernel_mask & mask) != 0; } + uint32_t mask; + + __device__ mask_filter(uint32_t m) : mask(m) {} + __device__ mask_filter(decode_kernel_mask m) : mask(static_cast(m)) {} + + __device__ inline bool operator()(PageInfo const& page) + { + return BitAnd(mask, page.kernel_mask) != 0; + } }; /** @@ -1143,7 +1150,8 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, units = cudf::timestamp_ms::period::den; } else if (s->col.converted_type == TIMESTAMP_MICROS) { units = cudf::timestamp_us::period::den; - } else if (s->col.logical_type.TIMESTAMP.unit.isset.NANOS) { + } else if (s->col.logical_type.has_value() and + s->col.logical_type->is_timestamp_nanos()) { units = cudf::timestamp_ns::period::den; } if (units and units != s->col.ts_clock_rate) { @@ -1305,6 +1313,7 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, s->dict_run = 0; } break; case Encoding::DELTA_BINARY_PACKED: + case Encoding::DELTA_BYTE_ARRAY: // nothing to do, just don't error break; default: { @@ -1384,4 +1393,4 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, return true; } -} // namespace cudf::io::parquet::gpu +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_delta_decode.cu b/cpp/src/io/parquet/page_delta_decode.cu index 2b78dead205..bc025c6fc3e 100644 --- a/cpp/src/io/parquet/page_delta_decode.cu +++ b/cpp/src/io/parquet/page_delta_decode.cu @@ -23,10 +23,281 @@ #include #include -namespace cudf::io::parquet::gpu { +namespace cudf::io::parquet::detail { namespace { +constexpr int decode_block_size = 128; + +// DELTA_BYTE_ARRAY encoding (incremental encoding or front compression), is used for BYTE_ARRAY +// columns. For each element in a sequence of strings, a prefix length from the preceding string +// and a suffix is stored. The prefix lengths are DELTA_BINARY_PACKED encoded. The suffixes are +// encoded with DELTA_LENGTH_BYTE_ARRAY encoding, which is a DELTA_BINARY_PACKED list of suffix +// lengths, followed by the concatenated suffix data. +struct delta_byte_array_decoder { + uint8_t const* last_string; // pointer to last decoded string...needed for its prefix + uint8_t const* suffix_char_data; // pointer to the start of character data + + uint8_t* temp_buf; // buffer used when skipping values + uint32_t start_val; // decoded strings up to this index will be dumped to temp_buf + uint32_t last_string_len; // length of the last decoded string + + delta_binary_decoder prefixes; // state of decoder for prefix lengths + delta_binary_decoder suffixes; // state of decoder for suffix lengths + + // initialize the prefixes and suffixes blocks + __device__ void init(uint8_t const* start, uint8_t const* end, uint32_t start_idx, uint8_t* temp) + { + auto const* suffix_start = prefixes.find_end_of_block(start, end); + suffix_char_data = suffixes.find_end_of_block(suffix_start, end); + last_string = nullptr; + temp_buf = temp; + start_val = start_idx; + } + + // kind of like an inclusive scan for strings. takes prefix_len bytes from preceding + // string and prepends to the suffix we've already copied into place. called from + // within loop over values_in_mb, so this only needs to handle a single warp worth of data + // at a time. 
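A host-side sketch of the front coding this decoder reverses (names are illustrative; the device paths below do the same reconstruction warp-parallel): each output string is the first prefix_len bytes of the previous output plus this row's suffix.

#include <string>
#include <vector>

std::vector<std::string> decode_front_coded(std::vector<size_t> const& prefix_lens,
                                            std::vector<std::string> const& suffixes)
{
  std::vector<std::string> out;
  std::string prev;
  for (size_t i = 0; i < suffixes.size(); ++i) {
    // take prefix_lens[i] bytes from the previous string, then append this row's suffix
    std::string cur = prev.substr(0, prefix_lens[i]) + suffixes[i];
    out.push_back(cur);
    prev = std::move(cur);
  }
  return out;
}
// e.g. prefix lengths {0, 5, 4} with suffixes {"apple", "t", "y"} decode to {"apple", "applet", "apply"}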
+ __device__ void string_scan(uint8_t* strings_out, + uint8_t const* last_string, + uint32_t start_idx, + uint32_t end_idx, + uint32_t offset, + uint32_t lane_id) + { + using cudf::detail::warp_size; + + // let p(n) === length(prefix(string_n)) + // + // if p(n-1) > p(n), then string_n can be completed when string_n-2 is completed. likewise if + // p(m) > p(n), then string_n can be completed with string_m-1. however, if p(m) < p(n), then m + // is a "blocker" for string_n; string_n can be completed only after string_m is. + // + // we will calculate the nearest blocking position for each lane, and then fill in string_0. we + // then iterate, finding all lanes that have had their "blocker" filled in and completing them. + // when all lanes are filled in, we return. this will still hit the worst case if p(n-1) < p(n) + // for all n + __shared__ __align__(8) int64_t prefix_lens[warp_size]; + __shared__ __align__(8) uint8_t const* offsets[warp_size]; + + uint32_t const ln_idx = start_idx + lane_id; + uint64_t prefix_len = ln_idx < end_idx ? prefixes.value_at(ln_idx) : 0; + uint8_t* const lane_out = ln_idx < end_idx ? strings_out + offset : nullptr; + + prefix_lens[lane_id] = prefix_len; + offsets[lane_id] = lane_out; + + // if all prefix_len's are zero, then there's nothing to do + if (__all_sync(0xffff'ffff, prefix_len == 0)) { return; } + + // find a neighbor to the left that has a prefix length less than this lane. once that + // neighbor is complete, this lane can be completed. + int blocker = lane_id - 1; + while (blocker > 0 && prefix_lens[blocker] != 0 && prefix_len <= prefix_lens[blocker]) { + blocker--; + } + + // fill in lane 0 (if necessary) + if (lane_id == 0 && prefix_len > 0) { + memcpy(lane_out, last_string, prefix_len); + prefix_lens[0] = prefix_len = 0; + } + __syncwarp(); + + // now fill in blockers until done + for (uint32_t i = 1; i < warp_size && i + start_idx < end_idx; i++) { + if (prefix_len != 0 && prefix_lens[blocker] == 0 && lane_out != nullptr) { + memcpy(lane_out, offsets[blocker], prefix_len); + prefix_lens[lane_id] = prefix_len = 0; + } + + // check for finished + if (__all_sync(0xffff'ffff, prefix_len == 0)) { return; } + } + } + + // calculate a mini-batch of string values, writing the results to + // `strings_out`. starting at global index `start_idx` and decoding + // up to `num_values` strings. + // called by all threads in a warp. used for strings <= 32 chars. + // returns number of bytes written + __device__ size_t calculate_string_values(uint8_t* strings_out, + uint32_t start_idx, + uint32_t num_values, + uint32_t lane_id) + { + using cudf::detail::warp_size; + using WarpScan = cub::WarpScan; + __shared__ WarpScan::TempStorage scan_temp; + + if (start_idx >= suffixes.value_count) { return 0; } + auto end_idx = start_idx + min(suffixes.values_per_mb, num_values); + end_idx = min(end_idx, static_cast(suffixes.value_count)); + + auto p_strings_out = strings_out; + auto p_temp_out = temp_buf; + + auto copy_batch = [&](uint8_t* out, uint32_t idx, uint32_t end) { + uint32_t const ln_idx = idx + lane_id; + + // calculate offsets into suffix data + uint64_t const suffix_len = ln_idx < end ? suffixes.value_at(ln_idx) : 0; + uint64_t suffix_off = 0; + WarpScan(scan_temp).ExclusiveSum(suffix_len, suffix_off); + + // calculate offsets into string data + uint64_t const prefix_len = ln_idx < end ? 
prefixes.value_at(ln_idx) : 0; + uint64_t const string_len = prefix_len + suffix_len; + + // get offset into output for each lane + uint64_t string_off, warp_total; + WarpScan(scan_temp).ExclusiveSum(string_len, string_off, warp_total); + auto const so_ptr = out + string_off; + + // copy suffixes into string data + if (ln_idx < end) { memcpy(so_ptr + prefix_len, suffix_char_data + suffix_off, suffix_len); } + __syncwarp(); + + // copy prefixes into string data. + string_scan(out, last_string, idx, end, string_off, lane_id); + + // save the position of the last computed string. this will be used in + // the next iteration to reconstruct the string in lane 0. + if (ln_idx == end - 1 || (ln_idx < end && lane_id == 31)) { + // set last_string to this lane's string + last_string = out + string_off; + last_string_len = string_len; + // and consume used suffix_char_data + suffix_char_data += suffix_off + suffix_len; + } + + return warp_total; + }; + + uint64_t string_total = 0; + for (int idx = start_idx; idx < end_idx; idx += warp_size) { + auto const n_in_batch = min(warp_size, end_idx - idx); + // account for the case where start_val occurs in the middle of this batch + if (idx < start_val && idx + n_in_batch > start_val) { + // dump idx...start_val into temp_buf + copy_batch(p_temp_out, idx, start_val); + __syncwarp(); + + // start_val...idx + n_in_batch into strings_out + auto nbytes = copy_batch(p_strings_out, start_val, idx + n_in_batch); + p_strings_out += nbytes; + string_total = nbytes; + } else { + if (idx < start_val) { + p_temp_out += copy_batch(p_temp_out, idx, end_idx); + } else { + auto nbytes = copy_batch(p_strings_out, idx, end_idx); + p_strings_out += nbytes; + string_total += nbytes; + } + } + __syncwarp(); + } + + return string_total; + } + + // character parallel version of CalculateStringValues(). This is faster for strings longer than + // 32 chars. + __device__ size_t calculate_string_values_cp(uint8_t* strings_out, + uint32_t start_idx, + uint32_t num_values, + uint32_t lane_id) + { + using cudf::detail::warp_size; + __shared__ __align__(8) uint8_t* so_ptr; + + if (start_idx >= suffixes.value_count) { return; } + auto end_idx = start_idx + min(suffixes.values_per_mb, num_values); + end_idx = min(end_idx, static_cast(suffixes.value_count)); + + if (lane_id == 0) { so_ptr = start_idx < start_val ? temp_buf : strings_out; } + __syncwarp(); + + uint64_t string_total = 0; + for (int idx = start_idx; idx < end_idx; idx++) { + uint64_t const suffix_len = suffixes.value_at(idx); + uint64_t const prefix_len = prefixes.value_at(idx); + uint64_t const string_len = prefix_len + suffix_len; + + // copy prefix and suffix data into current strings_out position + // for longer strings use a 4-byte version stolen from gather_chars_fn_string_parallel. + if (string_len > 64) { + if (prefix_len > 0) { wideStrcpy(so_ptr, last_string, prefix_len, lane_id); } + if (suffix_len > 0) { + wideStrcpy(so_ptr + prefix_len, suffix_char_data, suffix_len, lane_id); + } + } else { + for (int i = lane_id; i < string_len; i += warp_size) { + so_ptr[i] = i < prefix_len ? 
last_string[i] : suffix_char_data[i - prefix_len]; + } + } + __syncwarp(); + + if (idx >= start_val) { string_total += string_len; } + + if (lane_id == 0) { + last_string = so_ptr; + last_string_len = string_len; + suffix_char_data += suffix_len; + if (idx == start_val - 1) { + so_ptr = strings_out; + } else { + so_ptr += string_len; + } + } + __syncwarp(); + } + + return string_total; + } + + // dump strings before start_val to temp buf + __device__ void skip(bool use_char_ll) + { + using cudf::detail::warp_size; + int const t = threadIdx.x; + int const lane_id = t % warp_size; + + // is this even necessary? return if asking to skip the whole block. + if (start_val >= prefixes.num_encoded_values(true)) { return; } + + // prefixes and suffixes will have the same parameters (it's checked earlier) + auto const batch_size = prefixes.values_per_mb; + + uint32_t skip_pos = 0; + while (prefixes.current_value_idx < start_val) { + // warp 0 gets prefixes and warp 1 gets suffixes + auto* const db = t < 32 ? &prefixes : &suffixes; + + // this will potentially decode past start_val, but that's ok + if (t < 64) { db->decode_batch(); } + __syncthreads(); + + // warp 0 decodes the batch. + if (t < 32) { + auto const num_to_decode = min(batch_size, start_val - skip_pos); + auto const bytes_written = + use_char_ll ? calculate_string_values_cp(temp_buf, skip_pos, num_to_decode, lane_id) + : calculate_string_values(temp_buf, skip_pos, num_to_decode, lane_id); + // store last_string someplace safe in temp buffer + if (t == 0) { + memcpy(temp_buf + bytes_written, last_string, last_string_len); + last_string = temp_buf + bytes_written; + } + } + skip_pos += prefixes.values_per_mb; + __syncthreads(); + } + } +}; + // Decode page data that is DELTA_BINARY_PACKED encoded. This encoding is // only used for int32 and int64 physical types (and appears to only be used // with V2 page headers; see https://www.mail-archive.com/dev@parquet.apache.org/msg11826.html). @@ -52,13 +323,9 @@ __global__ void __launch_bounds__(96) auto* const db = &db_state; [[maybe_unused]] null_count_back_copier _{s, t}; - if (!setupLocalPageInfo(s, - &pages[page_idx], - chunks, - min_row, - num_rows, - mask_filter{KERNEL_MASK_DELTA_BINARY}, - true)) { + auto const mask = decode_kernel_mask::DELTA_BINARY; + if (!setupLocalPageInfo( + s, &pages[page_idx], chunks, min_row, num_rows, mask_filter{mask}, true)) { return; } @@ -78,6 +345,10 @@ __global__ void __launch_bounds__(96) __syncthreads(); auto const batch_size = db->values_per_mb; + if (batch_size > max_delta_mini_block_size) { + set_error(static_cast(decode_error::DELTA_PARAMS_UNSUPPORTED), error_code); + return; + } // if skipped_leaf_values is non-zero, then we need to decode up to the first mini-block // that has a value we need. @@ -93,6 +364,7 @@ __global__ void __launch_bounds__(96) } else { // warp2 target_pos = min(s->nz_count, src_pos + batch_size); } + // TODO(ets): see if this sync can be removed __syncthreads(); // warp0 will decode the rep/def levels, warp1 will unpack a mini-batch of deltas. 
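The string_scan() helper above completes each string only after its nearest left neighbor with a smaller prefix length (its "blocker") is complete, since the shared bytes ultimately originate there. A minimal serial sketch of that blocker search, with illustrative names; the kernel additionally treats already-completed lanes (prefix length zero) as non-blocking:

```cpp
#include <cstddef>
#include <vector>

// For each string i, find the nearest j < i whose prefix length is smaller
// than prefix_len[i]; string i cannot be finished before string j, because
// the bytes it shares with its predecessor ultimately come from j's output.
std::vector<std::size_t> find_blockers(std::vector<std::size_t> const& prefix_len)
{
  std::vector<std::size_t> blocker(prefix_len.size(), 0);
  for (std::size_t i = 1; i < prefix_len.size(); ++i) {
    std::size_t j = i - 1;
    while (j > 0 && prefix_len[i] <= prefix_len[j]) { --j; }
    blocker[i] = j;
  }
  return blocker;
}
```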
@@ -125,23 +397,12 @@ __global__ void __launch_bounds__(96) // place value for this thread if (dst_pos >= 0 && sp < target_pos) { void* const dst = nesting_info_base[leaf_level_index].data_out + dst_pos * s->dtype_len; + auto const val = db->value_at(sp + skipped_leaf_values); switch (s->dtype_len) { - case 1: - *static_cast(dst) = - db->value[rolling_index(sp + skipped_leaf_values)]; - break; - case 2: - *static_cast(dst) = - db->value[rolling_index(sp + skipped_leaf_values)]; - break; - case 4: - *static_cast(dst) = - db->value[rolling_index(sp + skipped_leaf_values)]; - break; - case 8: - *static_cast(dst) = - db->value[rolling_index(sp + skipped_leaf_values)]; - break; + case 1: *static_cast(dst) = val; break; + case 2: *static_cast(dst) = val; break; + case 4: *static_cast(dst) = val; break; + case 8: *static_cast(dst) = val; break; } } } @@ -151,6 +412,161 @@ __global__ void __launch_bounds__(96) __syncthreads(); } + if (t == 0 and s->error != 0) { set_error(s->error, error_code); } +} + +// Decode page data that is DELTA_BYTE_ARRAY packed. This encoding consists of a DELTA_BINARY_PACKED +// array of prefix lengths, followed by a DELTA_BINARY_PACKED array of suffix lengths, followed by +// the suffixes (technically the suffixes are DELTA_LENGTH_BYTE_ARRAY encoded). The latter two can +// be used to create an offsets array for the suffix data, but then this needs to be combined with +// the prefix lengths to do the final decode for each value. Because the lengths of the prefixes and +// suffixes are not encoded in the header, we're going to have to first do a quick pass through them +// to find the start/end of each structure. +template +__global__ void __launch_bounds__(decode_block_size) + gpuDecodeDeltaByteArray(PageInfo* pages, + device_span chunks, + size_t min_row, + size_t num_rows, + int32_t* error_code) +{ + using cudf::detail::warp_size; + __shared__ __align__(16) delta_byte_array_decoder db_state; + __shared__ __align__(16) page_state_s state_g; + __shared__ __align__(16) page_state_buffers_s state_buffers; + + page_state_s* const s = &state_g; + auto* const sb = &state_buffers; + int const page_idx = blockIdx.x; + int const t = threadIdx.x; + int const lane_id = t % warp_size; + auto* const prefix_db = &db_state.prefixes; + auto* const suffix_db = &db_state.suffixes; + auto* const dba = &db_state; + [[maybe_unused]] null_count_back_copier _{s, t}; + + auto const mask = decode_kernel_mask::DELTA_BYTE_ARRAY; + if (!setupLocalPageInfo( + s, &pages[page_idx], chunks, min_row, num_rows, mask_filter{mask}, true)) { + return; + } + + bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; + + // choose a character parallel string copy when the average string is longer than a warp + auto const use_char_ll = (s->page.str_bytes / s->page.num_valids) > cudf::detail::warp_size; + + // copying logic from gpuDecodePageData. + PageNestingDecodeInfo const* nesting_info_base = s->nesting_info; + + __shared__ level_t rep[delta_rolling_buf_size]; // circular buffer of repetition level values + __shared__ level_t def[delta_rolling_buf_size]; // circular buffer of definition level values + + // skipped_leaf_values will always be 0 for flat hierarchies. 
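The kernel comment above describes the DELTA_BYTE_ARRAY layout: prefix lengths, then suffix lengths (both DELTA_BINARY_PACKED), then the concatenated suffix bytes. A small self-contained example of the logical content and its reconstruction (the delta bit-packing itself is omitted):

```cpp
#include <cassert>
#include <cstddef>
#include <string>
#include <vector>

int main()
{
  // logical content of a page holding {"apple", "applesauce", "apply"}:
  std::vector<std::size_t> const prefix_len{0, 5, 4};            // bytes shared with the previous value
  std::vector<std::string> const suffix{"apple", "sauce", "y"};  // bytes actually stored

  // reconstruction: each value reuses prefix_len[i] bytes of its predecessor
  std::string prev;
  std::vector<std::string> out;
  for (std::size_t i = 0; i < suffix.size(); ++i) {
    prev = prev.substr(0, prefix_len[i]) + suffix[i];
    out.push_back(prev);
  }
  assert(out[0] == "apple" && out[1] == "applesauce" && out[2] == "apply");
  return 0;
}
```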
+ uint32_t const skipped_leaf_values = s->page.skipped_leaf_values; + + if (t == 0) { + // initialize the prefixes and suffixes blocks + dba->init(s->data_start, s->data_end, s->page.start_val, s->page.temp_string_buf); + } + __syncthreads(); + + // assert that prefix and suffix have same mini-block size + if (prefix_db->values_per_mb != suffix_db->values_per_mb or + prefix_db->block_size != suffix_db->block_size or + prefix_db->value_count != suffix_db->value_count) { + set_error(static_cast(decode_error::DELTA_PARAM_MISMATCH), error_code); + return; + } + + // pointer to location to output final strings + int const leaf_level_index = s->col.max_nesting_depth - 1; + auto strings_data = nesting_info_base[leaf_level_index].string_out; + + auto const batch_size = prefix_db->values_per_mb; + if (batch_size > max_delta_mini_block_size) { + set_error(static_cast(decode_error::DELTA_PARAMS_UNSUPPORTED), error_code); + return; + } + + // if this is a bounds page and nested, then we need to skip up front. non-nested will work + // its way through the page. + int string_pos = has_repetition ? s->page.start_val : 0; + auto const is_bounds_pg = is_bounds_page(s, min_row, num_rows, has_repetition); + if (is_bounds_pg && string_pos > 0) { dba->skip(use_char_ll); } + + while (!s->error && (s->input_value_count < s->num_input_values || s->src_pos < s->nz_count)) { + uint32_t target_pos; + uint32_t const src_pos = s->src_pos; + + if (t < 3 * warp_size) { // warp 0..2 + target_pos = min(src_pos + 2 * batch_size, s->nz_count + s->first_row + batch_size); + } else { // warp 3 + target_pos = min(s->nz_count, src_pos + batch_size); + } + // TODO(ets): see if this sync can be removed + __syncthreads(); + + // warp0 will decode the rep/def levels, warp1 will unpack a mini-batch of prefixes, warp 2 will + // unpack a mini-batch of suffixes. warp3 waits one cycle for warps 0-2 to produce a batch, and + // then stuffs values into the proper location in the output. + if (t < warp_size) { + // decode repetition and definition levels. + // - update validity vectors + // - updates offsets (for nested columns) + // - produces non-NULL value indices in s->nz_idx for subsequent decoding + gpuDecodeLevels(s, sb, target_pos, rep, def, t); + + } else if (t < 2 * warp_size) { + // warp 1 + prefix_db->decode_batch(); + + } else if (t < 3 * warp_size) { + // warp 2 + suffix_db->decode_batch(); + + } else if (src_pos < target_pos) { + // warp 3 + + int const nproc = min(batch_size, s->page.end_val - string_pos); + strings_data += use_char_ll + ? dba->calculate_string_values_cp(strings_data, string_pos, nproc, lane_id) + : dba->calculate_string_values(strings_data, string_pos, nproc, lane_id); + string_pos += nproc; + + // process the mini-block in batches of 32 + for (uint32_t sp = src_pos + lane_id; sp < src_pos + batch_size; sp += 32) { + // the position in the output column/buffer + int dst_pos = sb->nz_idx[rolling_index(sp)]; + + // handle skip_rows here. flat hierarchies can just skip up to first_row. 
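The non-null indices consumed above come out of a circular buffer addressed with rolling_index<N>(). Assuming it is the usual wrap-into-a-fixed-size-ring helper defined elsewhere in these sources, a sketch of the indexing:

```cpp
// The circular buffers above are addressed as buf[rolling_index<N>(pos)], i.e.
// pos wrapped into an N-entry ring (N is a compile-time constant, typically a
// power of two so the modulo reduces to a mask).
template <int buf_size>
constexpr int rolling_index(int pos)
{
  return pos % buf_size;
}

static_assert(rolling_index<512>(0) == 0);
static_assert(rolling_index<512>(513) == 1);
```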
+ if (!has_repetition) { dst_pos -= s->first_row; } + + if (dst_pos >= 0 && sp < target_pos) { + auto const offptr = + reinterpret_cast(nesting_info_base[leaf_level_index].data_out) + dst_pos; + auto const src_idx = sp + skipped_leaf_values; + *offptr = prefix_db->value_at(src_idx) + suffix_db->value_at(src_idx); + } + __syncwarp(); + } + + if (lane_id == 0) { s->src_pos = src_pos + batch_size; } + } + + __syncthreads(); + } + + // now turn array of lengths into offsets + int value_count = nesting_info_base[leaf_level_index].value_count; + + // if no repetition we haven't calculated start/end bounds and instead just skipped + // values until we reach first_row. account for that here. + if (!has_repetition) { value_count -= s->first_row; } + + auto const offptr = reinterpret_cast(nesting_info_base[leaf_level_index].data_out); + block_excl_sum(offptr, value_count, s->page.str_offset); + if (t == 0 and s->error != 0) { cuda::atomic_ref ref{*error_code}; ref.fetch_or(s->error, cuda::std::memory_order_relaxed); @@ -160,7 +576,7 @@ __global__ void __launch_bounds__(96) } // anonymous namespace /** - * @copydoc cudf::io::parquet::gpu::DecodeDeltaBinary + * @copydoc cudf::io::parquet::detail::DecodeDeltaBinary */ void __host__ DecodeDeltaBinary(cudf::detail::hostdevice_vector& pages, cudf::detail::hostdevice_vector const& chunks, @@ -184,4 +600,29 @@ void __host__ DecodeDeltaBinary(cudf::detail::hostdevice_vector& pages } } -} // namespace cudf::io::parquet::gpu +/** + * @copydoc cudf::io::parquet::gpu::DecodeDeltaByteArray + */ +void __host__ DecodeDeltaByteArray(cudf::detail::hostdevice_vector& pages, + cudf::detail::hostdevice_vector const& chunks, + size_t num_rows, + size_t min_row, + int level_type_size, + int32_t* error_code, + rmm::cuda_stream_view stream) +{ + CUDF_EXPECTS(pages.size() > 0, "There is no page to decode"); + + dim3 const dim_block(decode_block_size, 1); + dim3 const dim_grid(pages.size(), 1); // 1 threadblock per page + + if (level_type_size == 1) { + gpuDecodeDeltaByteArray<<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); + } else { + gpuDecodeDeltaByteArray<<>>( + pages.device_ptr(), chunks, min_row, num_rows, error_code); + } +} + +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index fe0dbb85124..2b7980c93e9 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -14,6 +14,7 @@ * limitations under the License. 
*/ +#include "delta_enc.cuh" #include "parquet_gpu.cuh" #include @@ -21,6 +22,7 @@ #include #include #include +#include #include #include @@ -41,16 +43,19 @@ #include #include -namespace cudf { -namespace io { -namespace parquet { -namespace gpu { +#include + +namespace cudf::io::parquet::detail { namespace { using ::cudf::detail::device_2dspan; -constexpr uint32_t rle_buffer_size = (1 << 9); +constexpr int encode_block_size = 128; +constexpr int rle_buffer_size = 2 * encode_block_size; +constexpr int num_encode_warps = encode_block_size / cudf::detail::warp_size; + +constexpr int rolling_idx(int pos) { return rolling_index(pos); } // do not truncate statistics constexpr int32_t NO_TRUNC_STATS = 0; @@ -72,6 +77,7 @@ struct frag_init_state_s { PageFragment frag; }; +template struct page_enc_state_s { uint8_t* cur; //!< current output ptr uint8_t* rle_out; //!< current RLE write ptr @@ -84,14 +90,15 @@ struct page_enc_state_s { uint32_t rle_rpt_count; uint32_t page_start_val; uint32_t chunk_start_val; - volatile uint32_t rpt_map[4]; - volatile uint32_t scratch_red[32]; + volatile uint32_t rpt_map[num_encode_warps]; EncPage page; EncColumnChunk ck; parquet_column_device_view col; - uint32_t vals[rle_buffer_size]; + uint32_t vals[rle_buf_size]; }; +using rle_page_enc_state_s = page_enc_state_s; + /** * @brief Returns the size of the type in the Parquet file. */ @@ -208,6 +215,12 @@ void __device__ calculate_frag_size(frag_init_state_s* const s, int t) } } +/** + * @brief Determine the correct page encoding for the given page parameters. + * + * This is only used by the plain and dictionary encoders. Delta encoders will set the page + * encoding directly. + */ Encoding __device__ determine_encoding(PageType page_type, Type physical_type, bool use_dictionary, @@ -219,7 +232,6 @@ Encoding __device__ determine_encoding(PageType page_type, switch (page_type) { case PageType::DATA_PAGE: return use_dictionary ? Encoding::PLAIN_DICTIONARY : Encoding::PLAIN; case PageType::DATA_PAGE_V2: - // TODO need to work in delta encodings here when they're added return physical_type == BOOLEAN ? Encoding::RLE : use_dictionary ? Encoding::RLE_DICTIONARY : Encoding::PLAIN; @@ -239,6 +251,50 @@ struct BitwiseOr { } }; +// I is the column type from the input table +template +__device__ uint8_t const* delta_encode(page_enc_state_s<0>* s, + uint32_t valid_count, + uint64_t* buffer, + void* temp_space) +{ + using output_type = std::conditional_t, zigzag128_t, uleb128_t>; + __shared__ delta_binary_packer packer; + + auto const t = threadIdx.x; + if (t == 0) { + packer.init(s->cur, valid_count, reinterpret_cast(buffer), temp_space); + } + __syncthreads(); + + // TODO(ets): in the plain encoder the scaling is a little different for INT32 than INT64. + // might need to modify this if there's a big performance hit in the 32-bit case. + int32_t const scale = s->col.ts_scale == 0 ? 1 : s->col.ts_scale; + for (uint32_t cur_val_idx = 0; cur_val_idx < s->page.num_leaf_values;) { + uint32_t const nvals = min(s->page.num_leaf_values - cur_val_idx, delta::block_size); + + size_type const val_idx_in_block = cur_val_idx + t; + size_type const val_idx = s->page_start_val + val_idx_in_block; + + bool const is_valid = + (val_idx < s->col.leaf_column->size() && val_idx_in_block < s->page.num_leaf_values) + ? s->col.leaf_column->is_valid(val_idx) + : false; + + cur_val_idx += nvals; + + output_type v = is_valid ? 
s->col.leaf_column->element(val_idx) : 0; + if (scale < 0) { + v /= -scale; + } else { + v *= scale; + } + packer.add_value(v, is_valid); + } + + return packer.flush(); +} + } // anonymous namespace // blockDim {512,1,1} @@ -326,10 +382,40 @@ __global__ void __launch_bounds__(128) } } +__device__ size_t delta_data_len(Type physical_type, cudf::type_id type_id, uint32_t num_values) +{ + auto const dtype_len_out = physical_type_len(physical_type, type_id); + auto const dtype_len = [&]() -> uint32_t { + if (physical_type == INT32) { return int32_logical_len(type_id); } + if (physical_type == INT96) { return sizeof(int64_t); } + return dtype_len_out; + }(); + + auto const vals_per_block = delta::block_size; + size_t const num_blocks = util::div_rounding_up_unsafe(num_values, vals_per_block); + // need max dtype_len + 1 bytes for min_delta (because we only encode 7 bits per byte) + // one byte per mini block for the bitwidth + auto const mini_block_header_size = dtype_len + 1 + delta::num_mini_blocks; + // each encoded value can be at most sizeof(type) * 8 + 1 bits + auto const max_bits = dtype_len * 8 + 1; + // each data block will then be max_bits * values per block. vals_per_block is guaranteed to be + // divisible by 128 (via static assert on delta::block_size), but do safe division anyway. + auto const bytes_per_block = cudf::util::div_rounding_up_unsafe(max_bits * vals_per_block, 8); + auto const block_size = mini_block_header_size + bytes_per_block; + + // delta header is 2 bytes for the block_size, 1 byte for number of mini-blocks, + // max 5 bytes for number of values, and max dtype_len + 1 for first value. + // TODO: if we ever allow configurable block sizes then this calculation will need to be + // modified. + auto const header_size = 2 + 1 + 5 + dtype_len + 1; + + return header_size + num_blocks * block_size; +} + // blockDim {128,1,1} __global__ void __launch_bounds__(128) gpuInitPages(device_2dspan chunks, - device_span pages, + device_span pages, device_span page_sizes, device_span comp_page_sizes, device_span col_desc, @@ -357,6 +443,14 @@ __global__ void __launch_bounds__(128) page_g = {}; } __syncthreads(); + + // if writing delta encoded values, we're going to need to know the data length to get a guess + // at the worst case number of bytes needed to encode. 
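delta_data_len() above is a pessimistic bound: every value is assumed to need sizeof(T)*8 + 1 bits, plus per-block and page headers. A worked example of the arithmetic, assuming delta::block_size = 128 and delta::num_mini_blocks = 4 (those constants live in delta_enc.cuh and are not shown in this diff):

```cpp
#include <cstdio>

// Worst-case DELTA_BINARY_PACKED size for 10'000 INT64 values, following
// delta_data_len() above; block parameters are assumed values.
int main()
{
  int const dtype_len       = 8;    // INT64
  int const vals_per_block  = 128;  // assumed delta::block_size
  int const num_mini_blocks = 4;    // assumed delta::num_mini_blocks
  long const num_values     = 10'000;

  long const num_blocks        = (num_values + vals_per_block - 1) / vals_per_block;  // 79
  int const mini_block_header  = dtype_len + 1 + num_mini_blocks;                     // 13
  int const max_bits           = dtype_len * 8 + 1;                                   // 65
  int const bytes_per_block    = (max_bits * vals_per_block + 7) / 8;                 // 1040
  int const block_bytes        = mini_block_header + bytes_per_block;                 // 1053
  int const header_size        = 2 + 1 + 5 + dtype_len + 1;                           // 17

  std::printf("upper bound: %ld bytes\n", header_size + num_blocks * block_bytes);
  return 0;
}
```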
+ auto const physical_type = col_g.physical_type; + auto const type_id = col_g.leaf_column->type().id(); + auto const is_use_delta = + write_v2_headers && !ck_g.use_dictionary && (physical_type == INT32 || physical_type == INT64); + if (t < 32) { uint32_t fragments_in_chunk = 0; uint32_t rows_in_page = 0; @@ -406,9 +500,12 @@ __global__ void __launch_bounds__(128) } __syncwarp(); if (t == 0) { - if (not pages.empty()) pages[ck_g.first_page] = page_g; - if (not page_sizes.empty()) page_sizes[ck_g.first_page] = page_g.max_data_size; - if (page_grstats) page_grstats[ck_g.first_page] = pagestats_g; + if (not pages.empty()) { + page_g.kernel_mask = encode_kernel_mask::PLAIN; + pages[ck_g.first_page] = page_g; + } + if (not page_sizes.empty()) { page_sizes[ck_g.first_page] = page_g.max_data_size; } + if (page_grstats) { page_grstats[ck_g.first_page] = pagestats_g; } } num_pages = 1; } @@ -508,7 +605,12 @@ __global__ void __launch_bounds__(128) page_g.num_values = values_in_page; auto const def_level_size = max_RLE_page_size(col_g.num_def_level_bits(), values_in_page); auto const rep_level_size = max_RLE_page_size(col_g.num_rep_level_bits(), values_in_page); - auto const max_data_size = page_size + def_level_size + rep_level_size + rle_pad; + // get a different bound if using delta encoding + if (is_use_delta) { + page_size = + max(page_size, delta_data_len(physical_type, type_id, page_g.num_leaf_values)); + } + auto const max_data_size = page_size + def_level_size + rep_level_size + rle_pad; // page size must fit in 32-bit signed integer if (max_data_size > std::numeric_limits::max()) { CUDF_UNREACHABLE("page size exceeds maximum for i32"); @@ -528,7 +630,16 @@ __global__ void __launch_bounds__(128) } __syncwarp(); if (t == 0) { - if (not pages.empty()) { pages[ck_g.first_page + num_pages] = page_g; } + if (not pages.empty()) { + if (is_use_delta) { + page_g.kernel_mask = encode_kernel_mask::DELTA_BINARY; + } else if (ck_g.use_dictionary || physical_type == BOOLEAN) { + page_g.kernel_mask = encode_kernel_mask::DICTIONARY; + } else { + page_g.kernel_mask = encode_kernel_mask::PLAIN; + } + pages[ck_g.first_page + num_pages] = page_g; + } if (not page_sizes.empty()) { page_sizes[ck_g.first_page + num_pages] = page_g.max_data_size; } @@ -792,8 +903,12 @@ inline __device__ void PackLiterals( * @param[in] t thread id (0..127) */ static __device__ void RleEncode( - page_enc_state_s* s, uint32_t numvals, uint32_t nbits, uint32_t flush, uint32_t t) + rle_page_enc_state_s* s, uint32_t numvals, uint32_t nbits, uint32_t flush, uint32_t t) { + using cudf::detail::warp_size; + auto const lane_id = t % warp_size; + auto const warp_id = t / warp_size; + uint32_t rle_pos = s->rle_pos; uint32_t rle_run = s->rle_run; @@ -801,20 +916,20 @@ static __device__ void RleEncode( uint32_t pos = rle_pos + t; if (rle_run > 0 && !(rle_run & 1)) { // Currently in a long repeat run - uint32_t mask = ballot(pos < numvals && s->vals[pos & (rle_buffer_size - 1)] == s->run_val); + uint32_t mask = ballot(pos < numvals && s->vals[rolling_idx(pos)] == s->run_val); uint32_t rle_rpt_count, max_rpt_count; - if (!(t & 0x1f)) { s->rpt_map[t >> 5] = mask; } + if (lane_id == 0) { s->rpt_map[warp_id] = mask; } __syncthreads(); - if (t < 32) { + if (t < warp_size) { uint32_t c32 = ballot(t >= 4 || s->rpt_map[t] != 0xffff'ffffu); - if (!t) { + if (t == 0) { uint32_t last_idx = __ffs(c32) - 1; s->rle_rpt_count = - last_idx * 32 + ((last_idx < 4) ? __ffs(~s->rpt_map[last_idx]) - 1 : 0); + last_idx * warp_size + ((last_idx < 4) ? 
__ffs(~s->rpt_map[last_idx]) - 1 : 0); } } __syncthreads(); - max_rpt_count = min(numvals - rle_pos, 128); + max_rpt_count = min(numvals - rle_pos, encode_block_size); rle_rpt_count = s->rle_rpt_count; rle_run += rle_rpt_count << 1; rle_pos += rle_rpt_count; @@ -831,17 +946,17 @@ static __device__ void RleEncode( } } else { // New run or in a literal run - uint32_t v0 = s->vals[pos & (rle_buffer_size - 1)]; - uint32_t v1 = s->vals[(pos + 1) & (rle_buffer_size - 1)]; + uint32_t v0 = s->vals[rolling_idx(pos)]; + uint32_t v1 = s->vals[rolling_idx(pos + 1)]; uint32_t mask = ballot(pos + 1 < numvals && v0 == v1); - uint32_t maxvals = min(numvals - rle_pos, 128); + uint32_t maxvals = min(numvals - rle_pos, encode_block_size); uint32_t rle_lit_count, rle_rpt_count; - if (!(t & 0x1f)) { s->rpt_map[t >> 5] = mask; } + if (lane_id == 0) { s->rpt_map[warp_id] = mask; } __syncthreads(); - if (t < 32) { + if (t < warp_size) { // Repeat run can only start on a multiple of 8 values - uint32_t idx8 = (t * 8) >> 5; - uint32_t pos8 = (t * 8) & 0x1f; + uint32_t idx8 = (t * 8) / warp_size; + uint32_t pos8 = (t * 8) % warp_size; uint32_t m0 = (idx8 < 4) ? s->rpt_map[idx8] : 0; uint32_t m1 = (idx8 < 3) ? s->rpt_map[idx8 + 1] : 0; uint32_t needed_mask = kRleRunMask[nbits - 1]; @@ -850,8 +965,8 @@ static __device__ void RleEncode( uint32_t rle_run_start = (mask != 0) ? min((__ffs(mask) - 1) * 8, maxvals) : maxvals; uint32_t rpt_len = 0; if (rle_run_start < maxvals) { - uint32_t idx_cur = rle_run_start >> 5; - uint32_t idx_ofs = rle_run_start & 0x1f; + uint32_t idx_cur = rle_run_start / warp_size; + uint32_t idx_ofs = rle_run_start % warp_size; while (idx_cur < 4) { m0 = (idx_cur < 4) ? s->rpt_map[idx_cur] : 0; m1 = (idx_cur < 3) ? s->rpt_map[idx_cur + 1] : 0; @@ -860,7 +975,7 @@ static __device__ void RleEncode( rpt_len += __ffs(mask) - 1; break; } - rpt_len += 32; + rpt_len += warp_size; idx_cur++; } } @@ -931,7 +1046,7 @@ static __device__ void RleEncode( * @param[in] flush nonzero if last batch in block * @param[in] t thread id (0..127) */ -static __device__ void PlainBoolEncode(page_enc_state_s* s, +static __device__ void PlainBoolEncode(rle_page_enc_state_s* s, uint32_t numvals, uint32_t flush, uint32_t t) @@ -941,7 +1056,7 @@ static __device__ void PlainBoolEncode(page_enc_state_s* s, while (rle_pos < numvals) { uint32_t pos = rle_pos + t; - uint32_t v = (pos < numvals) ? s->vals[pos & (rle_buffer_size - 1)] : 0; + uint32_t v = (pos < numvals) ? s->vals[rolling_idx(pos)] : 0; uint32_t n = min(numvals - rle_pos, 128); uint32_t nbytes = (n + ((flush) ? 7 : 0)) >> 3; if (!nbytes) { break; } @@ -995,28 +1110,22 @@ __device__ auto julian_days_with_time(int64_t v) return std::make_pair(dur_time_of_day_nanos, julian_days); } +// this has been split out into its own kernel because of the amount of shared memory required +// for the state buffer. encode kernels that don't use the RLE buffer can get started while +// the level data is encoded. 
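For reference alongside the RleEncode() changes above: the runs it emits follow Parquet's RLE/bit-packed hybrid, in which a repeated run is a ULEB128 header of (run_length << 1) followed by the value in ceil(bit_width/8) little-endian bytes, and a literal run uses (group_count << 1) | 1 followed by bit-packed values. A sketch of emitting one repeated run (function name is illustrative):

```cpp
#include <cstdint>
#include <vector>

// Emit one repeated run: ULEB128 header (run_length << 1, LSB = 0 marks an
// RLE run), then the repeated value in ceil(bit_width / 8) bytes, little-endian.
void emit_repeated_run(std::vector<uint8_t>& out, uint32_t run_len, uint32_t value, int bit_width)
{
  uint32_t header = run_len << 1;
  while (header > 0x7f) {
    out.push_back(static_cast<uint8_t>((header & 0x7f) | 0x80));
    header >>= 7;
  }
  out.push_back(static_cast<uint8_t>(header));

  int const value_bytes = (bit_width + 7) / 8;
  for (int i = 0; i < value_bytes; ++i) {
    out.push_back(static_cast<uint8_t>(value >> (8 * i)));
  }
}
```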
// blockDim(128, 1, 1) template -__global__ void __launch_bounds__(128, 8) - gpuEncodePages(device_span pages, - device_span> comp_in, - device_span> comp_out, - device_span comp_results, - bool write_v2_headers) +__global__ void __launch_bounds__(block_size, 8) gpuEncodePageLevels(device_span pages, + bool write_v2_headers, + encode_kernel_mask kernel_mask) { - __shared__ __align__(8) page_enc_state_s state_g; - using block_reduce = cub::BlockReduce; - using block_scan = cub::BlockScan; - __shared__ union { - typename block_reduce::TempStorage reduce_storage; - typename block_scan::TempStorage scan_storage; - } temp_storage; + __shared__ __align__(8) rle_page_enc_state_s state_g; - page_enc_state_s* const s = &state_g; - auto const t = threadIdx.x; + auto* const s = &state_g; + uint32_t const t = threadIdx.x; if (t == 0) { - state_g = page_enc_state_s{}; + state_g = rle_page_enc_state_s{}; s->page = pages[blockIdx.x]; s->ck = *s->page.chunk; s->col = *s->ck.col_desc; @@ -1029,6 +1138,8 @@ __global__ void __launch_bounds__(128, 8) } __syncthreads(); + if (BitAnd(s->page.kernel_mask, kernel_mask) == 0) { return; } + auto const is_v2 = s->page.page_type == PageType::DATA_PAGE_V2; // Encode Repetition and Definition levels @@ -1081,23 +1192,24 @@ __global__ void __launch_bounds__(128, 8) } while (is_col_struct); return def; }(); - s->vals[(rle_numvals + t) & (rle_buffer_size - 1)] = def_lvl; + s->vals[rolling_idx(rle_numvals + t)] = def_lvl; __syncthreads(); rle_numvals += nrows; RleEncode(s, rle_numvals, def_lvl_bits, (rle_numvals == s->page.num_rows), t); __syncthreads(); } if (t < 32) { - uint8_t* const cur = s->cur; - uint8_t* const rle_out = s->rle_out; - uint32_t const rle_bytes = static_cast(rle_out - cur) - (is_v2 ? 0 : 4); - if (is_v2 && t == 0) { + uint8_t* const cur = s->cur; + uint8_t* const rle_out = s->rle_out; + // V2 does not write the RLE length field + uint32_t const rle_bytes = + static_cast(rle_out - cur) - (is_v2 ? 0 : RLE_LENGTH_FIELD_LEN); + if (not is_v2 && t < RLE_LENGTH_FIELD_LEN) { cur[t] = rle_bytes >> (t * 8); } + __syncwarp(); + if (t == 0) { + s->cur = rle_out; s->page.def_lvl_bytes = rle_bytes; - } else if (not is_v2 && t < 4) { - cur[t] = rle_bytes >> (t * 8); } - __syncwarp(); - if (t == 0) { s->cur = rle_out; } } } } else if (s->page.page_type != PageType::DICTIONARY_PAGE && @@ -1124,29 +1236,122 @@ __global__ void __launch_bounds__(128, 8) uint32_t idx = page_first_val_idx + rle_numvals + t; uint32_t lvl_val = (rle_numvals + t < s->page.num_values && idx < col_last_val_idx) ? lvl_val_data[idx] : 0; - s->vals[(rle_numvals + t) & (rle_buffer_size - 1)] = lvl_val; + s->vals[rolling_idx(rle_numvals + t)] = lvl_val; __syncthreads(); rle_numvals += nvals; RleEncode(s, rle_numvals, nbits, (rle_numvals == s->page.num_values), t); __syncthreads(); } if (t < 32) { - uint8_t* const cur = s->cur; - uint8_t* const rle_out = s->rle_out; - uint32_t const rle_bytes = static_cast(rle_out - cur) - (is_v2 ? 0 : 4); - if (is_v2 && t == 0) { + uint8_t* const cur = s->cur; + uint8_t* const rle_out = s->rle_out; + // V2 does not write the RLE length field + uint32_t const rle_bytes = + static_cast(rle_out - cur) - (is_v2 ? 
0 : RLE_LENGTH_FIELD_LEN); + if (not is_v2 && t < RLE_LENGTH_FIELD_LEN) { cur[t] = rle_bytes >> (t * 8); } + __syncwarp(); + if (t == 0) { + s->cur = rle_out; lvl_bytes = rle_bytes; - } else if (not is_v2 && t < 4) { - cur[t] = rle_bytes >> (t * 8); } - __syncwarp(); - if (t == 0) { s->cur = rle_out; } } }; encode_levels(s->col.rep_values, s->col.num_rep_level_bits(), s->page.rep_lvl_bytes); __syncthreads(); encode_levels(s->col.def_values, s->col.num_def_level_bits(), s->page.def_lvl_bytes); } + + if (t == 0) { pages[blockIdx.x] = s->page; } +} + +template +__device__ void finish_page_encode(state_buf* s, + uint32_t valid_count, + uint8_t const* end_ptr, + device_span pages, + device_span> comp_in, + device_span> comp_out, + device_span comp_results, + bool write_v2_headers) +{ + auto const t = threadIdx.x; + + // V2 does not compress rep and def level data + size_t const skip_comp_size = + write_v2_headers ? s->page.def_lvl_bytes + s->page.rep_lvl_bytes : 0; + + if (t == 0) { + // only need num_nulls for v2 data page headers + if (write_v2_headers) { s->page.num_nulls = s->page.num_values - valid_count; } + uint8_t const* const base = s->page.page_data + s->page.max_hdr_size; + auto const actual_data_size = static_cast(end_ptr - base); + if (actual_data_size > s->page.max_data_size) { + // FIXME(ets): this needs to do error propagation back to the host + CUDF_UNREACHABLE("detected possible page data corruption"); + } + s->page.max_data_size = actual_data_size; + if (not comp_in.empty()) { + comp_in[blockIdx.x] = {base + skip_comp_size, actual_data_size - skip_comp_size}; + comp_out[blockIdx.x] = {s->page.compressed_data + s->page.max_hdr_size + skip_comp_size, + 0}; // size is unused + } + pages[blockIdx.x] = s->page; + if (not comp_results.empty()) { + comp_results[blockIdx.x] = {0, compression_status::FAILURE}; + pages[blockIdx.x].comp_res = &comp_results[blockIdx.x]; + } + } + + // copy uncompressed bytes over + if (skip_comp_size != 0 && not comp_in.empty()) { + uint8_t* const src = s->page.page_data + s->page.max_hdr_size; + uint8_t* const dst = s->page.compressed_data + s->page.max_hdr_size; + for (int i = t; i < skip_comp_size; i += block_size) { + dst[i] = src[i]; + } + } +} + +// PLAIN page data encoder +// blockDim(128, 1, 1) +template +__global__ void __launch_bounds__(block_size, 8) + gpuEncodePages(device_span pages, + device_span> comp_in, + device_span> comp_out, + device_span comp_results, + bool write_v2_headers) +{ + __shared__ __align__(8) page_enc_state_s<0> state_g; + using block_reduce = cub::BlockReduce; + using block_scan = cub::BlockScan; + __shared__ union { + typename block_reduce::TempStorage reduce_storage; + typename block_scan::TempStorage scan_storage; + } temp_storage; + + auto* const s = &state_g; + uint32_t t = threadIdx.x; + + if (t == 0) { + state_g = page_enc_state_s<0>{}; + s->page = pages[blockIdx.x]; + s->ck = *s->page.chunk; + s->col = *s->ck.col_desc; + s->rle_len_pos = nullptr; + // get s->cur back to where it was at the end of encoding the rep and def level data + s->cur = + s->page.page_data + s->page.max_hdr_size + s->page.def_lvl_bytes + s->page.rep_lvl_bytes; + // if V1 data page, need space for the RLE length fields + if (s->page.page_type == PageType::DATA_PAGE) { + if (s->col.num_def_level_bits() != 0) { s->cur += RLE_LENGTH_FIELD_LEN; } + if (s->col.num_rep_level_bits() != 0) { s->cur += RLE_LENGTH_FIELD_LEN; } + } + } + __syncthreads(); + + if (BitAnd(s->page.kernel_mask, encode_kernel_mask::PLAIN) == 0) { return; } + // Encode data 
values __syncthreads(); auto const physical_type = s->col.physical_type; @@ -1158,10 +1363,6 @@ __global__ void __launch_bounds__(128, 8) return dtype_len_out; }(); - auto const dict_bits = (physical_type == BOOLEAN) ? 1 - : (s->ck.use_dictionary and s->page.page_type != PageType::DICTIONARY_PAGE) - ? s->ck.dict_rle_bits - : -1; if (t == 0) { uint8_t* dst = s->cur; s->rle_run = 0; @@ -1170,219 +1371,315 @@ __global__ void __launch_bounds__(128, 8) s->rle_out = dst; s->page.encoding = determine_encoding(s->page.page_type, physical_type, s->ck.use_dictionary, write_v2_headers); - if (dict_bits >= 0 && physical_type != BOOLEAN) { - dst[0] = dict_bits; - s->rle_out = dst + 1; - } else if (is_v2 && physical_type == BOOLEAN) { - // save space for RLE length. we don't know the total length yet. - s->rle_out = dst + RLE_LENGTH_FIELD_LEN; - s->rle_len_pos = dst; - } s->page_start_val = row_to_value_idx(s->page.start_row, s->col); s->chunk_start_val = row_to_value_idx(s->ck.start_row, s->col); } __syncthreads(); + uint32_t num_valid = 0; for (uint32_t cur_val_idx = 0; cur_val_idx < s->page.num_leaf_values;) { - uint32_t nvals = min(s->page.num_leaf_values - cur_val_idx, 128); + uint32_t nvals = min(s->page.num_leaf_values - cur_val_idx, block_size); uint32_t len, pos; auto [is_valid, val_idx] = [&]() { uint32_t val_idx; uint32_t is_valid; - size_type val_idx_in_block = cur_val_idx + t; + size_type const val_idx_in_block = cur_val_idx + t; if (s->page.page_type == PageType::DICTIONARY_PAGE) { val_idx = val_idx_in_block; is_valid = (val_idx < s->page.num_leaf_values); if (is_valid) { val_idx = s->ck.dict_data[val_idx]; } } else { - size_type val_idx_in_leaf_col = s->page_start_val + val_idx_in_block; + size_type const val_idx_in_leaf_col = s->page_start_val + val_idx_in_block; is_valid = (val_idx_in_leaf_col < s->col.leaf_column->size() && val_idx_in_block < s->page.num_leaf_values) ? s->col.leaf_column->is_valid(val_idx_in_leaf_col) : 0; - val_idx = - (s->ck.use_dictionary) ? 
val_idx_in_leaf_col - s->chunk_start_val : val_idx_in_leaf_col; + val_idx = val_idx_in_leaf_col; } return std::make_tuple(is_valid, val_idx); }(); - if (is_valid) num_valid++; - + if (is_valid) { num_valid++; } cur_val_idx += nvals; - if (dict_bits >= 0) { - // Dictionary encoding - if (dict_bits > 0) { - uint32_t rle_numvals; - uint32_t rle_numvals_in_block; - block_scan(temp_storage.scan_storage).ExclusiveSum(is_valid, pos, rle_numvals_in_block); - rle_numvals = s->rle_numvals; - if (is_valid) { - uint32_t v; - if (physical_type == BOOLEAN) { - v = s->col.leaf_column->element(val_idx); - } else { - v = s->ck.dict_index[val_idx]; - } - s->vals[(rle_numvals + pos) & (rle_buffer_size - 1)] = v; - } - rle_numvals += rle_numvals_in_block; - __syncthreads(); - if (!is_v2 && physical_type == BOOLEAN) { - PlainBoolEncode(s, rle_numvals, (cur_val_idx == s->page.num_leaf_values), t); - } else { - RleEncode(s, rle_numvals, dict_bits, (cur_val_idx == s->page.num_leaf_values), t); + + // Non-dictionary encoding + uint8_t* dst = s->cur; + + if (is_valid) { + len = dtype_len_out; + if (physical_type == BYTE_ARRAY) { + if (type_id == type_id::STRING) { + len += s->col.leaf_column->element(val_idx).size_bytes(); + } else if (s->col.output_as_byte_array && type_id == type_id::LIST) { + len += + get_element(*s->col.leaf_column, val_idx).size_bytes(); } - __syncthreads(); } - if (t == 0) { s->cur = s->rle_out; } - __syncthreads(); } else { - // Non-dictionary encoding - uint8_t* dst = s->cur; - - if (is_valid) { - len = dtype_len_out; - if (physical_type == BYTE_ARRAY) { - if (type_id == type_id::STRING) { - len += s->col.leaf_column->element(val_idx).size_bytes(); - } else if (s->col.output_as_byte_array && type_id == type_id::LIST) { - len += - get_element(*s->col.leaf_column, val_idx).size_bytes(); + len = 0; + } + uint32_t total_len = 0; + block_scan(temp_storage.scan_storage).ExclusiveSum(len, pos, total_len); + __syncthreads(); + if (t == 0) { s->cur = dst + total_len; } + if (is_valid) { + switch (physical_type) { + case INT32: [[fallthrough]]; + case FLOAT: { + auto const v = [dtype_len = dtype_len_in, + idx = val_idx, + col = s->col.leaf_column, + scale = s->col.ts_scale == 0 ? 
1 : s->col.ts_scale]() -> int32_t { + switch (dtype_len) { + case 8: return col->element(idx) * scale; + case 4: return col->element(idx) * scale; + case 2: return col->element(idx) * scale; + default: return col->element(idx) * scale; + } + }(); + + dst[pos + 0] = v; + dst[pos + 1] = v >> 8; + dst[pos + 2] = v >> 16; + dst[pos + 3] = v >> 24; + } break; + case INT64: { + int64_t v = s->col.leaf_column->element(val_idx); + int32_t ts_scale = s->col.ts_scale; + if (ts_scale != 0) { + if (ts_scale < 0) { + v /= -ts_scale; + } else { + v *= ts_scale; + } + } + dst[pos + 0] = v; + dst[pos + 1] = v >> 8; + dst[pos + 2] = v >> 16; + dst[pos + 3] = v >> 24; + dst[pos + 4] = v >> 32; + dst[pos + 5] = v >> 40; + dst[pos + 6] = v >> 48; + dst[pos + 7] = v >> 56; + } break; + case INT96: { + int64_t v = s->col.leaf_column->element(val_idx); + int32_t ts_scale = s->col.ts_scale; + if (ts_scale != 0) { + if (ts_scale < 0) { + v /= -ts_scale; + } else { + v *= ts_scale; + } } - } - } else { - len = 0; - } - uint32_t total_len = 0; - block_scan(temp_storage.scan_storage).ExclusiveSum(len, pos, total_len); - __syncthreads(); - if (t == 0) { s->cur = dst + total_len; } - if (is_valid) { - switch (physical_type) { - case INT32: [[fallthrough]]; - case FLOAT: { - auto const v = [dtype_len = dtype_len_in, - idx = val_idx, - col = s->col.leaf_column, - scale = s->col.ts_scale == 0 ? 1 : s->col.ts_scale]() -> int32_t { - switch (dtype_len) { - case 8: return col->element(idx) * scale; - case 4: return col->element(idx) * scale; - case 2: return col->element(idx) * scale; - default: return col->element(idx) * scale; - } - }(); - dst[pos + 0] = v; - dst[pos + 1] = v >> 8; - dst[pos + 2] = v >> 16; - dst[pos + 3] = v >> 24; - } break; - case INT64: { - int64_t v = s->col.leaf_column->element(val_idx); - int32_t ts_scale = s->col.ts_scale; - if (ts_scale != 0) { - if (ts_scale < 0) { - v /= -ts_scale; - } else { - v *= ts_scale; - } + auto const [last_day_nanos, julian_days] = [&] { + using namespace cuda::std::chrono; + switch (s->col.leaf_column->type().id()) { + case type_id::TIMESTAMP_SECONDS: + case type_id::TIMESTAMP_MILLISECONDS: { + return julian_days_with_time(v); + } break; + case type_id::TIMESTAMP_MICROSECONDS: + case type_id::TIMESTAMP_NANOSECONDS: { + return julian_days_with_time(v); + } break; } - dst[pos + 0] = v; - dst[pos + 1] = v >> 8; - dst[pos + 2] = v >> 16; - dst[pos + 3] = v >> 24; - dst[pos + 4] = v >> 32; - dst[pos + 5] = v >> 40; - dst[pos + 6] = v >> 48; - dst[pos + 7] = v >> 56; - } break; - case INT96: { - int64_t v = s->col.leaf_column->element(val_idx); - int32_t ts_scale = s->col.ts_scale; - if (ts_scale != 0) { - if (ts_scale < 0) { - v /= -ts_scale; - } else { - v *= ts_scale; - } + return julian_days_with_time(0); + }(); + + // the 12 bytes of fixed length data. 
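The "12 bytes of fixed length data" written below are the Parquet INT96 timestamp layout: 8 bytes of nanoseconds within the Julian day followed by 4 bytes of Julian day number, both little-endian. A host-side sketch of the same store; unlike the kernel's per-byte shifts, the memcpy form assumes a little-endian host:

```cpp
#include <cstdint>
#include <cstring>

// INT96 layout: bytes 0..7 = nanoseconds within the Julian day (LE),
// bytes 8..11 = Julian day number (LE), 12 bytes total.
void write_int96(uint8_t* dst, int64_t nanos_of_day, uint32_t julian_day)
{
  std::memcpy(dst, &nanos_of_day, sizeof(nanos_of_day));
  std::memcpy(dst + 8, &julian_day, sizeof(julian_day));
}
```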
+ v = last_day_nanos.count(); + dst[pos + 0] = v; + dst[pos + 1] = v >> 8; + dst[pos + 2] = v >> 16; + dst[pos + 3] = v >> 24; + dst[pos + 4] = v >> 32; + dst[pos + 5] = v >> 40; + dst[pos + 6] = v >> 48; + dst[pos + 7] = v >> 56; + uint32_t w = julian_days.count(); + dst[pos + 8] = w; + dst[pos + 9] = w >> 8; + dst[pos + 10] = w >> 16; + dst[pos + 11] = w >> 24; + } break; + + case DOUBLE: { + auto v = s->col.leaf_column->element(val_idx); + memcpy(dst + pos, &v, 8); + } break; + case BYTE_ARRAY: { + auto const bytes = [](cudf::type_id const type_id, + column_device_view const* leaf_column, + uint32_t const val_idx) -> void const* { + switch (type_id) { + case type_id::STRING: + return reinterpret_cast( + leaf_column->element(val_idx).data()); + case type_id::LIST: + return reinterpret_cast( + get_element(*(leaf_column), val_idx).data()); + default: CUDF_UNREACHABLE("invalid type id for byte array writing!"); } + }(type_id, s->col.leaf_column, val_idx); + uint32_t v = len - 4; // string length + dst[pos + 0] = v; + dst[pos + 1] = v >> 8; + dst[pos + 2] = v >> 16; + dst[pos + 3] = v >> 24; + if (v != 0) memcpy(dst + pos + 4, bytes, v); + } break; + case FIXED_LEN_BYTE_ARRAY: { + if (type_id == type_id::DECIMAL128) { + // When using FIXED_LEN_BYTE_ARRAY for decimals, the rep is encoded in big-endian + auto const v = s->col.leaf_column->element(val_idx).value(); + auto const v_char_ptr = reinterpret_cast(&v); + thrust::copy(thrust::seq, + thrust::make_reverse_iterator(v_char_ptr + sizeof(v)), + thrust::make_reverse_iterator(v_char_ptr), + dst + pos); + } + } break; + } + } + __syncthreads(); + } - auto const [last_day_nanos, julian_days] = [&] { - using namespace cuda::std::chrono; - switch (s->col.leaf_column->type().id()) { - case type_id::TIMESTAMP_SECONDS: - case type_id::TIMESTAMP_MILLISECONDS: { - return julian_days_with_time(v); - } break; - case type_id::TIMESTAMP_MICROSECONDS: - case type_id::TIMESTAMP_NANOSECONDS: { - return julian_days_with_time(v); - } break; - } - return julian_days_with_time(0); - }(); - - // the 12 bytes of fixed length data. 
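The new BYTE_ARRAY case above writes PLAIN-encoded strings as a 4-byte little-endian length followed by the raw bytes (and DECIMAL128 values as byte-reversed FIXED_LEN_BYTE_ARRAY, since that representation is big-endian). A sketch of the BYTE_ARRAY layout with an illustrative helper:

```cpp
#include <cstdint>
#include <string>
#include <vector>

// PLAIN encoding of one BYTE_ARRAY value: 4-byte little-endian length, then the bytes.
void append_plain_byte_array(std::vector<uint8_t>& out, std::string const& s)
{
  uint32_t const len = static_cast<uint32_t>(s.size());
  uint8_t const le[4] = {static_cast<uint8_t>(len),
                         static_cast<uint8_t>(len >> 8),
                         static_cast<uint8_t>(len >> 16),
                         static_cast<uint8_t>(len >> 24)};
  out.insert(out.end(), le, le + 4);
  out.insert(out.end(), s.begin(), s.end());
}
```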
- v = last_day_nanos.count(); - dst[pos + 0] = v; - dst[pos + 1] = v >> 8; - dst[pos + 2] = v >> 16; - dst[pos + 3] = v >> 24; - dst[pos + 4] = v >> 32; - dst[pos + 5] = v >> 40; - dst[pos + 6] = v >> 48; - dst[pos + 7] = v >> 56; - uint32_t w = julian_days.count(); - dst[pos + 8] = w; - dst[pos + 9] = w >> 8; - dst[pos + 10] = w >> 16; - dst[pos + 11] = w >> 24; - } break; + uint32_t const valid_count = block_reduce(temp_storage.reduce_storage).Sum(num_valid); - case DOUBLE: { - auto v = s->col.leaf_column->element(val_idx); - memcpy(dst + pos, &v, 8); - } break; - case BYTE_ARRAY: { - auto const bytes = [](cudf::type_id const type_id, - column_device_view const* leaf_column, - uint32_t const val_idx) -> void const* { - switch (type_id) { - case type_id::STRING: - return reinterpret_cast( - leaf_column->element(val_idx).data()); - case type_id::LIST: - return reinterpret_cast( - get_element(*(leaf_column), val_idx).data()); - default: CUDF_UNREACHABLE("invalid type id for byte array writing!"); - } - }(type_id, s->col.leaf_column, val_idx); - uint32_t v = len - 4; // string length - dst[pos + 0] = v; - dst[pos + 1] = v >> 8; - dst[pos + 2] = v >> 16; - dst[pos + 3] = v >> 24; - if (v != 0) memcpy(dst + pos + 4, bytes, v); - } break; - case FIXED_LEN_BYTE_ARRAY: { - if (type_id == type_id::DECIMAL128) { - // When using FIXED_LEN_BYTE_ARRAY for decimals, the rep is encoded in big-endian - auto const v = s->col.leaf_column->element(val_idx).value(); - auto const v_char_ptr = reinterpret_cast(&v); - thrust::copy(thrust::seq, - thrust::make_reverse_iterator(v_char_ptr + sizeof(v)), - thrust::make_reverse_iterator(v_char_ptr), - dst + pos); - } - } break; + finish_page_encode( + s, valid_count, s->cur, pages, comp_in, comp_out, comp_results, write_v2_headers); +} + +// DICTIONARY page data encoder +// blockDim(128, 1, 1) +template +__global__ void __launch_bounds__(block_size, 8) + gpuEncodeDictPages(device_span pages, + device_span> comp_in, + device_span> comp_out, + device_span comp_results, + bool write_v2_headers) +{ + __shared__ __align__(8) rle_page_enc_state_s state_g; + using block_reduce = cub::BlockReduce; + using block_scan = cub::BlockScan; + __shared__ union { + typename block_reduce::TempStorage reduce_storage; + typename block_scan::TempStorage scan_storage; + } temp_storage; + + auto* const s = &state_g; + uint32_t t = threadIdx.x; + + if (t == 0) { + state_g = rle_page_enc_state_s{}; + s->page = pages[blockIdx.x]; + s->ck = *s->page.chunk; + s->col = *s->ck.col_desc; + s->rle_len_pos = nullptr; + // get s->cur back to where it was at the end of encoding the rep and def level data + s->cur = + s->page.page_data + s->page.max_hdr_size + s->page.def_lvl_bytes + s->page.rep_lvl_bytes; + // if V1 data page, need space for the RLE length fields + if (s->page.page_type == PageType::DATA_PAGE) { + if (s->col.num_def_level_bits() != 0) { s->cur += RLE_LENGTH_FIELD_LEN; } + if (s->col.num_rep_level_bits() != 0) { s->cur += RLE_LENGTH_FIELD_LEN; } + } + } + __syncthreads(); + + if (BitAnd(s->page.kernel_mask, encode_kernel_mask::DICTIONARY) == 0) { return; } + + // Encode data values + __syncthreads(); + auto const physical_type = s->col.physical_type; + auto const type_id = s->col.leaf_column->type().id(); + auto const dtype_len_out = physical_type_len(physical_type, type_id); + auto const dtype_len_in = [&]() -> uint32_t { + if (physical_type == INT32) { return int32_logical_len(type_id); } + if (physical_type == INT96) { return sizeof(int64_t); } + return dtype_len_out; + }(); + + // 
TODO assert dict_bits >= 0 + auto const dict_bits = (physical_type == BOOLEAN) ? 1 + : (s->ck.use_dictionary and s->page.page_type != PageType::DICTIONARY_PAGE) + ? s->ck.dict_rle_bits + : -1; + if (t == 0) { + uint8_t* dst = s->cur; + s->rle_run = 0; + s->rle_pos = 0; + s->rle_numvals = 0; + s->rle_out = dst; + s->page.encoding = + determine_encoding(s->page.page_type, physical_type, s->ck.use_dictionary, write_v2_headers); + if (dict_bits >= 0 && physical_type != BOOLEAN) { + dst[0] = dict_bits; + s->rle_out = dst + 1; + } else if (write_v2_headers && physical_type == BOOLEAN) { + // save space for RLE length. we don't know the total length yet. + s->rle_out = dst + RLE_LENGTH_FIELD_LEN; + s->rle_len_pos = dst; + } + s->page_start_val = row_to_value_idx(s->page.start_row, s->col); + s->chunk_start_val = row_to_value_idx(s->ck.start_row, s->col); + } + __syncthreads(); + + uint32_t num_valid = 0; + for (uint32_t cur_val_idx = 0; cur_val_idx < s->page.num_leaf_values;) { + uint32_t nvals = min(s->page.num_leaf_values - cur_val_idx, block_size); + + auto [is_valid, val_idx] = [&]() { + size_type const val_idx_in_block = cur_val_idx + t; + size_type const val_idx_in_leaf_col = s->page_start_val + val_idx_in_block; + + uint32_t const is_valid = (val_idx_in_leaf_col < s->col.leaf_column->size() && + val_idx_in_block < s->page.num_leaf_values) + ? s->col.leaf_column->is_valid(val_idx_in_leaf_col) + : 0; + // need to test for use_dictionary because it might be boolean + uint32_t const val_idx = + (s->ck.use_dictionary) ? val_idx_in_leaf_col - s->chunk_start_val : val_idx_in_leaf_col; + return std::make_tuple(is_valid, val_idx); + }(); + + if (is_valid) { num_valid++; } + cur_val_idx += nvals; + + // Dictionary encoding + if (dict_bits > 0) { + uint32_t rle_numvals; + uint32_t rle_numvals_in_block; + uint32_t pos; + block_scan(temp_storage.scan_storage).ExclusiveSum(is_valid, pos, rle_numvals_in_block); + rle_numvals = s->rle_numvals; + if (is_valid) { + uint32_t v; + if (physical_type == BOOLEAN) { + v = s->col.leaf_column->element(val_idx); + } else { + v = s->ck.dict_index[val_idx]; } + s->vals[rolling_idx(rle_numvals + pos)] = v; + } + rle_numvals += rle_numvals_in_block; + __syncthreads(); + if ((!write_v2_headers) && (physical_type == BOOLEAN)) { + PlainBoolEncode(s, rle_numvals, (cur_val_idx == s->page.num_leaf_values), t); + } else { + RleEncode(s, rle_numvals, dict_bits, (cur_val_idx == s->page.num_leaf_values), t); } __syncthreads(); } + if (t == 0) { s->cur = s->rle_out; } + __syncthreads(); } uint32_t const valid_count = block_reduce(temp_storage.reduce_storage).Sum(num_valid); @@ -1395,37 +1692,137 @@ __global__ void __launch_bounds__(128, 8) __syncwarp(); } - // V2 does not compress rep and def level data - size_t const skip_comp_size = s->page.def_lvl_bytes + s->page.rep_lvl_bytes; + finish_page_encode( + s, valid_count, s->cur, pages, comp_in, comp_out, comp_results, write_v2_headers); +} + +// DELTA_BINARY_PACKED page data encoder +// blockDim(128, 1, 1) +template +__global__ void __launch_bounds__(block_size, 8) + gpuEncodeDeltaBinaryPages(device_span pages, + device_span> comp_in, + device_span> comp_out, + device_span comp_results) +{ + // block of shared memory for value storage and bit packing + __shared__ uleb128_t delta_shared[delta::buffer_size + delta::block_size]; + __shared__ __align__(8) page_enc_state_s<0> state_g; + using block_reduce = cub::BlockReduce; + __shared__ union { + typename block_reduce::TempStorage reduce_storage; + typename 
delta::index_scan::TempStorage delta_index_tmp; + typename delta::block_reduce::TempStorage delta_reduce_tmp; + typename delta::warp_reduce::TempStorage delta_warp_red_tmp[delta::num_mini_blocks]; + } temp_storage; + + auto* const s = &state_g; + uint32_t t = threadIdx.x; if (t == 0) { - s->page.num_nulls = s->page.num_values - valid_count; - uint8_t* const base = s->page.page_data + s->page.max_hdr_size; - auto const actual_data_size = static_cast(s->cur - base); - if (actual_data_size > s->page.max_data_size) { - CUDF_UNREACHABLE("detected possible page data corruption"); - } - s->page.max_data_size = actual_data_size; - if (not comp_in.empty()) { - comp_in[blockIdx.x] = {base + skip_comp_size, actual_data_size - skip_comp_size}; - comp_out[blockIdx.x] = {s->page.compressed_data + s->page.max_hdr_size + skip_comp_size, - 0}; // size is unused - } - pages[blockIdx.x] = s->page; - if (not comp_results.empty()) { - comp_results[blockIdx.x] = {0, compression_status::FAILURE}; - pages[blockIdx.x].comp_res = &comp_results[blockIdx.x]; + state_g = page_enc_state_s<0>{}; + s->page = pages[blockIdx.x]; + s->ck = *s->page.chunk; + s->col = *s->ck.col_desc; + s->rle_len_pos = nullptr; + // get s->cur back to where it was at the end of encoding the rep and def level data + s->cur = + s->page.page_data + s->page.max_hdr_size + s->page.def_lvl_bytes + s->page.rep_lvl_bytes; + } + __syncthreads(); + + if (BitAnd(s->page.kernel_mask, encode_kernel_mask::DELTA_BINARY) == 0) { return; } + + // Encode data values + __syncthreads(); + auto const physical_type = s->col.physical_type; + auto const type_id = s->col.leaf_column->type().id(); + auto const dtype_len_out = physical_type_len(physical_type, type_id); + auto const dtype_len_in = [&]() -> uint32_t { + if (physical_type == INT32) { return int32_logical_len(type_id); } + if (physical_type == INT96) { return sizeof(int64_t); } + return dtype_len_out; + }(); + + if (t == 0) { + uint8_t* dst = s->cur; + s->rle_run = 0; + s->rle_pos = 0; + s->rle_numvals = 0; + s->rle_out = dst; + s->page.encoding = Encoding::DELTA_BINARY_PACKED; + s->page_start_val = row_to_value_idx(s->page.start_row, s->col); + s->chunk_start_val = row_to_value_idx(s->ck.start_row, s->col); + } + __syncthreads(); + + // need to know the number of valid values for the null values calculation and to size + // the delta binary encoder. 
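delta_encode() above routes signed column types through a zigzag output type before packing, and the DELTA_BINARY_PACKED header quantities (first value, per-block minimum delta) are likewise zigzag ULEB128 in the format. Assuming zigzag128_t (defined in delta_enc.cuh, not shown in this diff) implements the standard mapping, a sketch of it for 64-bit values:

```cpp
#include <cassert>
#include <cstdint>

// zigzag maps small-magnitude signed values to small unsigned ones:
//   0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3, ...
// so the ULEB128 bytes stay short for values near zero.
uint64_t zigzag_encode(int64_t v)
{
  return (static_cast<uint64_t>(v) << 1) ^ (v < 0 ? ~uint64_t{0} : uint64_t{0});
}

int64_t zigzag_decode(uint64_t u)
{
  return static_cast<int64_t>(u >> 1) ^ -static_cast<int64_t>(u & 1);
}

int main()
{
  assert(zigzag_encode(0) == 0 && zigzag_encode(-1) == 1 && zigzag_encode(1) == 2);
  assert(zigzag_decode(zigzag_encode(-123456789)) == -123456789);
  return 0;
}
```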
+ uint32_t valid_count = 0; + if (not s->col.leaf_column->nullable()) { + valid_count = s->page.num_leaf_values; + } else { + uint32_t num_valid = 0; + for (uint32_t cur_val_idx = 0; cur_val_idx < s->page.num_leaf_values;) { + uint32_t const nvals = min(s->page.num_leaf_values - cur_val_idx, block_size); + size_type const val_idx_in_block = cur_val_idx + t; + size_type const val_idx_in_leaf_col = s->page_start_val + val_idx_in_block; + + if (val_idx_in_leaf_col < s->col.leaf_column->size() && + val_idx_in_block < s->page.num_leaf_values && + s->col.leaf_column->is_valid(val_idx_in_leaf_col)) { + num_valid++; + } + cur_val_idx += nvals; } + valid_count = block_reduce(temp_storage.reduce_storage).Sum(num_valid); } - // copy over uncompressed data - if (skip_comp_size != 0 && not comp_in.empty()) { - uint8_t const* const src = s->page.page_data + s->page.max_hdr_size; - uint8_t* const dst = s->page.compressed_data + s->page.max_hdr_size; - for (int i = t; i < skip_comp_size; i += block_size) { - dst[i] = src[i]; + uint8_t const* delta_ptr = nullptr; // this will be the end of delta block pointer + + if (physical_type == INT32) { + switch (dtype_len_in) { + case 8: { + // only DURATIONS map to 8 bytes, so safe to just use signed here? + delta_ptr = delta_encode(s, valid_count, delta_shared, &temp_storage); + break; + } + case 4: { + if (type_id == type_id::UINT32) { + delta_ptr = delta_encode(s, valid_count, delta_shared, &temp_storage); + } else { + delta_ptr = delta_encode(s, valid_count, delta_shared, &temp_storage); + } + break; + } + case 2: { + if (type_id == type_id::UINT16) { + delta_ptr = delta_encode(s, valid_count, delta_shared, &temp_storage); + } else { + delta_ptr = delta_encode(s, valid_count, delta_shared, &temp_storage); + } + break; + } + case 1: { + if (type_id == type_id::UINT8) { + delta_ptr = delta_encode(s, valid_count, delta_shared, &temp_storage); + } else { + delta_ptr = delta_encode(s, valid_count, delta_shared, &temp_storage); + } + break; + } + default: CUDF_UNREACHABLE("invalid dtype_len_in when encoding DELTA_BINARY_PACKED"); + } + } else { + if (type_id == type_id::UINT64) { + delta_ptr = delta_encode(s, valid_count, delta_shared, &temp_storage); + } else { + delta_ptr = delta_encode(s, valid_count, delta_shared, &temp_storage); } } + + finish_page_encode( + s, valid_count, delta_ptr, pages, comp_in, comp_out, comp_results, true); } constexpr int decide_compression_warps_in_block = 4; @@ -1460,7 +1857,8 @@ __global__ void __launch_bounds__(decide_compression_block_size) for (auto page_id = lane_id; page_id < num_pages; page_id += cudf::detail::warp_size) { auto const& curr_page = ck_g[warp_id].pages[page_id]; auto const page_data_size = curr_page.max_data_size; - auto const lvl_bytes = curr_page.def_lvl_bytes + curr_page.rep_lvl_bytes; + auto const is_v2 = curr_page.page_type == PageType::DATA_PAGE_V2; + auto const lvl_bytes = is_v2 ? curr_page.def_lvl_bytes + curr_page.rep_lvl_bytes : 0; uncompressed_data_size += page_data_size; if (auto comp_res = curr_page.comp_res; comp_res != nullptr) { compressed_data_size += comp_res->bytes_written + lvl_bytes; @@ -1923,7 +2321,8 @@ __global__ void __launch_bounds__(128) } uncompressed_page_size = page_g.max_data_size; if (ck_g.is_compressed) { - auto const lvl_bytes = page_g.def_lvl_bytes + page_g.rep_lvl_bytes; + auto const is_v2 = page_g.page_type == PageType::DATA_PAGE_V2; + auto const lvl_bytes = is_v2 ? 
page_g.def_lvl_bytes + page_g.rep_lvl_bytes : 0; hdr_start = page_g.compressed_data; compressed_page_size = static_cast(comp_results[blockIdx.x].bytes_written) + lvl_bytes; @@ -1988,7 +2387,7 @@ __global__ void __launch_bounds__(128) // blockDim(1024, 1, 1) __global__ void __launch_bounds__(1024) - gpuGatherPages(device_span chunks, device_span pages) + gpuGatherPages(device_span chunks, device_span pages) { __shared__ __align__(8) EncColumnChunk ck_g; __shared__ __align__(8) EncPage page_g; @@ -2158,6 +2557,10 @@ constexpr __device__ void* align8(void* ptr) return static_cast(ptr) - algn; } +struct mask_tform { + __device__ uint32_t operator()(EncPage const& p) { return static_cast(p.kernel_mask); } +}; + } // namespace // blockDim(1, 1, 1) @@ -2260,12 +2663,13 @@ void InitFragmentStatistics(device_span groups, rmm::cuda_stream_view stream) { int const num_fragments = fragments.size(); - int const dim = util::div_rounding_up_safe(num_fragments, 128 / cudf::detail::warp_size); - gpuInitFragmentStats<<>>(groups, fragments); + int const dim = + util::div_rounding_up_safe(num_fragments, encode_block_size / cudf::detail::warp_size); + gpuInitFragmentStats<<>>(groups, fragments); } void InitEncoderPages(device_2dspan chunks, - device_span pages, + device_span pages, device_span page_sizes, device_span comp_page_sizes, device_span col_desc, @@ -2280,21 +2684,21 @@ void InitEncoderPages(device_2dspan chunks, { auto num_rowgroups = chunks.size().first; dim3 dim_grid(num_columns, num_rowgroups); // 1 threadblock per rowgroup - gpuInitPages<<>>(chunks, - pages, - page_sizes, - comp_page_sizes, - col_desc, - page_grstats, - chunk_grstats, - num_columns, - max_page_size_bytes, - max_page_size_rows, - page_align, - write_v2_headers); + gpuInitPages<<>>(chunks, + pages, + page_sizes, + comp_page_sizes, + col_desc, + page_grstats, + chunk_grstats, + num_columns, + max_page_size_bytes, + max_page_size_rows, + page_align, + write_v2_headers); } -void EncodePages(device_span pages, +void EncodePages(device_span pages, bool write_v2_headers, device_span> comp_in, device_span> comp_out, @@ -2302,10 +2706,43 @@ void EncodePages(device_span pages, rmm::cuda_stream_view stream) { auto num_pages = pages.size(); + + // determine which kernels to invoke + auto mask_iter = thrust::make_transform_iterator(pages.begin(), mask_tform{}); + uint32_t kernel_mask = thrust::reduce( + rmm::exec_policy(stream), mask_iter, mask_iter + pages.size(), 0U, thrust::bit_or{}); + + // get the number of streams we need from the pool + int nkernels = std::bitset<32>(kernel_mask).count(); + auto streams = cudf::detail::fork_streams(stream, nkernels); + // A page is part of one column. This is launching 1 block per page. 1 block will exclusively // deal with one datatype. 
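The replacement launch logic below ORs the per-page kernel masks together and then launches one specialized encoder per set bit, each on a forked stream. A plain host sketch of that dispatch shape; the bit values are illustrative, and std::function calls stand in for the kernel launches issued on fork_streams/join_streams in the real code:

```cpp
#include <cstdint>
#include <functional>
#include <utility>
#include <vector>

constexpr uint32_t PLAIN_BIT = 1, DICT_BIT = 2, DELTA_BIT = 4;  // illustrative mask bits

void dispatch(std::vector<uint32_t> const& page_masks,
              std::vector<std::pair<uint32_t, std::function<void()>>> const& encoders)
{
  // combine the per-page masks (thrust::reduce with bit_or on the device in the real code)
  uint32_t combined = 0;
  for (auto m : page_masks) { combined |= m; }

  // run each encoder whose pages are present; each launch would go to its own stream
  for (auto const& [bit, launch] : encoders) {
    if ((combined & bit) != 0) { launch(); }
  }
}
```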
- gpuEncodePages<128><<>>( - pages, comp_in, comp_out, comp_results, write_v2_headers); + + int s_idx = 0; + if (BitAnd(kernel_mask, encode_kernel_mask::PLAIN) != 0) { + auto const strm = streams[s_idx++]; + gpuEncodePageLevels<<>>( + pages, write_v2_headers, encode_kernel_mask::PLAIN); + gpuEncodePages<<>>( + pages, comp_in, comp_out, comp_results, write_v2_headers); + } + if (BitAnd(kernel_mask, encode_kernel_mask::DELTA_BINARY) != 0) { + auto const strm = streams[s_idx++]; + gpuEncodePageLevels<<>>( + pages, write_v2_headers, encode_kernel_mask::DELTA_BINARY); + gpuEncodeDeltaBinaryPages + <<>>(pages, comp_in, comp_out, comp_results); + } + if (BitAnd(kernel_mask, encode_kernel_mask::DICTIONARY) != 0) { + auto const strm = streams[s_idx++]; + gpuEncodePageLevels<<>>( + pages, write_v2_headers, encode_kernel_mask::DICTIONARY); + gpuEncodeDictPages<<>>( + pages, comp_in, comp_out, comp_results, write_v2_headers); + } + + cudf::detail::join_streams(streams, stream); } void DecideCompression(device_span chunks, rmm::cuda_stream_view stream) @@ -2323,12 +2760,12 @@ void EncodePageHeaders(device_span pages, { // TODO: single thread task. No need for 128 threads/block. Earlier it used to employ rest of the // threads to coop load structs - gpuEncodePageHeaders<<>>( + gpuEncodePageHeaders<<>>( pages, comp_results, page_stats, chunk_stats); } void GatherPages(device_span chunks, - device_span pages, + device_span pages, rmm::cuda_stream_view stream) { gpuGatherPages<<>>(chunks, pages); @@ -2343,7 +2780,4 @@ void EncodeColumnIndexes(device_span chunks, chunks, column_stats, column_index_truncate_length); } -} // namespace gpu -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index 6f8b2f50443..595dd40cdc2 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -16,34 +16,18 @@ #include "parquet_gpu.hpp" #include + +#include + #include #include -namespace cudf { -namespace io { -namespace parquet { -namespace gpu { +namespace cudf::io::parquet::detail { + // Minimal thrift implementation for parsing page headers // https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md -static const __device__ __constant__ uint8_t g_list2struct[16] = {0, - 1, - 2, - ST_FLD_BYTE, - ST_FLD_DOUBLE, - 5, - ST_FLD_I16, - 7, - ST_FLD_I32, - 9, - ST_FLD_I64, - ST_FLD_BINARY, - ST_FLD_STRUCT, - ST_FLD_MAP, - ST_FLD_SET, - ST_FLD_LIST}; - struct byte_stream_s { uint8_t const* cur{}; uint8_t const* end{}; @@ -142,12 +126,13 @@ __device__ void skip_struct_field(byte_stream_s* bs, int field_type) case ST_FLD_SET: { // NOTE: skipping a list of lists is not handled auto const c = getb(bs); int n = c >> 4; - if (n == 0xf) n = get_u32(bs); - field_type = g_list2struct[c & 0xf]; - if (field_type == ST_FLD_STRUCT) + if (n == 0xf) { n = get_u32(bs); } + field_type = c & 0xf; + if (field_type == ST_FLD_STRUCT) { struct_depth += n; - else + } else { rep_cnt = n; + } } break; case ST_FLD_STRUCT: struct_depth++; break; } @@ -161,19 +146,21 @@ __device__ void skip_struct_field(byte_stream_s* bs, int field_type) * @param chunk Column chunk the page belongs to * @return `kernel_mask_bits` value for the given page */ -__device__ uint32_t kernel_mask_for_page(gpu::PageInfo const& page, - gpu::ColumnChunkDesc const& chunk) +__device__ decode_kernel_mask kernel_mask_for_page(PageInfo const& page, + ColumnChunkDesc const& chunk) { - if (page.flags & 
PAGEINFO_FLAGS_DICTIONARY) { return 0; } + if (page.flags & PAGEINFO_FLAGS_DICTIONARY) { return decode_kernel_mask::NONE; } if (page.encoding == Encoding::DELTA_BINARY_PACKED) { - return KERNEL_MASK_DELTA_BINARY; + return decode_kernel_mask::DELTA_BINARY; + } else if (page.encoding == Encoding::DELTA_BYTE_ARRAY) { + return decode_kernel_mask::DELTA_BYTE_ARRAY; } else if (is_string_col(chunk)) { - return KERNEL_MASK_STRING; + return decode_kernel_mask::STRING; } // non-string, non-delta - return KERNEL_MASK_GENERAL; + return decode_kernel_mask::GENERAL; } /** @@ -359,16 +346,20 @@ struct gpuParsePageHeader { */ // blockDim {128,1,1} __global__ void __launch_bounds__(128) - gpuDecodePageHeaders(ColumnChunkDesc* chunks, int32_t num_chunks) + gpuDecodePageHeaders(ColumnChunkDesc* chunks, int32_t num_chunks, int32_t* error_code) { + using cudf::detail::warp_size; gpuParsePageHeader parse_page_header; __shared__ byte_stream_s bs_g[4]; - int lane_id = threadIdx.x % 32; - int chunk = (blockIdx.x * 4) + (threadIdx.x / 32); - byte_stream_s* const bs = &bs_g[threadIdx.x / 32]; + int32_t error[4] = {0}; + auto const lane_id = threadIdx.x % warp_size; + auto const warp_id = threadIdx.x / warp_size; + auto const chunk = (blockIdx.x * 4) + warp_id; + auto const bs = &bs_g[warp_id]; - if (chunk < num_chunks and lane_id == 0) bs->ck = chunks[chunk]; + if (chunk < num_chunks and lane_id == 0) { bs->ck = chunks[chunk]; } + if (lane_id == 0) { error[warp_id] = 0; } __syncthreads(); if (chunk < num_chunks) { @@ -379,7 +370,7 @@ __global__ void __launch_bounds__(128) int32_t num_dict_pages = bs->ck.num_dict_pages; PageInfo* page_info; - if (!lane_id) { + if (lane_id == 0) { bs->base = bs->cur = bs->ck.compressed_data; bs->end = bs->base + bs->ck.compressed_size; bs->page.chunk_idx = chunk; @@ -392,7 +383,9 @@ __global__ void __launch_bounds__(128) bs->page.skipped_values = -1; bs->page.skipped_leaf_values = 0; bs->page.str_bytes = 0; - bs->page.kernel_mask = 0; + bs->page.temp_string_size = 0; + bs->page.temp_string_buf = nullptr; + bs->page.kernel_mask = decode_kernel_mask::NONE; } num_values = bs->ck.num_values; page_info = bs->ck.page_info; @@ -415,6 +408,9 @@ __global__ void __launch_bounds__(128) bs->page.lvl_bytes[level_type::DEFINITION] = 0; bs->page.lvl_bytes[level_type::REPETITION] = 0; if (parse_page_header(bs) && bs->page.compressed_page_size >= 0) { + if (not is_supported_encoding(bs->page.encoding)) { + error[warp_id] |= static_cast(decode_error::UNSUPPORTED_ENCODING); + } switch (bs->page_type) { case PageType::DATA_PAGE: index_out = num_dict_pages + data_page_count; @@ -443,20 +439,25 @@ __global__ void __launch_bounds__(128) } bs->page.page_data = const_cast(bs->cur); bs->cur += bs->page.compressed_page_size; + if (bs->cur > bs->end) { + error[warp_id] |= static_cast(decode_error::DATA_STREAM_OVERRUN); + } bs->page.kernel_mask = kernel_mask_for_page(bs->page, bs->ck); } else { bs->cur = bs->end; } } index_out = shuffle(index_out); - if (index_out >= 0 && index_out < max_num_pages && lane_id == 0) + if (index_out >= 0 && index_out < max_num_pages && lane_id == 0) { page_info[index_out] = bs->page; + } num_values = shuffle(num_values); __syncwarp(); } if (lane_id == 0) { chunks[chunk].num_data_pages = data_page_count; chunks[chunk].num_dict_pages = dictionary_page_count; + if (error[warp_id] != 0) { set_error(error[warp_id], error_code); } } } } @@ -512,11 +513,12 @@ __global__ void __launch_bounds__(128) void __host__ DecodePageHeaders(ColumnChunkDesc* chunks, int32_t num_chunks, + int32_t* 
error_code, rmm::cuda_stream_view stream) { dim3 dim_block(128, 1); dim3 dim_grid((num_chunks + 3) >> 2, 1); // 1 chunk per warp, 4 warps per block - gpuDecodePageHeaders<<>>(chunks, num_chunks); + gpuDecodePageHeaders<<>>(chunks, num_chunks, error_code); } void __host__ BuildStringDictionaryIndex(ColumnChunkDesc* chunks, @@ -528,7 +530,4 @@ void __host__ BuildStringDictionaryIndex(ColumnChunkDesc* chunks, gpuBuildStringDictionaryIndex<<>>(chunks, num_chunks); } -} // namespace gpu -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index d79abe4a6d2..e9ac3657e36 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -14,23 +14,28 @@ * limitations under the License. */ +#include "delta_binary.cuh" #include "page_decode.cuh" #include "page_string_utils.cuh" #include +#include #include -namespace cudf { -namespace io { -namespace parquet { -namespace gpu { +#include +#include + +#include + +namespace cudf::io::parquet::detail { namespace { -constexpr int preprocess_block_size = 512; -constexpr int decode_block_size = 128; -constexpr int rolling_buf_size = decode_block_size * 2; -constexpr int preproc_buf_size = LEVEL_DECODE_BUF_SIZE; +constexpr int preprocess_block_size = 512; +constexpr int decode_block_size = 128; +constexpr int delta_preproc_block_size = 64; +constexpr int rolling_buf_size = decode_block_size * 2; +constexpr int preproc_buf_size = LEVEL_DECODE_BUF_SIZE; /** * @brief Compute the start and end page value bounds for this page @@ -136,6 +141,25 @@ __device__ thrust::pair page_bounds(page_state_s* const s, bool skipped_values_set = false; bool end_value_set = false; + // If page_start_row >= min_row, then skipped_values is 0 and we don't have to search for + // start_value. If there's repetition then we've already calculated + // skipped_values/skipped_leaf_values. + // TODO(ets): If we hit this condition, and end_row > last row in page, then we can skip + // more of the processing below. + if (has_repetition or page_start_row >= min_row) { + if (t == 0) { + if (has_repetition) { + skipped_values = pp->skipped_values; + skipped_leaf_values = pp->skipped_leaf_values; + } else { + skipped_values = 0; + skipped_leaf_values = 0; + } + } + skipped_values_set = true; + __syncthreads(); + } + while (processed < s->page.num_input_values) { thread_index_type start_val = processed; @@ -145,11 +169,6 @@ __device__ thrust::pair page_bounds(page_state_s* const s, // special case where page does not begin at a row boundary if (processed == 0 && rep_decode[0] != 0) { - if (t == 0) { - skipped_values = 0; - skipped_leaf_values = 0; - } - skipped_values_set = true; end_row++; // need to finish off the previous row row_fudge = 0; } @@ -453,12 +472,107 @@ __device__ size_t totalPlainEntriesSize(uint8_t const* data, } /** - * @brief Kernel for computing string page output size information. + * @brief Compute string size information for DELTA_BYTE_ARRAY encoded strings. + * + * This traverses the packed prefix and suffix lengths, summing them to obtain the total + * number of bytes needed for the decoded string data. It also calculates an upper bound + * for the largest string length to obtain an upper bound on temporary space needed if + * rows will be skipped. + * + * Called with 64 threads. 
+ * + * @param data Pointer to the start of the page data stream + * @param end Pointer to the end of the page data stream + * @param start_value Do not count values that occur before this index + * @param end_value Do not count values that occur after this index + * @return A pair of `size_t` values representing the total string size and temp buffer size + * required for decoding + */ +__device__ thrust::pair totalDeltaByteArraySize(uint8_t const* data, + uint8_t const* end, + int start_value, + int end_value) +{ + using cudf::detail::warp_size; + using WarpReduce = cub::WarpReduce; + __shared__ typename WarpReduce::TempStorage temp_storage[2]; + + __shared__ __align__(16) delta_binary_decoder prefixes; + __shared__ __align__(16) delta_binary_decoder suffixes; + + int const t = threadIdx.x; + int const lane_id = t % warp_size; + int const warp_id = t / warp_size; + + if (t == 0) { + auto const* suffix_start = prefixes.find_end_of_block(data, end); + suffixes.init_binary_block(suffix_start, end); + } + __syncthreads(); + + // two warps will traverse the prefixes and suffixes and sum them up + auto const db = t < warp_size ? &prefixes : t < 2 * warp_size ? &suffixes : nullptr; + + size_t total_bytes = 0; + uleb128_t max_len = 0; + + if (db != nullptr) { + // initialize with first value (which is stored in last_value) + if (lane_id == 0 && start_value == 0) { total_bytes = db->last_value; } + + uleb128_t lane_sum = 0; + uleb128_t lane_max = 0; + while (db->current_value_idx < end_value && + db->current_value_idx < db->num_encoded_values(true)) { + // calculate values for current mini-block + db->calc_mini_block_values(lane_id); + + // get per lane sum for mini-block + for (uint32_t i = 0; i < db->values_per_mb; i += 32) { + uint32_t const idx = db->current_value_idx + i + lane_id; + if (idx >= start_value && idx < end_value && idx < db->value_count) { + lane_sum += db->value[rolling_index(idx)]; + lane_max = max(lane_max, db->value[rolling_index(idx)]); + } + } + + if (lane_id == 0) { db->setup_next_mini_block(true); } + __syncwarp(); + } + + // get sum for warp. + // note: warp_sum will only be valid on lane 0. + auto const warp_sum = WarpReduce(temp_storage[warp_id]).Sum(lane_sum); + auto const warp_max = WarpReduce(temp_storage[warp_id]).Reduce(lane_max, cub::Max()); + + if (lane_id == 0) { + total_bytes += warp_sum; + max_len = warp_max; + } + } + __syncthreads(); + + // now sum up total_bytes from the two warps + auto const final_bytes = + cudf::detail::single_lane_block_sum_reduce(total_bytes); + + // Sum up prefix and suffix max lengths to get a max possible string length. Multiply that + // by the number of strings in a mini-block, plus one to save the last string. + auto const temp_bytes = + cudf::detail::single_lane_block_sum_reduce(max_len) * + (db->values_per_mb + 1); + + return {final_bytes, temp_bytes}; +} + +/** + * @brief Kernel for computing string page bounds information. * - * String columns need accurate data size information to preallocate memory in the column buffer to - * store the char data. This calls a kernel to calculate information needed by the string decoding - * kernel. On exit, the `str_bytes`, `num_nulls`, and `num_valids` fields of the PageInfo struct - * are updated. This call ignores non-string columns. + * This kernel traverses the repetition and definition level data to determine start and end values + * for pages with string-like data. Also calculates the number of null and valid values in the + * page. 
Does nothing if the page mask is neither `STRING` nor `DELTA_BYTE_ARRAY`. On exit the + * `num_nulls`, `num_valids`, `start_val` and `end_val` fields of the `PageInfo` struct will be + * populated. * * @param pages All pages to be decoded * @param chunks All chunks to be decoded @@ -467,7 +581,7 @@ __device__ size_t totalPlainEntriesSize(uint8_t const* data, * @tparam level_t Type used to store decoded repetition and definition levels */ template -__global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSizes( +__global__ void __launch_bounds__(preprocess_block_size) gpuComputeStringPageBounds( PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) { __shared__ __align__(16) page_state_s state_g; @@ -477,8 +591,13 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz int const t = threadIdx.x; PageInfo* const pp = &pages[page_idx]; - // reset str_bytes to 0 in case it's already been calculated - if (t == 0) { pp->str_bytes = 0; } + if (t == 0) { + s->page.num_nulls = 0; + s->page.num_valids = 0; + // reset str_bytes to 0 in case it's already been calculated (esp needed for chunked reads). + // TODO: need to rethink this once str_bytes is in the statistics + pp->str_bytes = 0; + } // whether or not we have repetition levels (lists) bool const has_repetition = chunks[pp->chunk_idx].max_level[level_type::REPETITION] > 0; @@ -494,23 +613,11 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz {rep_runs}}; // setup page info - if (!setupLocalPageInfo( - s, pp, chunks, min_row, num_rows, mask_filter{KERNEL_MASK_STRING}, false)) { - return; - } - - if (!t) { - s->page.num_nulls = 0; - s->page.num_valids = 0; - s->page.str_bytes = 0; - } - __syncthreads(); + auto const mask = BitOr(decode_kernel_mask::STRING, decode_kernel_mask::DELTA_BYTE_ARRAY); + if (!setupLocalPageInfo(s, pp, chunks, min_row, num_rows, mask_filter{mask}, true)) { return; } bool const is_bounds_pg = is_bounds_page(s, min_row, num_rows, has_repetition); - // if we're skipping this page anyway, no need to count it - if (!is_bounds_pg && !is_page_contained(s, min_row, num_rows)) { return; } - // find start/end value indices auto const [start_value, end_value] = page_bounds(s, min_row, num_rows, is_bounds_pg, has_repetition, decoders); @@ -519,8 +626,107 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz if (t == 0) { pp->num_nulls = s->page.num_nulls; pp->num_valids = s->page.num_valids; + pp->start_val = start_value; + pp->end_val = end_value; + } +} + +/** + * @brief Kernel for computing string page output size information for delta_byte_array encoding. + * + * This call ignores columns that are not DELTA_BYTE_ARRAY encoded. On exit the `str_bytes` field + * of the `PageInfo` struct will be populated. Also fills in the `temp_string_size` field if rows + * are to be skipped. 
+ * + * @param pages All pages to be decoded + * @param chunks All chunks to be decoded + * @param min_rows crop all rows below min_row + * @param num_rows Maximum number of rows to read + */ +__global__ void __launch_bounds__(delta_preproc_block_size) gpuComputeDeltaPageStringSizes( + PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) +{ + __shared__ __align__(16) page_state_s state_g; + + page_state_s* const s = &state_g; + int const page_idx = blockIdx.x; + int const t = threadIdx.x; + PageInfo* const pp = &pages[page_idx]; + + // whether or not we have repetition levels (lists) + bool const has_repetition = chunks[pp->chunk_idx].max_level[level_type::REPETITION] > 0; + + // setup page info + auto const mask = decode_kernel_mask::DELTA_BYTE_ARRAY; + if (!setupLocalPageInfo(s, pp, chunks, min_row, num_rows, mask_filter{mask}, true)) { return; } + + auto const start_value = pp->start_val; + + // if data size is known, can short circuit here + if ((chunks[pp->chunk_idx].data_type & 7) == FIXED_LEN_BYTE_ARRAY) { + if (t == 0) { + pp->str_bytes = pp->num_valids * s->dtype_len_in; + + // only need temp space if we're skipping values + if (start_value > 0) { + // just need to parse the header of the first delta binary block to get values_per_mb + delta_binary_decoder db; + db.init_binary_block(s->data_start, s->data_end); + // save enough for one mini-block plus some extra to save the last_string + pp->temp_string_size = s->dtype_len_in * (db.values_per_mb + 1); + } + } + } else { + // now process string info in the range [start_value, end_value) + // set up for decoding strings...can be either plain or dictionary + uint8_t const* data = s->data_start; + uint8_t const* const end = s->data_end; + auto const end_value = pp->end_val; + + auto const [len, temp_bytes] = totalDeltaByteArraySize(data, end, start_value, end_value); + + if (t == 0) { + // TODO check for overflow + pp->str_bytes = len; + + // only need temp space if we're skipping values + if (start_value > 0) { pp->temp_string_size = temp_bytes; } + } + } +} + +/** + * @brief Kernel for computing string page output size information. + * + * This call ignores non-string columns. On exit the `str_bytes` field of the `PageInfo` struct will + * be populated. 
+ * + * @param pages All pages to be decoded + * @param chunks All chunks to be decoded + * @param min_rows crop all rows below min_row + * @param num_rows Maximum number of rows to read + */ +__global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSizes( + PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) +{ + __shared__ __align__(16) page_state_s state_g; + + page_state_s* const s = &state_g; + int const page_idx = blockIdx.x; + int const t = threadIdx.x; + PageInfo* const pp = &pages[page_idx]; + + // whether or not we have repetition levels (lists) + bool const has_repetition = chunks[pp->chunk_idx].max_level[level_type::REPETITION] > 0; + + // setup page info + if (!setupLocalPageInfo( + s, pp, chunks, min_row, num_rows, mask_filter{decode_kernel_mask::STRING}, true)) { + return; } + bool const is_bounds_pg = is_bounds_page(s, min_row, num_rows, has_repetition); + auto const& col = s->col; size_t str_bytes = 0; // short circuit for FIXED_LEN_BYTE_ARRAY @@ -533,6 +739,8 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz uint8_t const* const end = s->data_end; uint8_t const* dict_base = nullptr; int dict_size = 0; + auto const start_value = pp->start_val; + auto const end_value = pp->end_val; switch (pp->encoding) { case Encoding::PLAIN_DICTIONARY: @@ -564,6 +772,9 @@ __global__ void __launch_bounds__(preprocess_block_size) gpuComputePageStringSiz if (t == 0) { // TODO check for overflow pp->str_bytes = str_bytes; + + // only need temp space for delta + pp->temp_string_size = 0; } } @@ -589,6 +800,7 @@ __global__ void __launch_bounds__(decode_block_size) size_t num_rows, int32_t* error_code) { + using cudf::detail::warp_size; __shared__ __align__(16) page_state_s state_g; __shared__ __align__(4) size_type last_offset; __shared__ __align__(16) @@ -599,10 +811,12 @@ __global__ void __launch_bounds__(decode_block_size) auto* const sb = &state_buffers; int const page_idx = blockIdx.x; int const t = threadIdx.x; + int const lane_id = t % warp_size; [[maybe_unused]] null_count_back_copier _{s, t}; + auto const mask = decode_kernel_mask::STRING; if (!setupLocalPageInfo( - s, &pages[page_idx], chunks, min_row, num_rows, mask_filter{KERNEL_MASK_STRING}, true)) { + s, &pages[page_idx], chunks, min_row, num_rows, mask_filter{mask}, true)) { return; } @@ -633,6 +847,7 @@ __global__ void __launch_bounds__(decode_block_size) target_pos = min(s->nz_count, src_pos + decode_block_size - out_thread0); if (out_thread0 > 32) { target_pos = min(target_pos, s->dict_pos); } } + // TODO(ets): see if this sync can be removed __syncthreads(); if (t < 32) { // decode repetition and definition levels. 
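[editor's note] The preprocessing kernels above size DELTA_BYTE_ARRAY pages by walking the two delta-binary blocks of prefix and suffix lengths and summing them, since each decoded string is a prefix shared with the previous string followed by its own suffix. A host-side sketch of that size computation, assuming the length arrays have already been unpacked (the real `totalDeltaByteArraySize` sums them warp-parallel, mini-block by mini-block, and also tracks the maximum length for the temp buffer):

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical, already-unpacked prefix/suffix lengths for one page.
std::size_t delta_byte_array_bytes(std::vector<uint32_t> const& prefix_len,
                                   std::vector<uint32_t> const& suffix_len,
                                   std::size_t start_value,
                                   std::size_t end_value)
{
  std::size_t total = 0;
  for (std::size_t i = start_value; i < end_value and i < prefix_len.size(); ++i) {
    // decoded string i = first prefix_len[i] bytes of string i-1, then suffix i
    total += prefix_len[i] + suffix_len[i];
  }
  return total;
}

int main()
{
  // "hello", "help", "helpful" encoded as prefix/suffix length pairs
  std::vector<uint32_t> const prefix{0, 3, 4};
  std::vector<uint32_t> const suffix{5, 1, 3};
  return static_cast<int>(delta_byte_array_bytes(prefix, suffix, 0, 3));  // 16 bytes total
}
```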
@@ -646,9 +861,9 @@ __global__ void __launch_bounds__(decode_block_size) // WARP1: Decode dictionary indices, booleans or string positions if (s->dict_base) { - src_target_pos = gpuDecodeDictionaryIndices(s, sb, src_target_pos, t & 0x1f).first; + src_target_pos = gpuDecodeDictionaryIndices(s, sb, src_target_pos, lane_id).first; } else { - gpuInitStringDescriptors(s, sb, src_target_pos, t & 0x1f); + gpuInitStringDescriptors(s, sb, src_target_pos, lane_id); } if (t == 32) { *(volatile int32_t*)&s->dict_pos = src_target_pos; } } else { @@ -748,37 +963,108 @@ __global__ void __launch_bounds__(decode_block_size) auto const offptr = reinterpret_cast(nesting_info_base[leaf_level_index].data_out); block_excl_sum(offptr, value_count, s->page.str_offset); - if (t == 0 and s->error != 0) { - cuda::atomic_ref ref{*error_code}; - ref.fetch_or(s->error, cuda::std::memory_order_relaxed); - } + if (t == 0 and s->error != 0) { set_error(s->error, error_code); } } +// Functor used to set the `temp_string_buf` pointer for each page. `data` points to a buffer +// to be used when skipping rows in the delta_byte_array decoder. Given a page and an offset, +// set the page's `temp_string_buf` to be `data + offset`. +struct page_tform_functor { + uint8_t* const data; + + __device__ PageInfo operator()(PageInfo& page, int64_t offset) + { + if (page.temp_string_size != 0) { page.temp_string_buf = data + offset; } + return page; + } +}; + } // anonymous namespace /** - * @copydoc cudf::io::parquet::gpu::ComputePageStringSizes + * @copydoc cudf::io::parquet::detail::ComputePageStringSizes */ void ComputePageStringSizes(cudf::detail::hostdevice_vector& pages, cudf::detail::hostdevice_vector const& chunks, + rmm::device_uvector& temp_string_buf, size_t min_row, size_t num_rows, int level_type_size, + uint32_t kernel_mask, rmm::cuda_stream_view stream) { - dim3 dim_block(preprocess_block_size, 1); - dim3 dim_grid(pages.size(), 1); // 1 threadblock per page + dim3 const dim_block(preprocess_block_size, 1); + dim3 const dim_grid(pages.size(), 1); // 1 threadblock per page if (level_type_size == 1) { - gpuComputePageStringSizes + gpuComputeStringPageBounds <<>>(pages.device_ptr(), chunks, min_row, num_rows); } else { - gpuComputePageStringSizes + gpuComputeStringPageBounds <<>>(pages.device_ptr(), chunks, min_row, num_rows); } + + // kernel mask may contain other kernels we don't need to count + int const count_mask = + kernel_mask & BitOr(decode_kernel_mask::DELTA_BYTE_ARRAY, decode_kernel_mask::STRING); + int const nkernels = std::bitset<32>(count_mask).count(); + auto const streams = cudf::detail::fork_streams(stream, nkernels); + + int s_idx = 0; + if (BitAnd(kernel_mask, decode_kernel_mask::DELTA_BYTE_ARRAY) != 0) { + dim3 dim_delta(delta_preproc_block_size, 1); + gpuComputeDeltaPageStringSizes<<>>( + pages.device_ptr(), chunks, min_row, num_rows); + } + if (BitAnd(kernel_mask, decode_kernel_mask::STRING) != 0) { + gpuComputePageStringSizes<<>>( + pages.device_ptr(), chunks, min_row, num_rows); + } + + // synchronize the streams + cudf::detail::join_streams(streams, stream); + + // check for needed temp space for DELTA_BYTE_ARRAY + auto const need_sizes = thrust::any_of( + rmm::exec_policy(stream), pages.d_begin(), pages.d_end(), [] __device__(auto& page) { + return page.temp_string_size != 0; + }); + + if (need_sizes) { + // sum up all of the temp_string_sizes + auto const page_sizes = [] __device__(PageInfo const& page) { return page.temp_string_size; }; + auto const total_size = 
thrust::transform_reduce(rmm::exec_policy(stream), + pages.d_begin(), + pages.d_end(), + page_sizes, + 0L, + thrust::plus{}); + + // now do an exclusive scan over the temp_string_sizes to get offsets for each + // page's chunk of the temp buffer + rmm::device_uvector page_string_offsets(pages.size(), stream); + thrust::transform_exclusive_scan(rmm::exec_policy_nosync(stream), + pages.d_begin(), + pages.d_end(), + page_string_offsets.begin(), + page_sizes, + 0L, + thrust::plus{}); + + // allocate the temp space + temp_string_buf.resize(total_size, stream); + + // now use the offsets array to set each page's temp_string_buf pointers + thrust::transform(rmm::exec_policy_nosync(stream), + pages.d_begin(), + pages.d_end(), + page_string_offsets.begin(), + pages.d_begin(), + page_tform_functor{temp_string_buf.data()}); + } } /** - * @copydoc cudf::io::parquet::gpu::DecodeStringPageData + * @copydoc cudf::io::parquet::detail::DecodeStringPageData */ void __host__ DecodeStringPageData(cudf::detail::hostdevice_vector& pages, cudf::detail::hostdevice_vector const& chunks, @@ -802,7 +1088,4 @@ void __host__ DecodeStringPageData(cudf::detail::hostdevice_vector& pa } } -} // namespace gpu -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_string_utils.cuh b/cpp/src/io/parquet/page_string_utils.cuh index 9395599b3ff..a81d0a64466 100644 --- a/cpp/src/io/parquet/page_string_utils.cuh +++ b/cpp/src/io/parquet/page_string_utils.cuh @@ -18,7 +18,7 @@ #include -namespace cudf::io::parquet::gpu { +namespace cudf::io::parquet::detail { // stole this from cudf/strings/detail/gather.cuh. modified to run on a single string on one warp. // copies from src to dst in 16B chunks per thread. @@ -107,4 +107,4 @@ __device__ void block_excl_sum(size_type* arr, size_type length, size_type initi } } -} // namespace cudf::io::parquet::gpu +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index c2affc774c2..9ab686b99d5 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -18,6 +18,8 @@ #include "parquet_common.hpp" +#include + #include #include @@ -25,9 +27,8 @@ #include #include -namespace cudf { -namespace io { -namespace parquet { +namespace cudf::io::parquet::detail { + constexpr uint32_t parquet_magic = (('P' << 0) | ('A' << 8) | ('R' << 16) | ('1' << 24)); /** @@ -45,79 +46,102 @@ struct file_ender_s { uint32_t magic; }; -// thrift generated code simplified. -struct StringType {}; -struct MapType {}; -struct ListType {}; -struct EnumType {}; +// thrift inspired code simplified. 
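[editor's note] `ComputePageStringSizes` above carves one shared scratch allocation into per-page slices: it sums every page's `temp_string_size`, exclusive-scans those sizes into offsets, and points each page's `temp_string_buf` at `base + offset` (the `page_tform_functor`). A serial host sketch of the same sub-allocation pattern; the `page` struct is illustrative, and the real code does this on device with `thrust::transform_reduce` and `thrust::transform_exclusive_scan`.

```cpp
#include <cstdint>
#include <vector>

struct page {
  int64_t temp_string_size = 0;
  uint8_t* temp_string_buf = nullptr;
};

int main()
{
  std::vector<page> pages{{16, nullptr}, {0, nullptr}, {8, nullptr}};

  // exclusive scan of the per-page sizes gives each page's offset,
  // and the running total gives the size of the single allocation
  int64_t total = 0;
  std::vector<int64_t> offsets(pages.size());
  for (std::size_t i = 0; i < pages.size(); ++i) {
    offsets[i] = total;
    total += pages[i].temp_string_size;
  }
  std::vector<uint8_t> buf(static_cast<std::size_t>(total));  // device_uvector in the real code

  // each page that needs scratch space points at its slice of the buffer
  for (std::size_t i = 0; i < pages.size(); ++i) {
    if (pages[i].temp_string_size != 0) { pages[i].temp_string_buf = buf.data() + offsets[i]; }
  }
  return static_cast<int>(total);  // 24
}
```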
struct DecimalType { int32_t scale = 0; int32_t precision = 0; }; -struct DateType {}; - -struct MilliSeconds {}; -struct MicroSeconds {}; -struct NanoSeconds {}; -using TimeUnit_isset = struct TimeUnit_isset { - bool MILLIS{false}; - bool MICROS{false}; - bool NANOS{false}; -}; struct TimeUnit { - TimeUnit_isset isset; - MilliSeconds MILLIS; - MicroSeconds MICROS; - NanoSeconds NANOS; + enum Type { UNDEFINED, MILLIS, MICROS, NANOS }; + Type type; }; struct TimeType { - bool isAdjustedToUTC = false; - TimeUnit unit; + // Default to true because the timestamps are implicitly in UTC + // Writer option overrides this default + bool isAdjustedToUTC = true; + TimeUnit unit = {TimeUnit::MILLIS}; }; + struct TimestampType { - bool isAdjustedToUTC = false; - TimeUnit unit; + // Default to true because the timestamps are implicitly in UTC + // Writer option overrides this default + bool isAdjustedToUTC = true; + TimeUnit unit = {TimeUnit::MILLIS}; }; + struct IntType { int8_t bitWidth = 0; bool isSigned = false; }; -struct NullType {}; -struct JsonType {}; -struct BsonType {}; - -// thrift generated code simplified. -using LogicalType_isset = struct LogicalType_isset { - bool STRING{false}; - bool MAP{false}; - bool LIST{false}; - bool ENUM{false}; - bool DECIMAL{false}; - bool DATE{false}; - bool TIME{false}; - bool TIMESTAMP{false}; - bool INTEGER{false}; - bool UNKNOWN{false}; - bool JSON{false}; - bool BSON{false}; -}; struct LogicalType { - LogicalType_isset isset; - StringType STRING; - MapType MAP; - ListType LIST; - EnumType ENUM; - DecimalType DECIMAL; - DateType DATE; - TimeType TIME; - TimestampType TIMESTAMP; - IntType INTEGER; - NullType UNKNOWN; - JsonType JSON; - BsonType BSON; + enum Type { + UNDEFINED, + STRING, + MAP, + LIST, + ENUM, + DECIMAL, + DATE, + TIME, + TIMESTAMP, + // 9 is reserved + INTEGER = 10, + UNKNOWN, + JSON, + BSON + }; + Type type; + thrust::optional decimal_type; + thrust::optional time_type; + thrust::optional timestamp_type; + thrust::optional int_type; + + LogicalType(Type tp = UNDEFINED) : type(tp) {} + LogicalType(DecimalType&& dt) : type(DECIMAL), decimal_type(dt) {} + LogicalType(TimeType&& tt) : type(TIME), time_type(tt) {} + LogicalType(TimestampType&& tst) : type(TIMESTAMP), timestamp_type(tst) {} + LogicalType(IntType&& it) : type(INTEGER), int_type(it) {} + + constexpr bool is_time_millis() const + { + return type == TIME and time_type->unit.type == TimeUnit::MILLIS; + } + + constexpr bool is_time_micros() const + { + return type == TIME and time_type->unit.type == TimeUnit::MICROS; + } + + constexpr bool is_time_nanos() const + { + return type == TIME and time_type->unit.type == TimeUnit::NANOS; + } + + constexpr bool is_timestamp_millis() const + { + return type == TIMESTAMP and timestamp_type->unit.type == TimeUnit::MILLIS; + } + + constexpr bool is_timestamp_micros() const + { + return type == TIMESTAMP and timestamp_type->unit.type == TimeUnit::MICROS; + } + + constexpr bool is_timestamp_nanos() const + { + return type == TIMESTAMP and timestamp_type->unit.type == TimeUnit::NANOS; + } + + constexpr int8_t bit_width() const { return type == INTEGER ? int_type->bitWidth : -1; } + + constexpr bool is_signed() const { return type == INTEGER and int_type->isSigned; } + + constexpr int32_t scale() const { return type == DECIMAL ? decimal_type->scale : -1; } + + constexpr int32_t precision() const { return type == DECIMAL ? 
decimal_type->precision : -1; } }; /** @@ -126,8 +150,6 @@ struct LogicalType { struct ColumnOrder { enum Type { UNDEFINED, TYPE_ORDER }; Type type; - - operator Type() const { return type; } }; /** @@ -137,24 +159,35 @@ struct ColumnOrder { * as a schema tree. */ struct SchemaElement { - Type type = UNDEFINED_TYPE; - ConvertedType converted_type = UNKNOWN; - LogicalType logical_type; - int32_t type_length = - 0; // Byte length of FIXED_LENGTH_BYTE_ARRAY elements, or maximum bit length for other types + // 1: parquet physical type for output + Type type = UNDEFINED_TYPE; + // 2: byte length of FIXED_LENGTH_BYTE_ARRAY elements, or maximum bit length for other types + int32_t type_length = 0; + // 3: repetition of the field FieldRepetitionType repetition_type = REQUIRED; - std::string name = ""; - int32_t num_children = 0; - int32_t decimal_scale = 0; - int32_t decimal_precision = 0; - thrust::optional field_id = thrust::nullopt; - bool output_as_byte_array = false; + // 4: name of the field + std::string name = ""; + // 5: nested fields + int32_t num_children = 0; + // 6: DEPRECATED: record the original type before conversion to parquet type + thrust::optional converted_type; + // 7: DEPRECATED: record the scale for DECIMAL converted type + int32_t decimal_scale = 0; + // 8: DEPRECATED: record the precision for DECIMAL converted type + int32_t decimal_precision = 0; + // 9: save field_id from original schema + thrust::optional field_id; + // 10: replaces converted type + thrust::optional logical_type; + + // extra cudf specific fields + bool output_as_byte_array = false; // The following fields are filled in later during schema initialization int max_definition_level = 0; int max_repetition_level = 0; - int parent_idx = 0; - std::vector children_idx; + size_type parent_idx = 0; + std::vector children_idx; bool operator==(SchemaElement const& other) const { @@ -206,7 +239,7 @@ struct SchemaElement { { return type == UNDEFINED_TYPE && // this assumption might be a little weak. 
- ((repetition_type != REPEATED) || (repetition_type == REPEATED && num_children == 2)); + ((repetition_type != REPEATED) || (repetition_type == REPEATED && num_children > 1)); } }; @@ -214,12 +247,18 @@ struct SchemaElement { * @brief Thrift-derived struct describing column chunk statistics */ struct Statistics { - std::vector max; // deprecated max value in signed comparison order - std::vector min; // deprecated min value in signed comparison order - int64_t null_count = -1; // count of null values in the column - int64_t distinct_count = -1; // count of distinct values occurring - std::vector max_value; // max value for column determined by ColumnOrder - std::vector min_value; // min value for column determined by ColumnOrder + // deprecated max value in signed comparison order + thrust::optional> max; + // deprecated min value in signed comparison order + thrust::optional> min; + // count of null values in the column + thrust::optional null_count; + // count of distinct values occurring + thrust::optional distinct_count; + // max value for column determined by ColumnOrder + thrust::optional> max_value; + // min value for column determined by ColumnOrder + thrust::optional> min_value; }; /** @@ -405,6 +444,4 @@ static inline int CountLeadingZeros32(uint32_t value) #endif } -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/parquet_common.hpp b/cpp/src/io/parquet/parquet_common.hpp index 5a1716bb547..50736197eb9 100644 --- a/cpp/src/io/parquet/parquet_common.hpp +++ b/cpp/src/io/parquet/parquet_common.hpp @@ -18,9 +18,8 @@ #include -namespace cudf { -namespace io { -namespace parquet { +namespace cudf::io::parquet::detail { + // Max decimal precisions according to the parquet spec: // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#decimal auto constexpr MAX_DECIMAL32_PRECISION = 9; @@ -156,6 +155,4 @@ enum FieldType { ST_FLD_STRUCT = 12, }; -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/parquet_gpu.cuh b/cpp/src/io/parquet/parquet_gpu.cuh index dc74bee1536..10e12ebb782 100644 --- a/cpp/src/io/parquet/parquet_gpu.cuh +++ b/cpp/src/io/parquet/parquet_gpu.cuh @@ -23,7 +23,7 @@ #include -namespace cudf::io::parquet::gpu { +namespace cudf::io::parquet::detail { auto constexpr KEY_SENTINEL = size_type{-1}; auto constexpr VALUE_SENTINEL = size_type{-1}; @@ -81,4 +81,4 @@ inline size_type __device__ row_to_value_idx(size_type idx, return idx; } -} // namespace cudf::io::parquet::gpu +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 51c862b376b..129d4e4d28c 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -31,11 +31,14 @@ #include #include +#include + #include +#include #include -namespace cudf::io::parquet { +namespace cudf::io::parquet::detail { using cudf::io::detail::string_index_pair; @@ -54,19 +57,46 @@ constexpr int rolling_index(int index) return index % rolling_size; } +// see setupLocalPageInfo() in page_decode.cuh for supported page encodings +constexpr bool is_supported_encoding(Encoding enc) +{ + switch (enc) { + case Encoding::PLAIN: + case Encoding::PLAIN_DICTIONARY: + case Encoding::RLE: + case Encoding::RLE_DICTIONARY: + case Encoding::DELTA_BINARY_PACKED: + case Encoding::DELTA_BYTE_ARRAY: return true; + default: return false; + } +} + +/** + * 
@brief Atomically OR `error` into `error_code`. + */ +constexpr void set_error(int32_t error, int32_t* error_code) +{ + if (error != 0) { + cuda::atomic_ref ref{*error_code}; + ref.fetch_or(error, cuda::std::memory_order_relaxed); + } +} + /** * @brief Enum for the different types of errors that can occur during decoding. * * These values are used as bitmasks, so they must be powers of 2. */ enum class decode_error : int32_t { - DATA_STREAM_OVERRUN = 0x1, - LEVEL_STREAM_OVERRUN = 0x2, - UNSUPPORTED_ENCODING = 0x4, - INVALID_LEVEL_RUN = 0x8, - INVALID_DATA_TYPE = 0x10, - EMPTY_PAGE = 0x20, - INVALID_DICT_WIDTH = 0x40, + DATA_STREAM_OVERRUN = 0x1, + LEVEL_STREAM_OVERRUN = 0x2, + UNSUPPORTED_ENCODING = 0x4, + INVALID_LEVEL_RUN = 0x8, + INVALID_DATA_TYPE = 0x10, + EMPTY_PAGE = 0x20, + INVALID_DICT_WIDTH = 0x40, + DELTA_PARAM_MISMATCH = 0x80, + DELTA_PARAMS_UNSUPPORTED = 0x100, }; /** @@ -88,7 +118,47 @@ struct input_column_info { auto nesting_depth() const { return nesting.size(); } }; -namespace gpu { +// The delta encodings use ULEB128 integers, but parquet only uses max 64 bits. +using uleb128_t = uint64_t; +using zigzag128_t = int64_t; + +// this is in C++23 +#if !defined(__cpp_lib_is_scoped_enum) +template > +struct is_scoped_enum { + static const bool value = not std::is_convertible_v>; +}; + +template +struct is_scoped_enum { + static const bool value = false; +}; +#else +using std::is_scoped_enum; +#endif + +// helpers to do bit operations on scoped enums +template ::value and std::is_same_v) or + (is_scoped_enum::value and std::is_same_v) or + (is_scoped_enum::value and std::is_same_v)>* = + nullptr> +constexpr uint32_t BitAnd(T1 a, T2 b) +{ + return static_cast(a) & static_cast(b); +} + +template ::value and std::is_same_v) or + (is_scoped_enum::value and std::is_same_v) or + (is_scoped_enum::value and std::is_same_v)>* = + nullptr> +constexpr uint32_t BitOr(T1 a, T2 b) +{ + return static_cast(a) | static_cast(b); +} /** * @brief Enums for the flags in the page header @@ -113,10 +183,12 @@ enum level_type { * * Used to control which decode kernels to run. 
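[editor's note] Because `decode_kernel_mask` (declared just below) and `encode_kernel_mask` are scoped enums, they no longer convert implicitly to integers, which is why the header adds the `BitAnd`/`BitOr` helpers shown above. A small self-contained sketch of that idiom, simplified by dropping the SFINAE constraint and using an illustrative subset of the enum:

```cpp
#include <cstdint>

enum class decode_kernel_mask : uint32_t {
  NONE    = 0,
  GENERAL = 1 << 0,
  STRING  = 1 << 1,
};

// Scoped enums need explicit casts; wrapping the casts keeps call sites readable.
template <typename T1, typename T2>
constexpr uint32_t BitOr(T1 a, T2 b)
{
  return static_cast<uint32_t>(a) | static_cast<uint32_t>(b);
}

template <typename T1, typename T2>
constexpr uint32_t BitAnd(T1 a, T2 b)
{
  return static_cast<uint32_t>(a) & static_cast<uint32_t>(b);
}

int main()
{
  uint32_t const mask = BitOr(decode_kernel_mask::GENERAL, decode_kernel_mask::STRING);
  // launch the string decoder only if some page requested it
  bool const need_string_kernel = BitAnd(mask, decode_kernel_mask::STRING) != 0;
  return need_string_kernel ? 0 : 1;
}
```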
*/ -enum kernel_mask_bits { - KERNEL_MASK_GENERAL = (1 << 0), // Run catch-all decode kernel - KERNEL_MASK_STRING = (1 << 1), // Run decode kernel for string data - KERNEL_MASK_DELTA_BINARY = (1 << 2) // Run decode kernel for DELTA_BINARY_PACKED data +enum class decode_kernel_mask { + NONE = 0, + GENERAL = (1 << 0), // Run catch-all decode kernel + STRING = (1 << 1), // Run decode kernel for string data + DELTA_BINARY = (1 << 2), // Run decode kernel for DELTA_BINARY_PACKED data + DELTA_BYTE_ARRAY = (1 << 3) // Run decode kernel for DELTA_BYTE_ARRAY encoded data }; /** @@ -197,9 +269,11 @@ struct PageInfo { int32_t num_input_values; int32_t chunk_row; // starting row of this page relative to the start of the chunk int32_t num_rows; // number of rows in this page - // the next two are calculated in gpuComputePageStringSizes + // the next four are calculated in gpuComputePageStringSizes int32_t num_nulls; // number of null values (V2 header), but recalculated for string cols int32_t num_valids; // number of non-null values, taking into account skip_rows/num_rows + int32_t start_val; // index of first value of the string data stream to use + int32_t end_val; // index of last value in string data stream int32_t chunk_idx; // column chunk this page belongs to int32_t src_col_schema; // schema index of this column uint8_t flags; // PAGEINFO_FLAGS_XXX @@ -236,7 +310,11 @@ struct PageInfo { // level decode buffers uint8_t* lvl_decode_buf[level_type::NUM_LEVEL_TYPES]; - uint32_t kernel_mask; + // temporary space for decoding DELTA_BYTE_ARRAY encoded strings + int64_t temp_string_size; + uint8_t* temp_string_buf; + + decode_kernel_mask kernel_mask; }; /** @@ -258,7 +336,7 @@ struct ColumnChunkDesc { uint8_t rep_level_bits_, int8_t codec_, int8_t converted_type_, - LogicalType logical_type_, + thrust::optional logical_type_, int8_t decimal_precision_, int32_t ts_clock_rate_, int32_t src_col_index_, @@ -300,99 +378,26 @@ struct ColumnChunkDesc { uint16_t data_type{}; // basic column data type, ((type_length << 3) | // parquet::Type) uint8_t - level_bits[level_type::NUM_LEVEL_TYPES]{}; // bits to encode max definition/repetition levels - int32_t num_data_pages{}; // number of data pages - int32_t num_dict_pages{}; // number of dictionary pages - int32_t max_num_pages{}; // size of page_info array - PageInfo* page_info{}; // output page info for up to num_dict_pages + - // num_data_pages (dictionary pages first) - string_index_pair* str_dict_index{}; // index for string dictionary - bitmask_type** valid_map_base{}; // base pointers of valid bit map for this column - void** column_data_base{}; // base pointers of column data - void** column_string_base{}; // base pointers of column string data - int8_t codec{}; // compressed codec enum - int8_t converted_type{}; // converted type enum - LogicalType logical_type{}; // logical type - int8_t decimal_precision{}; // Decimal precision + level_bits[level_type::NUM_LEVEL_TYPES]{}; // bits to encode max definition/repetition levels + int32_t num_data_pages{}; // number of data pages + int32_t num_dict_pages{}; // number of dictionary pages + int32_t max_num_pages{}; // size of page_info array + PageInfo* page_info{}; // output page info for up to num_dict_pages + + // num_data_pages (dictionary pages first) + string_index_pair* str_dict_index{}; // index for string dictionary + bitmask_type** valid_map_base{}; // base pointers of valid bit map for this column + void** column_data_base{}; // base pointers of column data + void** column_string_base{}; // base 
pointers of column string data + int8_t codec{}; // compressed codec enum + int8_t converted_type{}; // converted type enum + thrust::optional logical_type{}; // logical type + int8_t decimal_precision{}; // Decimal precision int32_t ts_clock_rate{}; // output timestamp clock frequency (0=default, 1000=ms, 1000000000=ns) int32_t src_col_index{}; // my input column index int32_t src_col_schema{}; // my schema index in the file }; -/** - * @brief The row_group_info class - */ -struct row_group_info { - size_type index; // row group index within a file. aggregate_reader_metadata::get_row_group() is - // called with index and source_index - size_t start_row; - size_type source_index; // file index. - - row_group_info() = default; - - row_group_info(size_type index, size_t start_row, size_type source_index) - : index{index}, start_row{start_row}, source_index{source_index} - { - } -}; - -/** - * @brief Struct to store file-level data that remains constant for - * all passes/chunks for the file. - */ -struct file_intermediate_data { - // all row groups to read - std::vector row_groups{}; - - // all chunks from the selected row groups. We may end up reading these chunks progressively - // instead of all at once - std::vector chunks{}; - - // skip_rows/num_rows values for the entire file. these need to be adjusted per-pass because we - // may not be visiting every row group that contains these bounds - size_t global_skip_rows; - size_t global_num_rows; -}; - -/** - * @brief Structs to identify the reading row range for each chunk of rows in chunked reading. - */ -struct chunk_read_info { - size_t skip_rows; - size_t num_rows; -}; - -/** - * @brief Struct to store pass-level data that remains constant for a single pass. - */ -struct pass_intermediate_data { - std::vector> raw_page_data; - rmm::device_buffer decomp_page_data; - - // rowgroup, chunk and page information for the current pass. - std::vector row_groups{}; - cudf::detail::hostdevice_vector chunks{}; - cudf::detail::hostdevice_vector pages_info{}; - cudf::detail::hostdevice_vector page_nesting_info{}; - cudf::detail::hostdevice_vector page_nesting_decode_info{}; - - rmm::device_uvector page_keys{0, rmm::cuda_stream_default}; - rmm::device_uvector page_index{0, rmm::cuda_stream_default}; - rmm::device_uvector str_dict_index{0, rmm::cuda_stream_default}; - - std::vector output_chunk_read_info; - std::size_t current_output_chunk{0}; - - rmm::device_buffer level_decode_data{}; - int level_type_size{0}; - - // skip_rows and num_rows values for this particular pass. these may be adjusted values from the - // global values stored in file_intermediate_data. - size_t skip_rows; - size_t num_rows; -}; - /** * @brief Struct describing an encoder column */ @@ -446,6 +451,17 @@ constexpr uint32_t encoding_to_mask(Encoding encoding) return 1 << static_cast(encoding); } +/** + * @brief Enum of mask bits for the EncPage kernel_mask + * + * Used to control which encode kernels to run. + */ +enum class encode_kernel_mask { + PLAIN = (1 << 0), // Run plain encoding kernel + DICTIONARY = (1 << 1), // Run dictionary encoding kernel + DELTA_BINARY = (1 << 2) // Run DELTA_BINARY_PACKED encoding kernel +}; + /** * @brief Struct describing an encoder column chunk */ @@ -504,10 +520,11 @@ struct EncPage { uint32_t num_leaf_values; //!< Values in page. Different from num_rows in case of nested types uint32_t num_values; //!< Number of def/rep level values in page. 
Includes null/empty elements in //!< non-leaf levels - uint32_t def_lvl_bytes; //!< Number of bytes of encoded definition level data (V2 only) - uint32_t rep_lvl_bytes; //!< Number of bytes of encoded repetition level data (V2 only) - compression_result* comp_res; //!< Ptr to compression result - uint32_t num_nulls; //!< Number of null values (V2 only) (down here for alignment) + uint32_t def_lvl_bytes; //!< Number of bytes of encoded definition level data (V2 only) + uint32_t rep_lvl_bytes; //!< Number of bytes of encoded repetition level data (V2 only) + compression_result* comp_res; //!< Ptr to compression result + uint32_t num_nulls; //!< Number of null values (V2 only) (down here for alignment) + encode_kernel_mask kernel_mask; //!< Mask used to control which encoding kernels to run }; /** @@ -527,9 +544,13 @@ constexpr bool is_string_col(ColumnChunkDesc const& chunk) * * @param[in] chunks List of column chunks * @param[in] num_chunks Number of column chunks + * @param[out] error_code Error code for kernel failures * @param[in] stream CUDA stream to use */ -void DecodePageHeaders(ColumnChunkDesc* chunks, int32_t num_chunks, rmm::cuda_stream_view stream); +void DecodePageHeaders(ColumnChunkDesc* chunks, + int32_t num_chunks, + int32_t* error_code, + rmm::cuda_stream_view stream); /** * @brief Launches kernel for building the dictionary index for the column @@ -599,16 +620,20 @@ void ComputePageSizes(cudf::detail::hostdevice_vector& pages, * * @param[in,out] pages All pages to be decoded * @param[in] chunks All chunks to be decoded + * @param[out] temp_string_buf Temporary space needed for decoding DELTA_BYTE_ARRAY strings * @param[in] min_rows crop all rows below min_row * @param[in] num_rows Maximum number of rows to read * @param[in] level_type_size Size in bytes of the type for level decoding + * @param[in] kernel_mask Mask of kernels to run * @param[in] stream CUDA stream to use */ void ComputePageStringSizes(cudf::detail::hostdevice_vector& pages, cudf::detail::hostdevice_vector const& chunks, + rmm::device_uvector& temp_string_buf, size_t min_row, size_t num_rows, int level_type_size, + uint32_t kernel_mask, rmm::cuda_stream_view stream); /** @@ -667,7 +692,7 @@ void DecodeStringPageData(cudf::detail::hostdevice_vector& pages, * @param[in] min_row Minimum number of rows to read * @param[in] level_type_size Size in bytes of the type for level decoding * @param[out] error_code Error code for kernel failures - * @param[in] stream CUDA stream to use, default 0 + * @param[in] stream CUDA stream to use */ void DecodeDeltaBinary(cudf::detail::hostdevice_vector& pages, cudf::detail::hostdevice_vector const& chunks, @@ -677,6 +702,28 @@ void DecodeDeltaBinary(cudf::detail::hostdevice_vector& pages, int32_t* error_code, rmm::cuda_stream_view stream); +/** + * @brief Launches kernel for reading the DELTA_BYTE_ARRAY column data stored in the pages + * + * The page data will be written to the output pointed to in the page's + * associated column chunk. 
+ * + * @param[in,out] pages All pages to be decoded + * @param[in] chunks All chunks to be decoded + * @param[in] num_rows Total number of rows to read + * @param[in] min_row Minimum number of rows to read + * @param[in] level_type_size Size in bytes of the type for level decoding + * @param[out] error_code Error code for kernel failures + * @param[in] stream CUDA stream to use + */ +void DecodeDeltaByteArray(cudf::detail::hostdevice_vector& pages, + cudf::detail::hostdevice_vector const& chunks, + size_t num_rows, + size_t min_row, + int level_type_size, + int32_t* error_code, + rmm::cuda_stream_view stream); + /** * @brief Launches kernel for initializing encoder row group fragments * @@ -739,7 +786,7 @@ void initialize_chunk_hash_maps(device_span chunks, rmm::cuda_st * @param frags Column fragments * @param stream CUDA stream to use */ -void populate_chunk_hash_maps(cudf::detail::device_2dspan frags, +void populate_chunk_hash_maps(cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream); /** @@ -762,7 +809,7 @@ void collect_map_entries(device_span chunks, rmm::cuda_stream_vi * @param frags Column fragments * @param stream CUDA stream to use */ -void get_dictionary_indices(cudf::detail::device_2dspan frags, +void get_dictionary_indices(cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream); /** @@ -781,7 +828,7 @@ void get_dictionary_indices(cudf::detail::device_2dspan * @param[in] stream CUDA stream to use */ void InitEncoderPages(cudf::detail::device_2dspan chunks, - device_span pages, + device_span pages, device_span page_sizes, device_span comp_page_sizes, device_span col_desc, @@ -847,7 +894,7 @@ void EncodePageHeaders(device_span pages, * @param[in] stream CUDA stream to use */ void GatherPages(device_span chunks, - device_span pages, + device_span pages, rmm::cuda_stream_view stream); /** @@ -863,5 +910,4 @@ void EncodeColumnIndexes(device_span chunks, int32_t column_index_truncate_length, rmm::cuda_stream_view stream); -} // namespace gpu -} // namespace cudf::io::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp index 805d082c71e..a5851de3c20 100644 --- a/cpp/src/io/parquet/predicate_pushdown.cpp +++ b/cpp/src/io/parquet/predicate_pushdown.cpp @@ -35,7 +35,7 @@ #include #include -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { namespace { /** @@ -62,13 +62,13 @@ struct stats_caster { // uses storage type as T template () or cudf::is_nested())> - static T convert(uint8_t const* stats_val, size_t stats_size, cudf::io::parquet::Type const type) + static T convert(uint8_t const* stats_val, size_t stats_size, Type const type) { CUDF_FAIL("unsupported type for stats casting"); } template ())> - static T convert(uint8_t const* stats_val, size_t stats_size, cudf::io::parquet::Type const type) + static T convert(uint8_t const* stats_val, size_t stats_size, Type const type) { CUDF_EXPECTS(type == BOOLEAN, "Invalid type and stats combination"); return targetType(*reinterpret_cast(stats_val)); @@ -78,7 +78,7 @@ struct stats_caster { template () and !cudf::is_boolean()) or cudf::is_fixed_point() or cudf::is_chrono())> - static T convert(uint8_t const* stats_val, size_t stats_size, cudf::io::parquet::Type const type) + static T convert(uint8_t const* stats_val, size_t stats_size, Type const type) { switch (type) { case INT32: return targetType(*reinterpret_cast(stats_val)); @@ -103,7 +103,7 @@ struct stats_caster { } template ())> - static T 
convert(uint8_t const* stats_val, size_t stats_size, cudf::io::parquet::Type const type) + static T convert(uint8_t const* stats_val, size_t stats_size, Type const type) { switch (type) { case FLOAT: return targetType(*reinterpret_cast(stats_val)); @@ -113,7 +113,7 @@ struct stats_caster { } template )> - static T convert(uint8_t const* stats_val, size_t stats_size, cudf::io::parquet::Type const type) + static T convert(uint8_t const* stats_val, size_t stats_size, Type const type) { switch (type) { case BYTE_ARRAY: [[fallthrough]]; @@ -150,12 +150,14 @@ struct stats_caster { { } - void set_index(size_type index, std::vector const& binary_value, Type const type) + void set_index(size_type index, + thrust::optional> const& binary_value, + Type const type) { - if (!binary_value.empty()) { - val[index] = convert(binary_value.data(), binary_value.size(), type); + if (binary_value.has_value()) { + val[index] = convert(binary_value.value().data(), binary_value.value().size(), type); } - if (binary_value.empty()) { + if (not binary_value.has_value()) { clear_bit_unsafe(null_mask.data(), index); null_count++; } @@ -210,10 +212,10 @@ struct stats_caster { auto const& row_group = per_file_metadata[src_idx].row_groups[rg_idx]; auto const& colchunk = row_group.columns[col_idx]; // To support deprecated min, max fields. - auto const& min_value = colchunk.meta_data.statistics.min_value.size() > 0 + auto const& min_value = colchunk.meta_data.statistics.min_value.has_value() ? colchunk.meta_data.statistics.min_value : colchunk.meta_data.statistics.min; - auto const& max_value = colchunk.meta_data.statistics.max_value.size() > 0 + auto const& max_value = colchunk.meta_data.statistics.max_value.has_value() ? colchunk.meta_data.statistics.max_value : colchunk.meta_data.statistics.max; // translate binary data to Type then to @@ -527,4 +529,4 @@ named_to_reference_converter::visit_operands( return transformed_operands; } -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader.cpp b/cpp/src/io/parquet/reader.cpp index 1e87447006d..17d7c07bc91 100644 --- a/cpp/src/io/parquet/reader.cpp +++ b/cpp/src/io/parquet/reader.cpp @@ -16,7 +16,7 @@ #include "reader_impl.hpp" -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { reader::reader() = default; @@ -59,4 +59,4 @@ bool chunked_reader::has_next() const { return _impl->has_next(); } table_with_metadata chunked_reader::read_chunk() const { return _impl->read_chunk(); } -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index ea40f29a070..6e799424d01 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -15,30 +15,35 @@ */ #include "reader_impl.hpp" +#include "error.hpp" #include #include #include #include -#include #include #include -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) { - auto& chunks = _pass_itm_data->chunks; - auto& pages = _pass_itm_data->pages_info; - auto& page_nesting = _pass_itm_data->page_nesting_info; - auto& page_nesting_decode = _pass_itm_data->page_nesting_decode_info; + auto& chunks = _pass_itm_data->chunks; + auto& pages = _pass_itm_data->pages_info; + auto& page_nesting = _pass_itm_data->page_nesting_info; + auto& page_nesting_decode = _pass_itm_data->page_nesting_decode_info; + auto const 
level_type_size = _pass_itm_data->level_type_size; + + // temporary space for DELTA_BYTE_ARRAY decoding. this only needs to live until + // gpu::DecodeDeltaByteArray returns. + rmm::device_uvector delta_temp_buf(0, _stream); // Should not reach here if there is no page data. CUDF_EXPECTS(pages.size() > 0, "There is no page to decode"); size_t const sum_max_depths = std::accumulate( - chunks.begin(), chunks.end(), 0, [&](size_t cursum, gpu::ColumnChunkDesc const& chunk) { + chunks.begin(), chunks.end(), 0, [&](size_t cursum, ColumnChunkDesc const& chunk) { return cursum + _metadata->get_output_nesting_depth(chunk.src_col_schema); }); @@ -51,11 +56,12 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) // doing a gather operation later on. // TODO: This step is somewhat redundant if size info has already been calculated (nested schema, // chunked reader). - auto const has_strings = (kernel_mask & gpu::KERNEL_MASK_STRING) != 0; + auto const has_strings = + (kernel_mask & BitOr(decode_kernel_mask::STRING, decode_kernel_mask::DELTA_BYTE_ARRAY)) != 0; std::vector col_sizes(_input_columns.size(), 0L); if (has_strings) { - gpu::ComputePageStringSizes( - pages, chunks, skip_rows, num_rows, _pass_itm_data->level_type_size, _stream); + ComputePageStringSizes( + pages, chunks, delta_temp_buf, skip_rows, num_rows, level_type_size, kernel_mask, _stream); col_sizes = calculate_page_string_offsets(); @@ -162,33 +168,37 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) chunks.host_to_device_async(_stream); chunk_nested_valids.host_to_device_async(_stream); chunk_nested_data.host_to_device_async(_stream); + if (has_strings) { chunk_nested_str_data.host_to_device_async(_stream); } - rmm::device_scalar error_code(0, _stream); + // create this before we fork streams + kernel_error error_code(_stream); // get the number of streams we need from the pool and tell them to wait on the H2D copies int const nkernels = std::bitset<32>(kernel_mask).count(); auto streams = cudf::detail::fork_streams(_stream, nkernels); - auto const level_type_size = _pass_itm_data->level_type_size; - // launch string decoder int s_idx = 0; - if (has_strings) { - auto& stream = streams[s_idx++]; - chunk_nested_str_data.host_to_device_async(stream); - gpu::DecodeStringPageData( - pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), stream); + if (BitAnd(kernel_mask, decode_kernel_mask::STRING) != 0) { + DecodeStringPageData( + pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), streams[s_idx++]); + } + + // launch delta byte array decoder + if (BitAnd(kernel_mask, decode_kernel_mask::DELTA_BYTE_ARRAY) != 0) { + DecodeDeltaByteArray( + pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), streams[s_idx++]); } // launch delta binary decoder - if ((kernel_mask & gpu::KERNEL_MASK_DELTA_BINARY) != 0) { - gpu::DecodeDeltaBinary( + if (BitAnd(kernel_mask, decode_kernel_mask::DELTA_BINARY) != 0) { + DecodeDeltaBinary( pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), streams[s_idx++]); } // launch the catch-all page decoder - if ((kernel_mask & gpu::KERNEL_MASK_GENERAL) != 0) { - gpu::DecodePageData( + if (BitAnd(kernel_mask, decode_kernel_mask::GENERAL) != 0) { + DecodePageData( pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), streams[s_idx++]); } @@ -199,11 +209,8 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) page_nesting.device_to_host_async(_stream); 
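[editor's note] `decode_page_data` above forks one stream per decode kernel selected by the mask, launches each kernel on its own stream, and joins back into the main stream before the device-to-host copies. cuDF wraps this in its internal `cudf::detail::fork_streams`/`join_streams` helpers; the sketch below shows the same fork/join idiom in bare CUDA with event-based ordering, and the two empty kernels are placeholders for the real per-encoding decoders.

```cpp
#include <cuda_runtime.h>

#include <vector>

// Placeholder kernels standing in for the real per-encoding decoders.
__global__ void decode_general() {}
__global__ void decode_string() {}

int main()
{
  cudaStream_t main_stream;
  cudaStreamCreate(&main_stream);

  // "Fork": each child stream waits on an event recorded in the main stream,
  // so earlier host-to-device copies are visible to the decode kernels.
  cudaEvent_t forked;
  cudaEventCreate(&forked);
  cudaEventRecord(forked, main_stream);

  std::vector<cudaStream_t> streams(2);
  for (auto& s : streams) {
    cudaStreamCreate(&s);
    cudaStreamWaitEvent(s, forked, 0);
  }

  // Independent kernels run concurrently, one per stream.
  decode_general<<<1, 32, 0, streams[0]>>>();
  decode_string<<<1, 32, 0, streams[1]>>>();

  // "Join": the main stream waits on an event recorded in every child stream.
  for (auto& s : streams) {
    cudaEvent_t done;
    cudaEventCreate(&done);
    cudaEventRecord(done, s);
    cudaStreamWaitEvent(main_stream, done, 0);
    cudaEventDestroy(done);
  }
  cudaStreamSynchronize(main_stream);

  for (auto& s : streams) { cudaStreamDestroy(s); }
  cudaEventDestroy(forked);
  cudaStreamDestroy(main_stream);
  return 0;
}
```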
page_nesting_decode.device_to_host_async(_stream); - auto const decode_error = error_code.value(_stream); - if (decode_error != 0) { - std::stringstream stream; - stream << std::hex << decode_error; - CUDF_FAIL("Parquet data decode failed with code(s) 0x" + stream.str()); + if (error_code.value() != 0) { + CUDF_FAIL("Parquet data decode failed with code(s) " + error_code.str()); } // for list columns, add the final offset to every offset buffer. @@ -248,13 +255,13 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) // update null counts in the final column buffers for (size_t idx = 0; idx < pages.size(); idx++) { - gpu::PageInfo* pi = &pages[idx]; - if (pi->flags & gpu::PAGEINFO_FLAGS_DICTIONARY) { continue; } - gpu::ColumnChunkDesc* col = &chunks[pi->chunk_idx]; + PageInfo* pi = &pages[idx]; + if (pi->flags & PAGEINFO_FLAGS_DICTIONARY) { continue; } + ColumnChunkDesc* col = &chunks[pi->chunk_idx]; input_column_info const& input_col = _input_columns[col->src_col_index]; - int index = pi->nesting_decode - page_nesting_decode.device_ptr(); - gpu::PageNestingDecodeInfo* pndi = &page_nesting_decode[index]; + int index = pi->nesting_decode - page_nesting_decode.device_ptr(); + PageNestingDecodeInfo* pndi = &page_nesting_decode[index]; auto* cols = &_output_buffers; for (size_t l_idx = 0; l_idx < input_col.nesting_depth(); l_idx++) { @@ -320,7 +327,7 @@ reader::impl::impl(std::size_t chunk_read_limit, // Save the states of the output buffers for reuse in `chunk_read()`. for (auto const& buff : _output_buffers) { - _output_buffers_template.emplace_back(inline_column_buffer::empty_like(buff)); + _output_buffers_template.emplace_back(cudf::io::detail::inline_column_buffer::empty_like(buff)); } } @@ -349,14 +356,14 @@ void reader::impl::prepare_data(int64_t skip_rows, not _input_columns.empty()) { // fills in chunk information without physically loading or decompressing // the associated data - load_global_chunk_info(); + create_global_chunk_info(); // compute schedule of input reads. Each rowgroup contains 1 chunk per column. For now // we will read an entire row group at a time. However, it is possible to do // sub-rowgroup reads if we made some estimates on individual chunk sizes (tricky) and // changed the high level structure such that we weren't always reading an entire table's // worth of columns at once. - compute_input_pass_row_group_info(); + compute_input_passes(); } _file_preprocessed = true; @@ -364,16 +371,16 @@ void reader::impl::prepare_data(int64_t skip_rows, // if we have to start a new pass, do that now if (!_pass_preprocessed) { - auto const num_passes = _input_pass_row_group_offsets.size() - 1; + auto const num_passes = _file_itm_data.input_pass_row_group_offsets.size() - 1; // always create the pass struct, even if we end up with no passes. // this will also cause the previous pass information to be deleted - _pass_itm_data = std::make_unique(); + _pass_itm_data = std::make_unique(); if (_file_itm_data.global_num_rows > 0 && not _file_itm_data.row_groups.empty() && not _input_columns.empty() && _current_input_pass < num_passes) { // setup the pass_intermediate_info for this pass. 
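[editor's note] The error check above works because the decode kernels OR `decode_error` bits into a single `int32_t` via `set_error` (shown earlier), so a nonzero value may carry several failures at once and is reported in hex rather than decimal. A hedged host-side sketch of that accumulate-and-report flow; `std::atomic` stands in for the device-side `cuda::atomic_ref`, and only a subset of the error bits is reproduced here.

```cpp
#include <atomic>
#include <cstdint>
#include <iostream>
#include <sstream>

enum class decode_error : int32_t {
  DATA_STREAM_OVERRUN  = 0x1,
  UNSUPPORTED_ENCODING = 0x4,
};

// Host-side stand-in for set_error(): many workers OR their failure bits
// into one shared word (the kernels use cuda::atomic_ref<...>::fetch_or).
void set_error(int32_t error, std::atomic<int32_t>* error_code)
{
  if (error != 0) { error_code->fetch_or(error, std::memory_order_relaxed); }
}

int main()
{
  std::atomic<int32_t> error_code{0};
  set_error(static_cast<int32_t>(decode_error::UNSUPPORTED_ENCODING), &error_code);
  set_error(static_cast<int32_t>(decode_error::DATA_STREAM_OVERRUN), &error_code);

  if (int32_t const err = error_code.load(); err != 0) {
    // report in hex so the individual error bits stay readable
    std::stringstream ss;
    ss << std::hex << err;
    std::cout << "Parquet data decode failed with code(s) 0x" << ss.str() << "\n";  // 0x5
  }
  return 0;
}
```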
- setup_pass(); + setup_next_pass(); load_and_decompress_data(); preprocess_pages(uses_custom_row_bounds, _output_chunk_read_limit); @@ -521,7 +528,7 @@ table_with_metadata reader::impl::read_chunk() if (_chunk_count > 0) { _output_buffers.resize(0); for (auto const& buff : _output_buffers_template) { - _output_buffers.emplace_back(inline_column_buffer::empty_like(buff)); + _output_buffers.emplace_back(cudf::io::detail::inline_column_buffer::empty_like(buff)); } } @@ -541,8 +548,8 @@ bool reader::impl::has_next() {} /*row_group_indices, empty means read all row groups*/, std::nullopt /*filter*/); - auto const num_input_passes = - _input_pass_row_group_offsets.size() == 0 ? 0 : _input_pass_row_group_offsets.size() - 1; + size_t const num_input_passes = std::max( + int64_t{0}, static_cast(_file_itm_data.input_pass_row_group_offsets.size()) - 1); return (_pass_itm_data->current_output_chunk < _pass_itm_data->output_chunk_read_info.size()) || (_current_input_pass < num_input_passes); } @@ -571,4 +578,4 @@ parquet_metadata read_parquet_metadata(host_span con metadata.get_key_value_metadata()[0]}; } -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index 9445e4d1648..cea4ba35606 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -22,6 +22,7 @@ #pragma once #include "parquet_gpu.hpp" +#include "reader_impl_chunking.hpp" #include "reader_impl_helpers.hpp" #include @@ -35,7 +36,7 @@ #include #include -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { /** * @brief Implementation for Parquet reader @@ -136,10 +137,6 @@ class reader::impl { host_span const> row_group_indices, std::optional> filter); - void load_global_chunk_info(); - void compute_input_pass_row_group_info(); - void setup_pass(); - /** * @brief Create chunk information and start file reads * @@ -250,6 +247,31 @@ class reader::impl { */ void decode_page_data(size_t skip_rows, size_t num_rows); + /** + * @brief Creates file-wide parquet chunk information. + * + * Creates information about all chunks in the file, storing it in + * the file-wide _file_itm_data structure. + */ + void create_global_chunk_info(); + + /** + * @brief Computes all of the passes we will perform over the file. + */ + void compute_input_passes(); + + /** + * @brief Close out the existing pass (if any) and prepare for the next pass. + */ + void setup_next_pass(); + + /** + * @brief Given a set of pages that have had their sizes computed by nesting level and + * a limit on total read size, generate a set of {skip_rows, num_rows} pairs representing + * a set of reads that will generate output columns of total size <= `chunk_read_limit` bytes. 
+ */ + void compute_splits_for_pass(); + private: rmm::cuda_stream_view _stream; rmm::mr::device_memory_resource* _mr = nullptr; @@ -261,10 +283,10 @@ class reader::impl { std::vector _input_columns; // Buffers for generating output columns - std::vector _output_buffers; + std::vector _output_buffers; // Buffers copied from `_output_buffers` after construction for reuse - std::vector _output_buffers_template; + std::vector _output_buffers_template; // _output_buffers associated schema indices std::vector _output_column_schemas; @@ -278,27 +300,24 @@ class reader::impl { // chunked reading happens in 2 parts: // - // At the top level there is the "pass" in which we try and limit the + // At the top level, the entire file is divided up into "passes" omn which we try and limit the // total amount of temporary memory (compressed data, decompressed data) in use // via _input_pass_read_limit. // // Within a pass, we produce one or more chunks of output, whose maximum total // byte size is controlled by _output_chunk_read_limit. - cudf::io::parquet::gpu::file_intermediate_data _file_itm_data; - std::unique_ptr _pass_itm_data; - - // an array of offsets into _file_itm_data::global_chunks. Each pair of offsets represents - // the start/end of the chunks to be loaded for a given pass. - std::vector _input_pass_row_group_offsets{}; - std::vector _input_pass_row_count{}; - std::size_t _current_input_pass{0}; - std::size_t _chunk_count{0}; + file_intermediate_data _file_itm_data; + bool _file_preprocessed{false}; - std::size_t _output_chunk_read_limit{0}; - std::size_t _input_pass_read_limit{0}; + std::unique_ptr _pass_itm_data; bool _pass_preprocessed{false}; - bool _file_preprocessed{false}; + + std::size_t _output_chunk_read_limit{0}; // output chunk size limit in bytes + std::size_t _input_pass_read_limit{0}; // input pass memory usage limit in bytes + + std::size_t _current_input_pass{0}; // current input pass index + std::size_t _chunk_count{0}; // how many output chunks we have produced }; -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu new file mode 100644 index 00000000000..213fc380a34 --- /dev/null +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -0,0 +1,599 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "reader_impl.hpp" +#include "reader_impl_chunking.hpp" + +#include +#include + +#include + +#include + +#include +#include +#include +#include + +namespace cudf::io::parquet::detail { + +namespace { + +struct cumulative_row_info { + size_t row_count; // cumulative row count + size_t size_bytes; // cumulative size in bytes + int key; // schema index +}; + +#if defined(CHUNKING_DEBUG) +void print_cumulative_page_info(cudf::detail::hostdevice_vector& pages, + rmm::device_uvector const& page_index, + rmm::device_uvector const& c_info, + rmm::cuda_stream_view stream) +{ + pages.device_to_host_sync(stream); + + printf("------------\nCumulative sizes by page\n"); + + std::vector schemas(pages.size()); + std::vector h_page_index(pages.size()); + CUDF_CUDA_TRY(cudaMemcpy( + h_page_index.data(), page_index.data(), sizeof(int) * pages.size(), cudaMemcpyDefault)); + std::vector h_cinfo(pages.size()); + CUDF_CUDA_TRY(cudaMemcpy( + h_cinfo.data(), c_info.data(), sizeof(cumulative_row_info) * pages.size(), cudaMemcpyDefault)); + auto schema_iter = cudf::detail::make_counting_transform_iterator( + 0, [&](size_type i) { return pages[h_page_index[i]].src_col_schema; }); + thrust::copy(thrust::seq, schema_iter, schema_iter + pages.size(), schemas.begin()); + auto last = thrust::unique(thrust::seq, schemas.begin(), schemas.end()); + schemas.resize(last - schemas.begin()); + printf("Num schemas: %lu\n", schemas.size()); + + for (size_t idx = 0; idx < schemas.size(); idx++) { + printf("Schema %d\n", schemas[idx]); + for (size_t pidx = 0; pidx < pages.size(); pidx++) { + auto const& page = pages[h_page_index[pidx]]; + if (page.flags & PAGEINFO_FLAGS_DICTIONARY || page.src_col_schema != schemas[idx]) { + continue; + } + printf("\tP: {%lu, %lu}\n", h_cinfo[pidx].row_count, h_cinfo[pidx].size_bytes); + } + } +} + +void print_cumulative_row_info(host_span sizes, + std::string const& label, + std::optional> splits = std::nullopt) +{ + if (splits.has_value()) { + printf("------------\nSplits\n"); + for (size_t idx = 0; idx < splits->size(); idx++) { + printf("{%lu, %lu}\n", splits.value()[idx].skip_rows, splits.value()[idx].num_rows); + } + } + + printf("------------\nCumulative sizes %s\n", label.c_str()); + for (size_t idx = 0; idx < sizes.size(); idx++) { + printf("{%lu, %lu, %d}", sizes[idx].row_count, sizes[idx].size_bytes, sizes[idx].key); + if (splits.has_value()) { + // if we have a split at this row count and this is the last instance of this row count + auto start = thrust::make_transform_iterator( + splits->begin(), [](chunk_read_info const& i) { return i.skip_rows; }); + auto end = start + splits->size(); + auto split = std::find(start, end, sizes[idx].row_count); + auto const split_index = [&]() -> int { + if (split != end && + ((idx == sizes.size() - 1) || (sizes[idx + 1].row_count > sizes[idx].row_count))) { + return static_cast(std::distance(start, split)); + } + return idx == 0 ? 0 : -1; + }(); + if (split_index >= 0) { + printf(" <-- split {%lu, %lu}", + splits.value()[split_index].skip_rows, + splits.value()[split_index].num_rows); + } + } + printf("\n"); + } +} +#endif // CHUNKING_DEBUG + +/** + * @brief Functor which reduces two cumulative_row_info structs of the same key. 
+ */ +struct cumulative_row_sum { + cumulative_row_info operator() + __device__(cumulative_row_info const& a, cumulative_row_info const& b) const + { + return cumulative_row_info{a.row_count + b.row_count, a.size_bytes + b.size_bytes, a.key}; + } +}; + +/** + * @brief Functor which computes the total data size for a given type of cudf column. + * + * In the case of strings, the return size does not include the chars themselves. That + * information is tracked separately (see PageInfo::str_bytes). + */ +struct row_size_functor { + __device__ size_t validity_size(size_t num_rows, bool nullable) + { + return nullable ? (cudf::util::div_rounding_up_safe(num_rows, size_t{32}) * 4) : 0; + } + + template + __device__ size_t operator()(size_t num_rows, bool nullable) + { + auto const element_size = sizeof(device_storage_type_t); + return (element_size * num_rows) + validity_size(num_rows, nullable); + } +}; + +template <> +__device__ size_t row_size_functor::operator()(size_t num_rows, bool nullable) +{ + auto const offset_size = sizeof(size_type); + // NOTE: Adding the + 1 offset here isn't strictly correct. There will only be 1 extra offset + // for the entire column, whereas this is adding an extra offset per page. So we will get a + // small over-estimate of the real size of the order : # of pages * 4 bytes. It seems better + // to overestimate size somewhat than to underestimate it and potentially generate chunks + // that are too large. + return (offset_size * (num_rows + 1)) + validity_size(num_rows, nullable); +} + +template <> +__device__ size_t row_size_functor::operator()(size_t num_rows, bool nullable) +{ + return validity_size(num_rows, nullable); +} + +template <> +__device__ size_t row_size_functor::operator()(size_t num_rows, bool nullable) +{ + // only returns the size of offsets and validity. the size of the actual string chars + // is tracked separately. + auto const offset_size = sizeof(size_type); + // see note about offsets in the list_view template. + return (offset_size * (num_rows + 1)) + validity_size(num_rows, nullable); +} + +/** + * @brief Functor which computes the total output cudf data size for all of + * the data in this page. + * + * Sums across all nesting levels. + */ +struct get_cumulative_row_info { + PageInfo const* const pages; + + __device__ cumulative_row_info operator()(size_type index) + { + auto const& page = pages[index]; + if (page.flags & PAGEINFO_FLAGS_DICTIONARY) { + return cumulative_row_info{0, 0, page.src_col_schema}; + } + + // total nested size, not counting string data + auto iter = + cudf::detail::make_counting_transform_iterator(0, [page, index] __device__(size_type i) { + auto const& pni = page.nesting[i]; + return cudf::type_dispatcher( + data_type{pni.type}, row_size_functor{}, pni.size, pni.nullable); + }); + + size_t const row_count = static_cast(page.nesting[0].size); + return { + row_count, + thrust::reduce(thrust::seq, iter, iter + page.num_output_nesting_levels) + page.str_bytes, + page.src_col_schema}; + } +}; + +/** + * @brief Functor which computes the effective size of all input columns by page. + * + * For a given row, we want to find the cost of all pages for all columns involved + * in loading up to that row. The complication here is that not all pages are the + * same size between columns. 
Example: + * + * page row counts + * Column A: 0 <----> 100 <----> 200 + * Column B: 0 <---------------> 200 <--------> 400 + | + * if we decide to split at row 100, we don't really know the actual amount of bytes in column B + * at that point. So we have to proceed as if we are taking the bytes from all 200 rows of that + * page. Essentially, a conservative over-estimate of the real size. + */ +struct row_total_size { + cumulative_row_info const* c_info; + size_type const* key_offsets; + size_t num_keys; + + __device__ cumulative_row_info operator()(cumulative_row_info const& i) + { + // sum sizes for each input column at this row + size_t sum = 0; + for (int idx = 0; idx < num_keys; idx++) { + auto const start = key_offsets[idx]; + auto const end = key_offsets[idx + 1]; + auto iter = cudf::detail::make_counting_transform_iterator( + 0, [&] __device__(size_type i) { return c_info[i].row_count; }); + auto const page_index = + thrust::lower_bound(thrust::seq, iter + start, iter + end, i.row_count) - iter; + sum += c_info[page_index].size_bytes; + } + return {i.row_count, sum, i.key}; + } +}; + +/** + * @brief Given a vector of cumulative {row_count, byte_size} pairs and a chunk read + * limit, determine the set of splits. + * + * @param sizes Vector of cumulative {row_count, byte_size} pairs + * @param num_rows Total number of rows to read + * @param chunk_read_limit Limit on total number of bytes to be returned per read, for all columns + */ +std::vector find_splits(std::vector const& sizes, + size_t num_rows, + size_t chunk_read_limit) +{ + // now we have an array of {row_count, real output bytes}. just walk through it and generate + // splits. + // TODO: come up with a clever way to do this entirely in parallel. For now, as long as batch + // sizes are reasonably large, this shouldn't iterate too many times + std::vector splits; + { + size_t cur_pos = 0; + size_t cur_cumulative_size = 0; + size_t cur_row_count = 0; + auto start = thrust::make_transform_iterator(sizes.begin(), [&](cumulative_row_info const& i) { + return i.size_bytes - cur_cumulative_size; + }); + auto end = start + sizes.size(); + while (cur_row_count < num_rows) { + int64_t split_pos = + thrust::lower_bound(thrust::seq, start + cur_pos, end, chunk_read_limit) - start; + + // if we're past the end, or if the returned bucket is > than the chunk_read_limit, move back + // one. + if (static_cast(split_pos) >= sizes.size() || + (sizes[split_pos].size_bytes - cur_cumulative_size > chunk_read_limit)) { + split_pos--; + } + + // best-try. if we can't find something that'll fit, we have to go bigger. we're doing this in + // a loop because all of the cumulative sizes for all the pages are sorted into one big list. + // so if we had two columns, both of which had an entry {1000, 10000}, that entry would be in + // the list twice. so we have to iterate until we skip past all of them. The idea is that we + // either do this, or we have to call unique() on the input first. + while (split_pos < (static_cast(sizes.size()) - 1) && + (split_pos < 0 || sizes[split_pos].row_count == cur_row_count)) { + split_pos++; + } + + auto const start_row = cur_row_count; + cur_row_count = sizes[split_pos].row_count; + splits.push_back(chunk_read_info{start_row, cur_row_count - start_row}); + cur_pos = split_pos; + cur_cumulative_size = sizes[split_pos].size_bytes; + } + } + // print_cumulative_row_info(sizes, "adjusted", splits); + + return splits; +} + +/** + * @brief Converts cuDF units to Parquet units. 
+ * + * @return A tuple of Parquet type width, Parquet clock rate and Parquet decimal type. + */ +[[nodiscard]] std::tuple conversion_info( + type_id column_type_id, + type_id timestamp_type_id, + Type physical, + thrust::optional converted, + int32_t length) +{ + int32_t type_width = (physical == FIXED_LEN_BYTE_ARRAY) ? length : 0; + int32_t clock_rate = 0; + if (column_type_id == type_id::INT8 or column_type_id == type_id::UINT8) { + type_width = 1; // I32 -> I8 + } else if (column_type_id == type_id::INT16 or column_type_id == type_id::UINT16) { + type_width = 2; // I32 -> I16 + } else if (column_type_id == type_id::INT32) { + type_width = 4; // str -> hash32 + } else if (is_chrono(data_type{column_type_id})) { + clock_rate = to_clockrate(timestamp_type_id); + } + + int8_t converted_type = converted.value_or(UNKNOWN); + if (converted_type == DECIMAL && column_type_id != type_id::FLOAT64 && + not cudf::is_fixed_point(data_type{column_type_id})) { + converted_type = UNKNOWN; // Not converting to float64 or decimal + } + return std::make_tuple(type_width, clock_rate, converted_type); +} + +/** + * @brief Return the required number of bits to store a value. + */ +template +[[nodiscard]] T required_bits(uint32_t max_level) +{ + return static_cast(CompactProtocolReader::NumRequiredBits(max_level)); +} + +struct row_count_compare { + __device__ bool operator()(cumulative_row_info const& a, cumulative_row_info const& b) + { + return a.row_count < b.row_count; + } +}; + +} // anonymous namespace + +void reader::impl::create_global_chunk_info() +{ + auto const num_rows = _file_itm_data.global_num_rows; + auto const& row_groups_info = _file_itm_data.row_groups; + auto& chunks = _file_itm_data.chunks; + + // Descriptors for all the chunks that make up the selected columns + auto const num_input_columns = _input_columns.size(); + auto const num_chunks = row_groups_info.size() * num_input_columns; + + // Initialize column chunk information + auto remaining_rows = num_rows; + for (auto const& rg : row_groups_info) { + auto const& row_group = _metadata->get_row_group(rg.index, rg.source_index); + auto const row_group_start = rg.start_row; + auto const row_group_rows = std::min(remaining_rows, row_group.num_rows); + + // generate ColumnChunkDesc objects for everything to be decoded (all input columns) + for (size_t i = 0; i < num_input_columns; ++i) { + auto col = _input_columns[i]; + // look up metadata + auto& col_meta = _metadata->get_column_metadata(rg.index, rg.source_index, col.schema_idx); + auto& schema = _metadata->get_schema(col.schema_idx); + + auto [type_width, clock_rate, converted_type] = + conversion_info(to_type_id(schema, _strings_to_categorical, _timestamp_type.id()), + _timestamp_type.id(), + schema.type, + schema.converted_type, + schema.type_length); + + chunks.push_back(ColumnChunkDesc(col_meta.total_compressed_size, + nullptr, + col_meta.num_values, + schema.type, + type_width, + row_group_start, + row_group_rows, + schema.max_definition_level, + schema.max_repetition_level, + _metadata->get_output_nesting_depth(col.schema_idx), + required_bits(schema.max_definition_level), + required_bits(schema.max_repetition_level), + col_meta.codec, + converted_type, + schema.logical_type, + schema.decimal_precision, + clock_rate, + i, + col.schema_idx)); + } + + remaining_rows -= row_group_rows; + } +} + +void reader::impl::compute_input_passes() +{ + // at this point, row_groups has already been filtered down to just the row groups we need to + // handle optional skip_rows/num_rows 
parameters. + auto const& row_groups_info = _file_itm_data.row_groups; + + // if the user hasn't specified an input size limit, read everything in a single pass. + if (_input_pass_read_limit == 0) { + _file_itm_data.input_pass_row_group_offsets.push_back(0); + _file_itm_data.input_pass_row_group_offsets.push_back(row_groups_info.size()); + return; + } + + // generate passes. make sure to account for the case where a single row group doesn't fit within + // + std::size_t const read_limit = + _input_pass_read_limit > 0 ? _input_pass_read_limit : std::numeric_limits::max(); + std::size_t cur_pass_byte_size = 0; + std::size_t cur_rg_start = 0; + std::size_t cur_row_count = 0; + _file_itm_data.input_pass_row_group_offsets.push_back(0); + _file_itm_data.input_pass_row_count.push_back(0); + + for (size_t cur_rg_index = 0; cur_rg_index < row_groups_info.size(); cur_rg_index++) { + auto const& rgi = row_groups_info[cur_rg_index]; + auto const& row_group = _metadata->get_row_group(rgi.index, rgi.source_index); + + // can we add this row group + if (cur_pass_byte_size + row_group.total_byte_size >= read_limit) { + // A single row group (the current one) is larger than the read limit: + // We always need to include at least one row group, so end the pass at the end of the current + // row group + if (cur_rg_start == cur_rg_index) { + _file_itm_data.input_pass_row_group_offsets.push_back(cur_rg_index + 1); + _file_itm_data.input_pass_row_count.push_back(cur_row_count + row_group.num_rows); + cur_rg_start = cur_rg_index + 1; + cur_pass_byte_size = 0; + } + // End the pass at the end of the previous row group + else { + _file_itm_data.input_pass_row_group_offsets.push_back(cur_rg_index); + _file_itm_data.input_pass_row_count.push_back(cur_row_count); + cur_rg_start = cur_rg_index; + cur_pass_byte_size = row_group.total_byte_size; + } + } else { + cur_pass_byte_size += row_group.total_byte_size; + } + cur_row_count += row_group.num_rows; + } + // add the last pass if necessary + if (_file_itm_data.input_pass_row_group_offsets.back() != row_groups_info.size()) { + _file_itm_data.input_pass_row_group_offsets.push_back(row_groups_info.size()); + _file_itm_data.input_pass_row_count.push_back(cur_row_count); + } +} + +void reader::impl::setup_next_pass() +{ + // this will also cause the previous pass information to be deleted + _pass_itm_data = std::make_unique(); + + // setup row groups to be loaded for this pass + auto const row_group_start = _file_itm_data.input_pass_row_group_offsets[_current_input_pass]; + auto const row_group_end = _file_itm_data.input_pass_row_group_offsets[_current_input_pass + 1]; + auto const num_row_groups = row_group_end - row_group_start; + _pass_itm_data->row_groups.resize(num_row_groups); + std::copy(_file_itm_data.row_groups.begin() + row_group_start, + _file_itm_data.row_groups.begin() + row_group_end, + _pass_itm_data->row_groups.begin()); + + auto const num_passes = _file_itm_data.input_pass_row_group_offsets.size() - 1; + CUDF_EXPECTS(_current_input_pass < num_passes, "Encountered an invalid read pass index"); + + auto const chunks_per_rowgroup = _input_columns.size(); + auto const num_chunks = chunks_per_rowgroup * num_row_groups; + + auto chunk_start = _file_itm_data.chunks.begin() + (row_group_start * chunks_per_rowgroup); + auto chunk_end = _file_itm_data.chunks.begin() + (row_group_end * chunks_per_rowgroup); + + _pass_itm_data->chunks = cudf::detail::hostdevice_vector(num_chunks, _stream); + std::copy(chunk_start, chunk_end, _pass_itm_data->chunks.begin()); + + // 
adjust skip_rows and num_rows by what's available in the row groups we are processing + if (num_passes == 1) { + _pass_itm_data->skip_rows = _file_itm_data.global_skip_rows; + _pass_itm_data->num_rows = _file_itm_data.global_num_rows; + } else { + auto const global_start_row = _file_itm_data.global_skip_rows; + auto const global_end_row = global_start_row + _file_itm_data.global_num_rows; + auto const start_row = + std::max(_file_itm_data.input_pass_row_count[_current_input_pass], global_start_row); + auto const end_row = + std::min(_file_itm_data.input_pass_row_count[_current_input_pass + 1], global_end_row); + + // skip_rows is always global in the sense that it is relative to the first row of + // everything we will be reading, regardless of what pass we are on. + // num_rows is how many rows we are reading this pass. + _pass_itm_data->skip_rows = + global_start_row + _file_itm_data.input_pass_row_count[_current_input_pass]; + _pass_itm_data->num_rows = end_row - start_row; + } +} + +void reader::impl::compute_splits_for_pass() +{ + auto const skip_rows = _pass_itm_data->skip_rows; + auto const num_rows = _pass_itm_data->num_rows; + + // simple case : no chunk size, no splits + if (_output_chunk_read_limit <= 0) { + _pass_itm_data->output_chunk_read_info = std::vector{{skip_rows, num_rows}}; + return; + } + + auto& pages = _pass_itm_data->pages_info; + + auto const& page_keys = _pass_itm_data->page_keys; + auto const& page_index = _pass_itm_data->page_index; + + // generate cumulative row counts and sizes + rmm::device_uvector c_info(page_keys.size(), _stream); + // convert PageInfo to cumulative_row_info + auto page_input = thrust::make_transform_iterator(page_index.begin(), + get_cumulative_row_info{pages.device_ptr()}); + thrust::inclusive_scan_by_key(rmm::exec_policy(_stream), + page_keys.begin(), + page_keys.end(), + page_input, + c_info.begin(), + thrust::equal_to{}, + cumulative_row_sum{}); + // print_cumulative_page_info(pages, page_index, c_info, stream); + + // sort by row count + rmm::device_uvector c_info_sorted{c_info, _stream}; + thrust::sort( + rmm::exec_policy(_stream), c_info_sorted.begin(), c_info_sorted.end(), row_count_compare{}); + + // std::vector h_c_info_sorted(c_info_sorted.size()); + // CUDF_CUDA_TRY(cudaMemcpy(h_c_info_sorted.data(), + // c_info_sorted.data(), + // sizeof(cumulative_row_info) * c_info_sorted.size(), + // cudaMemcpyDefault)); + // print_cumulative_row_info(h_c_info_sorted, "raw"); + + // generate key offsets (offsets to the start of each partition of keys). worst case is 1 page per + // key + rmm::device_uvector key_offsets(page_keys.size() + 1, _stream); + auto const key_offsets_end = thrust::reduce_by_key(rmm::exec_policy(_stream), + page_keys.begin(), + page_keys.end(), + thrust::make_constant_iterator(1), + thrust::make_discard_iterator(), + key_offsets.begin()) + .second; + size_t const num_unique_keys = key_offsets_end - key_offsets.begin(); + thrust::exclusive_scan( + rmm::exec_policy(_stream), key_offsets.begin(), key_offsets.end(), key_offsets.begin()); + + // adjust the cumulative info such that for each row count, the size includes any pages that span + // that row count. this is so that if we have this case: + // page row counts + // Column A: 0 <----> 100 <----> 200 + // Column B: 0 <---------------> 200 <--------> 400 + // | + // if we decide to split at row 100, we don't really know the actual amount of bytes in column B + // at that point. So we have to proceed as if we are taking the bytes from all 200 rows of that + // page. 
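To make the splitting concrete, a small worked example with invented numbers: suppose _output_chunk_read_limit is 100 MB and the aggregated cumulative entries {row_count, size_bytes} come out as {100, 60 MB}, {200, 110 MB}, {300, 150 MB}, {400, 230 MB}. find_splits searches for the point where the size accumulated since the last split reaches 100 MB, lands on {200, 110 MB}, steps back because 110 MB exceeds the limit, and emits {skip_rows 0, num_rows 100} with 60 MB as the new baseline. Relative to that baseline the remaining entries weigh 50/90/170 MB, so the next split becomes {100, 200} with a 150 MB baseline, and the final 80 MB tail becomes {300, 100}. Each emitted pair ends up as one entry of output_chunk_read_info, i.e. one table handed back by a read_chunk() call.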
+ // + rmm::device_uvector aggregated_info(c_info.size(), _stream); + thrust::transform(rmm::exec_policy(_stream), + c_info_sorted.begin(), + c_info_sorted.end(), + aggregated_info.begin(), + row_total_size{c_info.data(), key_offsets.data(), num_unique_keys}); + + // bring back to the cpu + std::vector h_aggregated_info(aggregated_info.size()); + CUDF_CUDA_TRY(cudaMemcpyAsync(h_aggregated_info.data(), + aggregated_info.data(), + sizeof(cumulative_row_info) * c_info.size(), + cudaMemcpyDefault, + _stream.value())); + _stream.synchronize(); + + // generate the actual splits + _pass_itm_data->output_chunk_read_info = + find_splits(h_aggregated_info, num_rows, _output_chunk_read_limit); +} + +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_chunking.hpp b/cpp/src/io/parquet/reader_impl_chunking.hpp new file mode 100644 index 00000000000..dfc239d8451 --- /dev/null +++ b/cpp/src/io/parquet/reader_impl_chunking.hpp @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "reader_impl_helpers.hpp" + +#include + +namespace cudf::io::parquet::detail { + +/** + * @brief Struct to store file-level data that remains constant for + * all passes/chunks in the file. + */ +struct file_intermediate_data { + // all row groups to read + std::vector row_groups{}; + + // all chunks from the selected row groups. We may end up reading these chunks progressively + // instead of all at once + std::vector chunks{}; + + // an array of offsets into _file_itm_data::global_chunks. Each pair of offsets represents + // the start/end of the chunks to be loaded for a given pass. + std::vector input_pass_row_group_offsets{}; + // row counts per input-pass + std::vector input_pass_row_count{}; + + // skip_rows/num_rows values for the entire file. these need to be adjusted per-pass because we + // may not be visiting every row group that contains these bounds + size_t global_skip_rows; + size_t global_num_rows; +}; + +/** + * @brief Struct to identify the range for each chunk of rows during a chunked reading pass. + */ +struct chunk_read_info { + size_t skip_rows; + size_t num_rows; +}; + +/** + * @brief Struct to store pass-level data that remains constant for a single pass. + */ +struct pass_intermediate_data { + std::vector> raw_page_data; + rmm::device_buffer decomp_page_data; + + // rowgroup, chunk and page information for the current pass. 
+ std::vector row_groups{}; + cudf::detail::hostdevice_vector chunks{}; + cudf::detail::hostdevice_vector pages_info{}; + cudf::detail::hostdevice_vector page_nesting_info{}; + cudf::detail::hostdevice_vector page_nesting_decode_info{}; + + rmm::device_uvector page_keys{0, rmm::cuda_stream_default}; + rmm::device_uvector page_index{0, rmm::cuda_stream_default}; + rmm::device_uvector str_dict_index{0, rmm::cuda_stream_default}; + + std::vector output_chunk_read_info; + std::size_t current_output_chunk{0}; + + rmm::device_buffer level_decode_data{}; + int level_type_size{0}; + + // skip_rows and num_rows values for this particular pass. these may be adjusted values from the + // global values stored in file_intermediate_data. + size_t skip_rows; + size_t num_rows; +}; + +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index fcaa610fbb7..a9c84143e1a 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -21,50 +21,48 @@ #include #include -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { namespace { -ConvertedType logical_type_to_converted_type(LogicalType const& logical) +ConvertedType logical_type_to_converted_type(thrust::optional const& logical) { - if (logical.isset.STRING) { - return parquet::UTF8; - } else if (logical.isset.MAP) { - return parquet::MAP; - } else if (logical.isset.LIST) { - return parquet::LIST; - } else if (logical.isset.ENUM) { - return parquet::ENUM; - } else if (logical.isset.DECIMAL) { - return parquet::DECIMAL; // TODO set decimal values - } else if (logical.isset.DATE) { - return parquet::DATE; - } else if (logical.isset.TIME) { - if (logical.TIME.unit.isset.MILLIS) - return parquet::TIME_MILLIS; - else if (logical.TIME.unit.isset.MICROS) - return parquet::TIME_MICROS; - } else if (logical.isset.TIMESTAMP) { - if (logical.TIMESTAMP.unit.isset.MILLIS) - return parquet::TIMESTAMP_MILLIS; - else if (logical.TIMESTAMP.unit.isset.MICROS) - return parquet::TIMESTAMP_MICROS; - } else if (logical.isset.INTEGER) { - switch (logical.INTEGER.bitWidth) { - case 8: return logical.INTEGER.isSigned ? INT_8 : UINT_8; - case 16: return logical.INTEGER.isSigned ? INT_16 : UINT_16; - case 32: return logical.INTEGER.isSigned ? INT_32 : UINT_32; - case 64: return logical.INTEGER.isSigned ? INT_64 : UINT_64; - default: break; - } - } else if (logical.isset.UNKNOWN) { - return parquet::NA; - } else if (logical.isset.JSON) { - return parquet::JSON; - } else if (logical.isset.BSON) { - return parquet::BSON; + if (not logical.has_value()) { return UNKNOWN; } + switch (logical->type) { + case LogicalType::STRING: return UTF8; + case LogicalType::MAP: return MAP; + case LogicalType::LIST: return LIST; + case LogicalType::ENUM: return ENUM; + case LogicalType::DECIMAL: return DECIMAL; // TODO use decimal scale/precision + case LogicalType::DATE: return DATE; + case LogicalType::TIME: + if (logical->is_time_millis()) { + return TIME_MILLIS; + } else if (logical->is_time_micros()) { + return TIME_MICROS; + } + break; + case LogicalType::TIMESTAMP: + if (logical->is_timestamp_millis()) { + return TIMESTAMP_MILLIS; + } else if (logical->is_timestamp_micros()) { + return TIMESTAMP_MICROS; + } + break; + case LogicalType::INTEGER: + switch (logical->bit_width()) { + case 8: return logical->is_signed() ? INT_8 : UINT_8; + case 16: return logical->is_signed() ? INT_16 : UINT_16; + case 32: return logical->is_signed() ? 
INT_32 : UINT_32; + case 64: return logical->is_signed() ? INT_64 : UINT_64; + default: break; + } + case LogicalType::UNKNOWN: return NA; + case LogicalType::JSON: return JSON; + case LogicalType::BSON: return BSON; + default: break; } - return parquet::UNKNOWN; + return UNKNOWN; } } // namespace @@ -76,39 +74,39 @@ type_id to_type_id(SchemaElement const& schema, bool strings_to_categorical, type_id timestamp_type_id) { - parquet::Type const physical = schema.type; - parquet::LogicalType const logical_type = schema.logical_type; - parquet::ConvertedType converted_type = schema.converted_type; - int32_t decimal_precision = schema.decimal_precision; + auto const physical = schema.type; + auto const logical_type = schema.logical_type; + auto converted_type = schema.converted_type; + int32_t decimal_precision = schema.decimal_precision; + // FIXME(ets): this should just use logical type to deduce the type_id. then fall back to + // converted_type if logical_type isn't set // Logical type used for actual data interpretation; the legacy converted type // is superseded by 'logical' type whenever available. auto const inferred_converted_type = logical_type_to_converted_type(logical_type); - if (inferred_converted_type != parquet::UNKNOWN) { converted_type = inferred_converted_type; } - if (inferred_converted_type == parquet::DECIMAL) { - decimal_precision = schema.logical_type.DECIMAL.precision; - } - - switch (converted_type) { - case parquet::UINT_8: return type_id::UINT8; - case parquet::INT_8: return type_id::INT8; - case parquet::UINT_16: return type_id::UINT16; - case parquet::INT_16: return type_id::INT16; - case parquet::UINT_32: return type_id::UINT32; - case parquet::UINT_64: return type_id::UINT64; - case parquet::DATE: return type_id::TIMESTAMP_DAYS; - case parquet::TIME_MILLIS: return type_id::DURATION_MILLISECONDS; - case parquet::TIME_MICROS: return type_id::DURATION_MICROSECONDS; - case parquet::TIMESTAMP_MILLIS: + if (inferred_converted_type != UNKNOWN) { converted_type = inferred_converted_type; } + if (inferred_converted_type == DECIMAL) { decimal_precision = schema.logical_type->precision(); } + + switch (converted_type.value_or(UNKNOWN)) { + case UINT_8: return type_id::UINT8; + case INT_8: return type_id::INT8; + case UINT_16: return type_id::UINT16; + case INT_16: return type_id::INT16; + case UINT_32: return type_id::UINT32; + case UINT_64: return type_id::UINT64; + case DATE: return type_id::TIMESTAMP_DAYS; + case TIME_MILLIS: return type_id::DURATION_MILLISECONDS; + case TIME_MICROS: return type_id::DURATION_MICROSECONDS; + case TIMESTAMP_MILLIS: return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id : type_id::TIMESTAMP_MILLISECONDS; - case parquet::TIMESTAMP_MICROS: + case TIMESTAMP_MICROS: return (timestamp_type_id != type_id::EMPTY) ? 
timestamp_type_id : type_id::TIMESTAMP_MICROSECONDS; - case parquet::DECIMAL: - if (physical == parquet::INT32) { return type_id::DECIMAL32; } - if (physical == parquet::INT64) { return type_id::DECIMAL64; } - if (physical == parquet::FIXED_LEN_BYTE_ARRAY) { + case DECIMAL: + if (physical == INT32) { return type_id::DECIMAL32; } + if (physical == INT64) { return type_id::DECIMAL64; } + if (physical == FIXED_LEN_BYTE_ARRAY) { if (schema.type_length <= static_cast(sizeof(int32_t))) { return type_id::DECIMAL32; } @@ -119,7 +117,7 @@ type_id to_type_id(SchemaElement const& schema, return type_id::DECIMAL128; } } - if (physical == parquet::BYTE_ARRAY) { + if (physical == BYTE_ARRAY) { CUDF_EXPECTS(decimal_precision <= MAX_DECIMAL128_PRECISION, "Invalid decimal precision"); if (decimal_precision <= MAX_DECIMAL32_PRECISION) { return type_id::DECIMAL32; @@ -133,22 +131,20 @@ type_id to_type_id(SchemaElement const& schema, break; // maps are just List>. - case parquet::MAP: - case parquet::LIST: return type_id::LIST; - case parquet::NA: return type_id::STRING; + case MAP: + case LIST: return type_id::LIST; + case NA: return type_id::STRING; // return type_id::EMPTY; //TODO(kn): enable after Null/Empty column support default: break; } - if (inferred_converted_type == parquet::UNKNOWN and physical == parquet::INT64 and - logical_type.TIMESTAMP.unit.isset.NANOS) { - return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id - : type_id::TIMESTAMP_NANOSECONDS; - } - - if (inferred_converted_type == parquet::UNKNOWN and physical == parquet::INT64 and - logical_type.TIME.unit.isset.NANOS) { - return type_id::DURATION_NANOSECONDS; + if (inferred_converted_type == UNKNOWN and physical == INT64 and logical_type.has_value()) { + if (logical_type->is_timestamp_nanos()) { + return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id + : type_id::TIMESTAMP_NANOSECONDS; + } else if (logical_type->is_time_nanos()) { + return type_id::DURATION_NANOSECONDS; + } } // is it simply a struct? @@ -157,16 +153,16 @@ type_id to_type_id(SchemaElement const& schema, // Physical storage type supported by Parquet; controls the on-disk storage // format in combination with the encoding type. switch (physical) { - case parquet::BOOLEAN: return type_id::BOOL8; - case parquet::INT32: return type_id::INT32; - case parquet::INT64: return type_id::INT64; - case parquet::FLOAT: return type_id::FLOAT32; - case parquet::DOUBLE: return type_id::FLOAT64; - case parquet::BYTE_ARRAY: - case parquet::FIXED_LEN_BYTE_ARRAY: + case BOOLEAN: return type_id::BOOL8; + case INT32: return type_id::INT32; + case INT64: return type_id::INT64; + case FLOAT: return type_id::FLOAT32; + case DOUBLE: return type_id::FLOAT64; + case BYTE_ARRAY: + case FIXED_LEN_BYTE_ARRAY: // Can be mapped to INT32 (32-bit hash) or STRING return strings_to_categorical ? type_id::INT32 : type_id::STRING; - case parquet::INT96: + case INT96: return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id : type_id::TIMESTAMP_NANOSECONDS; default: break; @@ -175,6 +171,81 @@ type_id to_type_id(SchemaElement const& schema, return type_id::EMPTY; } +void metadata::sanitize_schema() +{ + // Parquet isn't very strict about incoming metadata. Lots of things can and should be inferred. + // There are also a lot of rules that simply aren't followed and are expected to be worked around. + // This step sanitizes the metadata to something that isn't ambiguous. 
+ // + // Take, for example, the following schema: + // + // required group field_id=-1 user { + // required int32 field_id=-1 id; + // optional group field_id=-1 phoneNumbers { + // repeated group field_id=-1 phone { + // required int64 field_id=-1 number; + // optional binary field_id=-1 kind (String); + // } + // } + // } + // + // This real-world example has no annotations telling us what is a list or a struct. On the + // surface this looks like a column of id's and a column of list>, but this + // actually should be interpreted as a struct>>. The phoneNumbers field + // has to be a struct because it is a group with no repeated tag and we have no annotation. The + // repeated group is actually BOTH a struct due to the multiple children and a list due to + // repeated. + // + // This code attempts to make this less messy for the code that follows. + + std::function process = [&](size_t schema_idx) -> void { + if (schema_idx < 0) { return; } + auto& schema_elem = schema[schema_idx]; + if (schema_idx != 0 && schema_elem.type == UNDEFINED_TYPE) { + auto const parent_type = schema[schema_elem.parent_idx].converted_type; + if (schema_elem.repetition_type == REPEATED && schema_elem.num_children > 1 && + parent_type != LIST && parent_type != MAP) { + // This is a list of structs, so we need to mark this as a list, but also + // add a struct child and move this element's children to the struct + schema_elem.converted_type = LIST; + schema_elem.repetition_type = OPTIONAL; + auto const struct_node_idx = static_cast(schema.size()); + + SchemaElement struct_elem; + struct_elem.name = "struct_node"; + struct_elem.repetition_type = REQUIRED; + struct_elem.num_children = schema_elem.num_children; + struct_elem.type = UNDEFINED_TYPE; + struct_elem.converted_type = UNKNOWN; + + // swap children + struct_elem.children_idx = std::move(schema_elem.children_idx); + schema_elem.children_idx = {struct_node_idx}; + schema_elem.num_children = 1; + + struct_elem.max_definition_level = schema_elem.max_definition_level; + struct_elem.max_repetition_level = schema_elem.max_repetition_level; + schema_elem.max_definition_level--; + schema_elem.max_repetition_level = schema[schema_elem.parent_idx].max_repetition_level; + + // change parent index on new node and on children + struct_elem.parent_idx = schema_idx; + for (auto& child_idx : struct_elem.children_idx) { + schema[child_idx].parent_idx = struct_node_idx; + } + // add our struct + schema.push_back(struct_elem); + } + } + + for (auto& child_idx : schema_elem.children_idx) { + process(child_idx); + } + }; + + process(0); +} + metadata::metadata(datasource* source) { constexpr auto header_len = sizeof(file_header_s); @@ -195,6 +266,7 @@ metadata::metadata(datasource* source) CompactProtocolReader cp(buffer->data(), ender->footer_len); CUDF_EXPECTS(cp.read(this), "Cannot parse metadata"); CUDF_EXPECTS(cp.InitSchema(this), "Cannot initialize schema"); + sanitize_schema(); } std::vector aggregate_reader_metadata::metadatas_from_sources( @@ -344,7 +416,7 @@ std::vector aggregate_reader_metadata::get_pandas_index_names() con return names; } -std::tuple> +std::tuple> aggregate_reader_metadata::select_row_groups( host_span const> row_group_indices, int64_t skip_rows_opt, @@ -362,7 +434,7 @@ aggregate_reader_metadata::select_row_groups( host_span const>(filtered_row_group_indices.value()); } } - std::vector selection; + std::vector selection; auto [rows_to_skip, rows_to_read] = [&]() { if (not row_group_indices.empty()) { return std::pair{}; } auto const from_opts = 
cudf::io::detail::skip_rows_num_rows_from_options( @@ -402,7 +474,7 @@ aggregate_reader_metadata::select_row_groups( } std::tuple, - std::vector, + std::vector, std::vector> aggregate_reader_metadata::select_columns(std::optional> const& use_names, bool include_index, @@ -420,17 +492,18 @@ aggregate_reader_metadata::select_columns(std::optional : -1; }; - std::vector output_columns; + std::vector output_columns; std::vector input_columns; std::vector nesting; // Return true if column path is valid. e.g. if the path is {"struct1", "child1"}, then it is // valid if "struct1.child1" exists in this file's schema. If "struct1" exists but "child1" is // not a child of "struct1" then the function will return false for "struct1" - std::function&, bool)> + std::function&, bool)> build_column = [&](column_name_info const* col_name_info, int schema_idx, - std::vector& out_col_array, + std::vector& out_col_array, bool has_list_parent) { if (schema_idx < 0) { return false; } auto const& schema_elem = get_schema(schema_idx); @@ -445,13 +518,16 @@ aggregate_reader_metadata::select_columns(std::optional child_col_name_info, schema_elem.children_idx[0], out_col_array, has_list_parent); } + auto const one_level_list = schema_elem.is_one_level_list(get_schema(schema_elem.parent_idx)); + // if we're at the root, this is a new output column - auto const col_type = schema_elem.is_one_level_list(get_schema(schema_elem.parent_idx)) + auto const col_type = one_level_list ? type_id::LIST : to_type_id(schema_elem, strings_to_categorical, timestamp_type_id); auto const dtype = to_data_type(col_type, schema_elem); - inline_column_buffer output_col(dtype, schema_elem.repetition_type == OPTIONAL); + cudf::io::detail::inline_column_buffer output_col(dtype, + schema_elem.repetition_type == OPTIONAL); if (has_list_parent) { output_col.user_data |= PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT; } // store the index of this element if inserted in out_col_array nesting.push_back(static_cast(out_col_array.size())); @@ -485,13 +561,14 @@ aggregate_reader_metadata::select_columns(std::optional input_column_info{schema_idx, schema_elem.name, schema_elem.max_repetition_level > 0}); // set up child output column for one-level encoding list - if (schema_elem.is_one_level_list(get_schema(schema_elem.parent_idx))) { + if (one_level_list) { // determine the element data type auto const element_type = to_type_id(schema_elem, strings_to_categorical, timestamp_type_id); auto const element_dtype = to_data_type(element_type, schema_elem); - inline_column_buffer element_col(element_dtype, schema_elem.repetition_type == OPTIONAL); + cudf::io::detail::inline_column_buffer element_col( + element_dtype, schema_elem.repetition_type == OPTIONAL); if (has_list_parent || col_type == type_id::LIST) { element_col.user_data |= PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT; } @@ -506,9 +583,7 @@ aggregate_reader_metadata::select_columns(std::optional std::copy(nesting.cbegin(), nesting.cend(), std::back_inserter(input_col.nesting)); // pop off the extra nesting element. 
- if (schema_elem.is_one_level_list(get_schema(schema_elem.parent_idx))) { - nesting.pop_back(); - } + if (one_level_list) { nesting.pop_back(); } path_is_valid = true; // If we're able to reach leaf then path is valid } @@ -656,4 +731,4 @@ aggregate_reader_metadata::select_columns(std::optional std::move(input_columns), std::move(output_columns), std::move(output_column_schemas)); } -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp index 61e4f94df0f..8d8ab8707be 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.hpp +++ b/cpp/src/io/parquet/reader_impl_helpers.hpp @@ -32,9 +32,24 @@ #include #include -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { -using namespace cudf::io::parquet; +/** + * @brief The row_group_info class + */ +struct row_group_info { + size_type index; // row group index within a file. aggregate_reader_metadata::get_row_group() is + // called with index and source_index + size_t start_row; + size_type source_index; // file index. + + row_group_info() = default; + + row_group_info(size_type index, size_t start_row, size_type source_index) + : index{index}, start_row{start_row}, source_index{source_index} + { + } +}; /** * @brief Function that translates Parquet datatype to cuDF type enum @@ -58,6 +73,7 @@ using namespace cudf::io::parquet; */ struct metadata : public FileMetaData { explicit metadata(datasource* source); + void sanitize_schema(); }; class aggregate_reader_metadata { @@ -181,7 +197,7 @@ class aggregate_reader_metadata { * @return A tuple of corrected row_start, row_count and list of row group indexes and its * starting row */ - [[nodiscard]] std::tuple> select_row_groups( + [[nodiscard]] std::tuple> select_row_groups( host_span const> row_group_indices, int64_t row_start, std::optional const& row_count, @@ -201,12 +217,13 @@ class aggregate_reader_metadata { * @return input column information, output column information, list of output column schema * indices */ - [[nodiscard]] std:: - tuple, std::vector, std::vector> - select_columns(std::optional> const& use_names, - bool include_index, - bool strings_to_categorical, - type_id timestamp_type_id) const; + [[nodiscard]] std::tuple, + std::vector, + std::vector> + select_columns(std::optional> const& use_names, + bool include_index, + bool strings_to_categorical, + type_id timestamp_type_id) const; }; /** @@ -275,4 +292,4 @@ class named_to_reference_converter : public ast::detail::expression_transformer std::list _operators; }; -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index c731c467f2c..0bc492546e9 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -14,11 +14,11 @@ * limitations under the License. */ +#include "error.hpp" #include "reader_impl.hpp" #include #include -#include #include #include @@ -43,7 +43,7 @@ #include -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { namespace { /** @@ -169,46 +169,6 @@ void generate_depth_remappings(std::map, std::ve } } -/** - * @brief Return the required number of bits to store a value. 
- */ -template -[[nodiscard]] T required_bits(uint32_t max_level) -{ - return static_cast(CompactProtocolReader::NumRequiredBits(max_level)); -} - -/** - * @brief Converts cuDF units to Parquet units. - * - * @return A tuple of Parquet type width, Parquet clock rate and Parquet decimal type. - */ -[[nodiscard]] std::tuple conversion_info(type_id column_type_id, - type_id timestamp_type_id, - parquet::Type physical, - int8_t converted, - int32_t length) -{ - int32_t type_width = (physical == parquet::FIXED_LEN_BYTE_ARRAY) ? length : 0; - int32_t clock_rate = 0; - if (column_type_id == type_id::INT8 or column_type_id == type_id::UINT8) { - type_width = 1; // I32 -> I8 - } else if (column_type_id == type_id::INT16 or column_type_id == type_id::UINT16) { - type_width = 2; // I32 -> I16 - } else if (column_type_id == type_id::INT32) { - type_width = 4; // str -> hash32 - } else if (is_chrono(data_type{column_type_id})) { - clock_rate = to_clockrate(timestamp_type_id); - } - - int8_t converted_type = converted; - if (converted_type == parquet::DECIMAL && column_type_id != type_id::FLOAT64 && - not cudf::is_fixed_point(data_type{column_type_id})) { - converted_type = parquet::UNKNOWN; // Not converting to float64 or decimal - } - return std::make_tuple(type_width, clock_rate, converted_type); -} - /** * @brief Reads compressed page data to device memory. * @@ -226,7 +186,7 @@ template [[nodiscard]] std::future read_column_chunks_async( std::vector> const& sources, std::vector>& page_data, - cudf::detail::hostdevice_vector& chunks, + cudf::detail::hostdevice_vector& chunks, size_t begin_chunk, size_t end_chunk, std::vector const& column_chunk_offsets, @@ -239,11 +199,10 @@ template size_t const io_offset = column_chunk_offsets[chunk]; size_t io_size = chunks[chunk].compressed_size; size_t next_chunk = chunk + 1; - bool const is_compressed = (chunks[chunk].codec != parquet::Compression::UNCOMPRESSED); + bool const is_compressed = (chunks[chunk].codec != Compression::UNCOMPRESSED); while (next_chunk < end_chunk) { - size_t const next_offset = column_chunk_offsets[next_chunk]; - bool const is_next_compressed = - (chunks[next_chunk].codec != parquet::Compression::UNCOMPRESSED); + size_t const next_offset = column_chunk_offsets[next_chunk]; + bool const is_next_compressed = (chunks[next_chunk].codec != Compression::UNCOMPRESSED); if (next_offset != io_offset + io_size || is_next_compressed != is_compressed || chunk_source_map[chunk] != chunk_source_map[next_chunk]) { // Can't merge if not contiguous or mixing compressed and uncompressed @@ -300,15 +259,20 @@ template * * @return The total number of pages */ -[[nodiscard]] size_t count_page_headers( - cudf::detail::hostdevice_vector& chunks, rmm::cuda_stream_view stream) +[[nodiscard]] size_t count_page_headers(cudf::detail::hostdevice_vector& chunks, + rmm::cuda_stream_view stream) { size_t total_pages = 0; + kernel_error error_code(stream); chunks.host_to_device_async(stream); - gpu::DecodePageHeaders(chunks.device_ptr(), chunks.size(), stream); + DecodePageHeaders(chunks.device_ptr(), chunks.size(), error_code.data(), stream); chunks.device_to_host_sync(stream); + if (error_code.value() != 0) { + CUDF_FAIL("Parquet header parsing failed with code(s) " + error_code.str()); + } + for (size_t c = 0; c < chunks.size(); c++) { total_pages += chunks[c].num_data_pages + chunks[c].num_dict_pages; } @@ -316,19 +280,6 @@ template return total_pages; } -// see setupLocalPageInfo() in page_data.cu for supported page encodings -constexpr bool 
is_supported_encoding(Encoding enc) -{ - switch (enc) { - case Encoding::PLAIN: - case Encoding::PLAIN_DICTIONARY: - case Encoding::RLE: - case Encoding::RLE_DICTIONARY: - case Encoding::DELTA_BINARY_PACKED: return true; - default: return false; - } -} - /** * @brief Decode the page information from the given column chunks. * @@ -337,8 +288,8 @@ constexpr bool is_supported_encoding(Encoding enc) * @param stream CUDA stream used for device memory operations and kernel launches * @returns The size in bytes of level type data required */ -int decode_page_headers(cudf::detail::hostdevice_vector& chunks, - cudf::detail::hostdevice_vector& pages, +int decode_page_headers(cudf::detail::hostdevice_vector& chunks, + cudf::detail::hostdevice_vector& pages, rmm::cuda_stream_view stream) { // IMPORTANT : if you change how pages are stored within a chunk (dist pages, then data pages), @@ -349,33 +300,30 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c page_count += chunks[c].max_num_pages; } + kernel_error error_code(stream); chunks.host_to_device_async(stream); - gpu::DecodePageHeaders(chunks.device_ptr(), chunks.size(), stream); + DecodePageHeaders(chunks.device_ptr(), chunks.size(), error_code.data(), stream); + + if (error_code.value() != 0) { + // TODO(ets): if an unsupported encoding was detected, do extra work to figure out which one + CUDF_FAIL("Parquet header parsing failed with code(s)" + error_code.str()); + } // compute max bytes needed for level data - auto level_bit_size = - cudf::detail::make_counting_transform_iterator(0, [chunks = chunks.begin()] __device__(int i) { + auto level_bit_size = cudf::detail::make_counting_transform_iterator( + 0, [chunks = chunks.d_begin()] __device__(int i) { auto c = chunks[i]; return static_cast( - max(c.level_bits[gpu::level_type::REPETITION], c.level_bits[gpu::level_type::DEFINITION])); + max(c.level_bits[level_type::REPETITION], c.level_bits[level_type::DEFINITION])); }); // max level data bit size. 
- int const max_level_bits = thrust::reduce(rmm::exec_policy(stream), + int const max_level_bits = thrust::reduce(rmm::exec_policy(stream), level_bit_size, level_bit_size + chunks.size(), 0, thrust::maximum()); - auto const level_type_size = std::max(1, cudf::util::div_rounding_up_safe(max_level_bits, 8)); - - pages.device_to_host_sync(stream); - - // validate page encodings - CUDF_EXPECTS(std::all_of(pages.begin(), - pages.end(), - [](auto const& page) { return is_supported_encoding(page.encoding); }), - "Unsupported page encoding detected"); - return level_type_size; + return std::max(1, cudf::util::div_rounding_up_safe(max_level_bits, 8)); } /** @@ -388,11 +336,11 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c * @return Device buffer to decompressed page data */ [[nodiscard]] rmm::device_buffer decompress_page_data( - cudf::detail::hostdevice_vector& chunks, - cudf::detail::hostdevice_vector& pages, + cudf::detail::hostdevice_vector& chunks, + cudf::detail::hostdevice_vector& pages, rmm::cuda_stream_view stream) { - auto for_each_codec_page = [&](parquet::Compression codec, std::function const& f) { + auto for_each_codec_page = [&](Compression codec, std::function const& f) { for (size_t c = 0, page_count = 0; c < chunks.size(); c++) { const auto page_stride = chunks[c].max_num_pages; if (chunks[c].codec == codec) { @@ -412,19 +360,16 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c size_t total_decomp_size = 0; struct codec_stats { - parquet::Compression compression_type = UNCOMPRESSED; - size_t num_pages = 0; - int32_t max_decompressed_size = 0; - size_t total_decomp_size = 0; + Compression compression_type = UNCOMPRESSED; + size_t num_pages = 0; + int32_t max_decompressed_size = 0; + size_t total_decomp_size = 0; }; - std::array codecs{codec_stats{parquet::GZIP}, - codec_stats{parquet::SNAPPY}, - codec_stats{parquet::BROTLI}, - codec_stats{parquet::ZSTD}}; + std::array codecs{codec_stats{GZIP}, codec_stats{SNAPPY}, codec_stats{BROTLI}, codec_stats{ZSTD}}; auto is_codec_supported = [&codecs](int8_t codec) { - if (codec == parquet::UNCOMPRESSED) return true; + if (codec == UNCOMPRESSED) return true; return std::find_if(codecs.begin(), codecs.end(), [codec](auto& cstats) { return codec == cstats.compression_type; }) != codecs.end(); @@ -445,7 +390,7 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c codec.num_pages++; num_comp_pages++; }); - if (codec.compression_type == parquet::BROTLI && codec.num_pages > 0) { + if (codec.compression_type == BROTLI && codec.num_pages > 0) { debrotli_scratch.resize(get_gpu_debrotli_scratch_size(codec.num_pages), stream); } } @@ -482,7 +427,7 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c auto& page = pages[page_idx]; // offset will only be non-zero for V2 pages auto const offset = - page.lvl_bytes[gpu::level_type::DEFINITION] + page.lvl_bytes[gpu::level_type::REPETITION]; + page.lvl_bytes[level_type::DEFINITION] + page.lvl_bytes[level_type::REPETITION]; // for V2 need to copy def and rep level info into place, and then offset the // input and output buffers. otherwise we'd have to keep both the compressed // and decompressed data. 
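A quick worked example for the level_type_size computation in the decode_page_headers hunk above: each chunk contributes the larger of its definition- and repetition-level bit widths, the reduce takes the maximum over all chunks, and the result is rounded up to whole bytes with a floor of 1. A file whose deepest column has max_definition_level = 5 and max_repetition_level = 3 needs NumRequiredBits(5) = 3 bits, giving level_type_size = max(1, ceil(3 / 8)) = 1 byte per level value; only a schema whose level values exceed 255 (more than 8 bits) pushes this to 2, which is why the value is threaded through to every decode kernel rather than hard-coded.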
@@ -509,11 +454,11 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c device_span d_comp_res_view(comp_res.data() + start_pos, codec.num_pages); switch (codec.compression_type) { - case parquet::GZIP: + case GZIP: gpuinflate(d_comp_in, d_comp_out, d_comp_res_view, gzip_header_included::YES, stream); break; - case parquet::SNAPPY: - if (nvcomp_integration::is_stable_enabled()) { + case SNAPPY: + if (cudf::io::detail::nvcomp_integration::is_stable_enabled()) { nvcomp::batched_decompress(nvcomp::compression_type::SNAPPY, d_comp_in, d_comp_out, @@ -525,7 +470,7 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c gpu_unsnap(d_comp_in, d_comp_out, d_comp_res_view, stream); } break; - case parquet::ZSTD: + case ZSTD: nvcomp::batched_decompress(nvcomp::compression_type::ZSTD, d_comp_in, d_comp_out, @@ -534,7 +479,7 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c codec.total_decomp_size, stream); break; - case parquet::BROTLI: + case BROTLI: gpu_debrotli(d_comp_in, d_comp_out, d_comp_res_view, @@ -594,9 +539,9 @@ void reader::impl::allocate_nesting_info() }); page_nesting_info = - cudf::detail::hostdevice_vector{total_page_nesting_infos, _stream}; + cudf::detail::hostdevice_vector{total_page_nesting_infos, _stream}; page_nesting_decode_info = - cudf::detail::hostdevice_vector{total_page_nesting_infos, _stream}; + cudf::detail::hostdevice_vector{total_page_nesting_infos, _stream}; // update pointers in the PageInfos int target_page_index = 0; @@ -653,10 +598,10 @@ void reader::impl::allocate_nesting_info() if (!cur_schema.is_stub()) { // initialize each page within the chunk for (int p_idx = 0; p_idx < chunks[idx].num_data_pages; p_idx++) { - gpu::PageNestingInfo* pni = + PageNestingInfo* pni = &page_nesting_info[nesting_info_index + (p_idx * per_page_nesting_info_size)]; - gpu::PageNestingDecodeInfo* nesting_info = + PageNestingDecodeInfo* nesting_info = &page_nesting_decode_info[nesting_info_index + (p_idx * per_page_nesting_info_size)]; // if we have lists, set our start and end depth remappings @@ -717,9 +662,9 @@ void reader::impl::allocate_level_decode_space() for (size_t idx = 0; idx < pages.size(); idx++) { auto& p = pages[idx]; - p.lvl_decode_buf[gpu::level_type::DEFINITION] = buf; + p.lvl_decode_buf[level_type::DEFINITION] = buf; buf += (LEVEL_DECODE_BUF_SIZE * _pass_itm_data->level_type_size); - p.lvl_decode_buf[gpu::level_type::REPETITION] = buf; + p.lvl_decode_buf[level_type::REPETITION] = buf; buf += (LEVEL_DECODE_BUF_SIZE * _pass_itm_data->level_type_size); } } @@ -793,164 +738,6 @@ std::pair>> reader::impl::read_and_decompres return {total_decompressed_size > 0, std::move(read_chunk_tasks)}; } -void reader::impl::load_global_chunk_info() -{ - auto const num_rows = _file_itm_data.global_num_rows; - auto const& row_groups_info = _file_itm_data.row_groups; - auto& chunks = _file_itm_data.chunks; - - // Descriptors for all the chunks that make up the selected columns - auto const num_input_columns = _input_columns.size(); - auto const num_chunks = row_groups_info.size() * num_input_columns; - - // Initialize column chunk information - auto remaining_rows = num_rows; - for (auto const& rg : row_groups_info) { - auto const& row_group = _metadata->get_row_group(rg.index, rg.source_index); - auto const row_group_start = rg.start_row; - auto const row_group_rows = std::min(remaining_rows, row_group.num_rows); - - // generate ColumnChunkDesc objects for everything to be decoded (all input columns) - for (size_t i = 0; i < num_input_columns; ++i) { - auto col 
= _input_columns[i]; - // look up metadata - auto& col_meta = _metadata->get_column_metadata(rg.index, rg.source_index, col.schema_idx); - auto& schema = _metadata->get_schema(col.schema_idx); - - auto [type_width, clock_rate, converted_type] = - conversion_info(to_type_id(schema, _strings_to_categorical, _timestamp_type.id()), - _timestamp_type.id(), - schema.type, - schema.converted_type, - schema.type_length); - - chunks.push_back(gpu::ColumnChunkDesc(col_meta.total_compressed_size, - nullptr, - col_meta.num_values, - schema.type, - type_width, - row_group_start, - row_group_rows, - schema.max_definition_level, - schema.max_repetition_level, - _metadata->get_output_nesting_depth(col.schema_idx), - required_bits(schema.max_definition_level), - required_bits(schema.max_repetition_level), - col_meta.codec, - converted_type, - schema.logical_type, - schema.decimal_precision, - clock_rate, - i, - col.schema_idx)); - } - - remaining_rows -= row_group_rows; - } -} - -void reader::impl::compute_input_pass_row_group_info() -{ - // at this point, row_groups has already been filtered down to just the row groups we need to - // handle optional skip_rows/num_rows parameters. - auto const& row_groups_info = _file_itm_data.row_groups; - - // if the user hasn't specified an input size limit, read everything in a single pass. - if (_input_pass_read_limit == 0) { - _input_pass_row_group_offsets.push_back(0); - _input_pass_row_group_offsets.push_back(row_groups_info.size()); - return; - } - - // generate passes. make sure to account for the case where a single row group doesn't fit within - // - std::size_t const read_limit = - _input_pass_read_limit > 0 ? _input_pass_read_limit : std::numeric_limits::max(); - std::size_t cur_pass_byte_size = 0; - std::size_t cur_rg_start = 0; - std::size_t cur_row_count = 0; - _input_pass_row_group_offsets.push_back(0); - _input_pass_row_count.push_back(0); - - for (size_t cur_rg_index = 0; cur_rg_index < row_groups_info.size(); cur_rg_index++) { - auto const& rgi = row_groups_info[cur_rg_index]; - auto const& row_group = _metadata->get_row_group(rgi.index, rgi.source_index); - - // can we add this row group - if (cur_pass_byte_size + row_group.total_byte_size >= read_limit) { - // A single row group (the current one) is larger than the read limit: - // We always need to include at least one row group, so end the pass at the end of the current - // row group - if (cur_rg_start == cur_rg_index) { - _input_pass_row_group_offsets.push_back(cur_rg_index + 1); - _input_pass_row_count.push_back(cur_row_count + row_group.num_rows); - cur_rg_start = cur_rg_index + 1; - cur_pass_byte_size = 0; - } - // End the pass at the end of the previous row group - else { - _input_pass_row_group_offsets.push_back(cur_rg_index); - _input_pass_row_count.push_back(cur_row_count); - cur_rg_start = cur_rg_index; - cur_pass_byte_size = row_group.total_byte_size; - } - } else { - cur_pass_byte_size += row_group.total_byte_size; - } - cur_row_count += row_group.num_rows; - } - // add the last pass if necessary - if (_input_pass_row_group_offsets.back() != row_groups_info.size()) { - _input_pass_row_group_offsets.push_back(row_groups_info.size()); - _input_pass_row_count.push_back(cur_row_count); - } -} - -void reader::impl::setup_pass() -{ - // this will also cause the previous pass information to be deleted - _pass_itm_data = std::make_unique(); - - // setup row groups to be loaded for this pass - auto const row_group_start = _input_pass_row_group_offsets[_current_input_pass]; - auto const 
row_group_end = _input_pass_row_group_offsets[_current_input_pass + 1]; - auto const num_row_groups = row_group_end - row_group_start; - _pass_itm_data->row_groups.resize(num_row_groups); - std::copy(_file_itm_data.row_groups.begin() + row_group_start, - _file_itm_data.row_groups.begin() + row_group_end, - _pass_itm_data->row_groups.begin()); - - auto const num_passes = _input_pass_row_group_offsets.size() - 1; - CUDF_EXPECTS(_current_input_pass < num_passes, "Encountered an invalid read pass index"); - - auto const chunks_per_rowgroup = _input_columns.size(); - auto const num_chunks = chunks_per_rowgroup * num_row_groups; - - auto chunk_start = _file_itm_data.chunks.begin() + (row_group_start * chunks_per_rowgroup); - auto chunk_end = _file_itm_data.chunks.begin() + (row_group_end * chunks_per_rowgroup); - - _pass_itm_data->chunks = - cudf::detail::hostdevice_vector(num_chunks, _stream); - std::copy(chunk_start, chunk_end, _pass_itm_data->chunks.begin()); - - // adjust skip_rows and num_rows by what's available in the row groups we are processing - if (num_passes == 1) { - _pass_itm_data->skip_rows = _file_itm_data.global_skip_rows; - _pass_itm_data->num_rows = _file_itm_data.global_num_rows; - } else { - auto const global_start_row = _file_itm_data.global_skip_rows; - auto const global_end_row = global_start_row + _file_itm_data.global_num_rows; - auto const start_row = std::max(_input_pass_row_count[_current_input_pass], global_start_row); - auto const end_row = std::min(_input_pass_row_count[_current_input_pass + 1], global_end_row); - - // skip_rows is always global in the sense that it is relative to the first row of - // everything we will be reading, regardless of what pass we are on. - // num_rows is how many rows we are reading this pass. - _pass_itm_data->skip_rows = global_start_row + _input_pass_row_count[_current_input_pass]; - _pass_itm_data->num_rows = end_row - start_row; - } -} - void reader::impl::load_and_decompress_data() { // This function should never be called if `num_rows == 0`. 
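The functions removed in this hunk (load_global_chunk_info, compute_input_pass_row_group_info, setup_pass) implement chunked reading by splitting the selected row groups into passes that respect the input read limit: row-group byte sizes are accumulated until adding the next group would exceed the limit, and every pass keeps at least one row group even if it alone is over the limit. A self-contained sketch of that greedy split, assuming only a vector of per-row-group byte sizes and omitting the parallel row-count bookkeeping, could look like this (split_into_passes is a hypothetical name):

// Sketch of the pass-splitting loop from compute_input_pass_row_group_info.
#include <cstddef>
#include <vector>

std::vector<std::size_t> split_into_passes(std::vector<std::size_t> const& rg_byte_sizes,
                                           std::size_t read_limit)
{
  std::vector<std::size_t> pass_offsets{0};  // indices into rg_byte_sizes
  if (read_limit == 0) {                     // no limit: read everything in a single pass
    pass_offsets.push_back(rg_byte_sizes.size());
    return pass_offsets;
  }
  std::size_t cur_bytes = 0;
  std::size_t cur_start = 0;
  for (std::size_t i = 0; i < rg_byte_sizes.size(); ++i) {
    if (cur_bytes + rg_byte_sizes[i] >= read_limit) {
      if (cur_start == i) {  // a single row group exceeds the limit: it forms its own pass
        pass_offsets.push_back(i + 1);
        cur_start = i + 1;
        cur_bytes = 0;
      } else {               // end the pass before this row group
        pass_offsets.push_back(i);
        cur_start = i;
        cur_bytes = rg_byte_sizes[i];
      }
    } else {
      cur_bytes += rg_byte_sizes[i];
    }
  }
  if (pass_offsets.back() != rg_byte_sizes.size()) { pass_offsets.push_back(rg_byte_sizes.size()); }
  return pass_offsets;
}

For example, sizes {100, 500, 200} with a 600-byte limit yield offsets {0, 1, 2, 3}: one pass per row group, since each addition would cross the limit but a pass must still contain at least one group.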
@@ -970,15 +757,16 @@ void reader::impl::load_and_decompress_data() // Process dataset chunk pages into output columns auto const total_pages = count_page_headers(chunks, _stream); if (total_pages <= 0) { return; } - pages = cudf::detail::hostdevice_vector(total_pages, total_pages, _stream); + pages = cudf::detail::hostdevice_vector(total_pages, total_pages, _stream); // decoding of column/page information _pass_itm_data->level_type_size = decode_page_headers(chunks, pages, _stream); + pages.device_to_host_sync(_stream); if (has_compressed_data) { decomp_page_data = decompress_page_data(chunks, pages, _stream); // Free compressed data for (size_t c = 0; c < chunks.size(); c++) { - if (chunks[c].codec != parquet::Compression::UNCOMPRESSED) { raw_page_data[c].reset(); } + if (chunks[c].codec != Compression::UNCOMPRESSED) { raw_page_data[c].reset(); } } } @@ -998,7 +786,6 @@ void reader::impl::load_and_decompress_data() // std::vector output_info = build_output_column_info(); // the following two allocate functions modify the page data - pages.device_to_host_sync(_stream); { // nesting information (sizes, etc) stored -per page- // note : even for flat schemas, we allocate 1 level of "nesting" info @@ -1019,14 +806,13 @@ struct cumulative_row_info { }; #if defined(PREPROCESS_DEBUG) -void print_pages(cudf::detail::hostdevice_vector& pages, - rmm::cuda_stream_view _stream) +void print_pages(cudf::detail::hostdevice_vector& pages, rmm::cuda_stream_view _stream) { pages.device_to_host_sync(_stream); for (size_t idx = 0; idx < pages.size(); idx++) { auto const& p = pages[idx]; // skip dictionary pages - if (p.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) { continue; } + if (p.flags & PAGEINFO_FLAGS_DICTIONARY) { continue; } printf( "P(%lu, s:%d): chunk_row(%d), num_rows(%d), skipped_values(%d), skipped_leaf_values(%d), " "str_bytes(%d)\n", @@ -1039,372 +825,19 @@ void print_pages(cudf::detail::hostdevice_vector& pages, p.str_bytes); } } - -void print_cumulative_page_info(cudf::detail::hostdevice_vector& pages, - rmm::device_uvector const& page_index, - rmm::device_uvector const& c_info, - rmm::cuda_stream_view stream) -{ - pages.device_to_host_sync(stream); - - printf("------------\nCumulative sizes by page\n"); - - std::vector schemas(pages.size()); - std::vector h_page_index(pages.size()); - CUDF_CUDA_TRY(cudaMemcpy( - h_page_index.data(), page_index.data(), sizeof(int) * pages.size(), cudaMemcpyDefault)); - std::vector h_cinfo(pages.size()); - CUDF_CUDA_TRY(cudaMemcpy( - h_cinfo.data(), c_info.data(), sizeof(cumulative_row_info) * pages.size(), cudaMemcpyDefault)); - auto schema_iter = cudf::detail::make_counting_transform_iterator( - 0, [&](size_type i) { return pages[h_page_index[i]].src_col_schema; }); - thrust::copy(thrust::seq, schema_iter, schema_iter + pages.size(), schemas.begin()); - auto last = thrust::unique(thrust::seq, schemas.begin(), schemas.end()); - schemas.resize(last - schemas.begin()); - printf("Num schemas: %lu\n", schemas.size()); - - for (size_t idx = 0; idx < schemas.size(); idx++) { - printf("Schema %d\n", schemas[idx]); - for (size_t pidx = 0; pidx < pages.size(); pidx++) { - auto const& page = pages[h_page_index[pidx]]; - if (page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY || page.src_col_schema != schemas[idx]) { - continue; - } - printf("\tP: {%lu, %lu}\n", h_cinfo[pidx].row_count, h_cinfo[pidx].size_bytes); - } - } -} - -void print_cumulative_row_info( - host_span sizes, - std::string const& label, - std::optional> splits = std::nullopt) -{ - if (splits.has_value()) { - 
printf("------------\nSplits\n"); - for (size_t idx = 0; idx < splits->size(); idx++) { - printf("{%lu, %lu}\n", splits.value()[idx].skip_rows, splits.value()[idx].num_rows); - } - } - - printf("------------\nCumulative sizes %s\n", label.c_str()); - for (size_t idx = 0; idx < sizes.size(); idx++) { - printf("{%lu, %lu, %d}", sizes[idx].row_count, sizes[idx].size_bytes, sizes[idx].key); - if (splits.has_value()) { - // if we have a split at this row count and this is the last instance of this row count - auto start = thrust::make_transform_iterator( - splits->begin(), [](gpu::chunk_read_info const& i) { return i.skip_rows; }); - auto end = start + splits->size(); - auto split = std::find(start, end, sizes[idx].row_count); - auto const split_index = [&]() -> int { - if (split != end && - ((idx == sizes.size() - 1) || (sizes[idx + 1].row_count > sizes[idx].row_count))) { - return static_cast(std::distance(start, split)); - } - return idx == 0 ? 0 : -1; - }(); - if (split_index >= 0) { - printf(" <-- split {%lu, %lu}", - splits.value()[split_index].skip_rows, - splits.value()[split_index].num_rows); - } - } - printf("\n"); - } -} #endif // PREPROCESS_DEBUG -/** - * @brief Functor which reduces two cumulative_row_info structs of the same key. - */ -struct cumulative_row_sum { - cumulative_row_info operator() - __device__(cumulative_row_info const& a, cumulative_row_info const& b) const - { - return cumulative_row_info{a.row_count + b.row_count, a.size_bytes + b.size_bytes, a.key}; - } -}; - -/** - * @brief Functor which computes the total data size for a given type of cudf column. - * - * In the case of strings, the return size does not include the chars themselves. That - * information is tracked separately (see PageInfo::str_bytes). - */ -struct row_size_functor { - __device__ size_t validity_size(size_t num_rows, bool nullable) - { - return nullable ? (cudf::util::div_rounding_up_safe(num_rows, size_t{32}) * 4) : 0; - } - - template - __device__ size_t operator()(size_t num_rows, bool nullable) - { - auto const element_size = sizeof(device_storage_type_t); - return (element_size * num_rows) + validity_size(num_rows, nullable); - } -}; - -template <> -__device__ size_t row_size_functor::operator()(size_t num_rows, bool nullable) -{ - auto const offset_size = sizeof(size_type); - // NOTE: Adding the + 1 offset here isn't strictly correct. There will only be 1 extra offset - // for the entire column, whereas this is adding an extra offset per page. So we will get a - // small over-estimate of the real size of the order : # of pages * 4 bytes. It seems better - // to overestimate size somewhat than to underestimate it and potentially generate chunks - // that are too large. - return (offset_size * (num_rows + 1)) + validity_size(num_rows, nullable); -} - -template <> -__device__ size_t row_size_functor::operator()(size_t num_rows, bool nullable) -{ - return validity_size(num_rows, nullable); -} - -template <> -__device__ size_t row_size_functor::operator()(size_t num_rows, bool nullable) -{ - // only returns the size of offsets and validity. the size of the actual string chars - // is tracked separately. - auto const offset_size = sizeof(size_type); - // see note about offsets in the list_view template. - return (offset_size * (num_rows + 1)) + validity_size(num_rows, nullable); -} - -/** - * @brief Functor which computes the total output cudf data size for all of - * the data in this page. - * - * Sums across all nesting levels. 
- */ -struct get_cumulative_row_info { - gpu::PageInfo const* const pages; - - __device__ cumulative_row_info operator()(size_type index) - { - auto const& page = pages[index]; - if (page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) { - return cumulative_row_info{0, 0, page.src_col_schema}; - } - - // total nested size, not counting string data - auto iter = - cudf::detail::make_counting_transform_iterator(0, [page, index] __device__(size_type i) { - auto const& pni = page.nesting[i]; - return cudf::type_dispatcher( - data_type{pni.type}, row_size_functor{}, pni.size, pni.nullable); - }); - - size_t const row_count = static_cast(page.nesting[0].size); - return { - row_count, - thrust::reduce(thrust::seq, iter, iter + page.num_output_nesting_levels) + page.str_bytes, - page.src_col_schema}; - } -}; - -/** - * @brief Functor which computes the effective size of all input columns by page. - * - * For a given row, we want to find the cost of all pages for all columns involved - * in loading up to that row. The complication here is that not all pages are the - * same size between columns. Example: - * - * page row counts - * Column A: 0 <----> 100 <----> 200 - * Column B: 0 <---------------> 200 <--------> 400 - | - * if we decide to split at row 100, we don't really know the actual amount of bytes in column B - * at that point. So we have to proceed as if we are taking the bytes from all 200 rows of that - * page. Essentially, a conservative over-estimate of the real size. - */ -struct row_total_size { - cumulative_row_info const* c_info; - size_type const* key_offsets; - size_t num_keys; - - __device__ cumulative_row_info operator()(cumulative_row_info const& i) - { - // sum sizes for each input column at this row - size_t sum = 0; - for (int idx = 0; idx < num_keys; idx++) { - auto const start = key_offsets[idx]; - auto const end = key_offsets[idx + 1]; - auto iter = cudf::detail::make_counting_transform_iterator( - 0, [&] __device__(size_type i) { return c_info[i].row_count; }); - auto const page_index = - thrust::lower_bound(thrust::seq, iter + start, iter + end, i.row_count) - iter; - sum += c_info[page_index].size_bytes; - } - return {i.row_count, sum, i.key}; - } -}; - -/** - * @brief Given a vector of cumulative {row_count, byte_size} pairs and a chunk read - * limit, determine the set of splits. - * - * @param sizes Vector of cumulative {row_count, byte_size} pairs - * @param num_rows Total number of rows to read - * @param chunk_read_limit Limit on total number of bytes to be returned per read, for all columns - */ -std::vector find_splits(std::vector const& sizes, - size_t num_rows, - size_t chunk_read_limit) -{ - // now we have an array of {row_count, real output bytes}. just walk through it and generate - // splits. - // TODO: come up with a clever way to do this entirely in parallel. For now, as long as batch - // sizes are reasonably large, this shouldn't iterate too many times - std::vector splits; - { - size_t cur_pos = 0; - size_t cur_cumulative_size = 0; - size_t cur_row_count = 0; - auto start = thrust::make_transform_iterator(sizes.begin(), [&](cumulative_row_info const& i) { - return i.size_bytes - cur_cumulative_size; - }); - auto end = start + sizes.size(); - while (cur_row_count < num_rows) { - int64_t split_pos = - thrust::lower_bound(thrust::seq, start + cur_pos, end, chunk_read_limit) - start; - - // if we're past the end, or if the returned bucket is > than the chunk_read_limit, move back - // one. 
- if (static_cast(split_pos) >= sizes.size() || - (sizes[split_pos].size_bytes - cur_cumulative_size > chunk_read_limit)) { - split_pos--; - } - - // best-try. if we can't find something that'll fit, we have to go bigger. we're doing this in - // a loop because all of the cumulative sizes for all the pages are sorted into one big list. - // so if we had two columns, both of which had an entry {1000, 10000}, that entry would be in - // the list twice. so we have to iterate until we skip past all of them. The idea is that we - // either do this, or we have to call unique() on the input first. - while (split_pos < (static_cast(sizes.size()) - 1) && - (split_pos < 0 || sizes[split_pos].row_count == cur_row_count)) { - split_pos++; - } - - auto const start_row = cur_row_count; - cur_row_count = sizes[split_pos].row_count; - splits.push_back(gpu::chunk_read_info{start_row, cur_row_count - start_row}); - cur_pos = split_pos; - cur_cumulative_size = sizes[split_pos].size_bytes; - } - } - // print_cumulative_row_info(sizes, "adjusted", splits); - - return splits; -} - -/** - * @brief Given a set of pages that have had their sizes computed by nesting level and - * a limit on total read size, generate a set of {skip_rows, num_rows} pairs representing - * a set of reads that will generate output columns of total size <= `chunk_read_limit` bytes. - * - * @param pages All pages in the file - * @param id Additional intermediate information required to process the pages - * @param num_rows Total number of rows to read - * @param chunk_read_limit Limit on total number of bytes to be returned per read, for all columns - * @param stream CUDA stream to use - */ -std::vector compute_splits( - cudf::detail::hostdevice_vector& pages, - gpu::pass_intermediate_data const& id, - size_t num_rows, - size_t chunk_read_limit, - rmm::cuda_stream_view stream) -{ - auto const& page_keys = id.page_keys; - auto const& page_index = id.page_index; - - // generate cumulative row counts and sizes - rmm::device_uvector c_info(page_keys.size(), stream); - // convert PageInfo to cumulative_row_info - auto page_input = thrust::make_transform_iterator(page_index.begin(), - get_cumulative_row_info{pages.device_ptr()}); - thrust::inclusive_scan_by_key(rmm::exec_policy(stream), - page_keys.begin(), - page_keys.end(), - page_input, - c_info.begin(), - thrust::equal_to{}, - cumulative_row_sum{}); - // print_cumulative_page_info(pages, page_index, c_info, stream); - - // sort by row count - rmm::device_uvector c_info_sorted{c_info, stream}; - thrust::sort(rmm::exec_policy(stream), - c_info_sorted.begin(), - c_info_sorted.end(), - [] __device__(cumulative_row_info const& a, cumulative_row_info const& b) { - return a.row_count < b.row_count; - }); - - // std::vector h_c_info_sorted(c_info_sorted.size()); - // CUDF_CUDA_TRY(cudaMemcpy(h_c_info_sorted.data(), - // c_info_sorted.data(), - // sizeof(cumulative_row_info) * c_info_sorted.size(), - // cudaMemcpyDefault)); - // print_cumulative_row_info(h_c_info_sorted, "raw"); - - // generate key offsets (offsets to the start of each partition of keys). 
worst case is 1 page per - // key - rmm::device_uvector key_offsets(page_keys.size() + 1, stream); - auto const key_offsets_end = thrust::reduce_by_key(rmm::exec_policy(stream), - page_keys.begin(), - page_keys.end(), - thrust::make_constant_iterator(1), - thrust::make_discard_iterator(), - key_offsets.begin()) - .second; - size_t const num_unique_keys = key_offsets_end - key_offsets.begin(); - thrust::exclusive_scan( - rmm::exec_policy(stream), key_offsets.begin(), key_offsets.end(), key_offsets.begin()); - - // adjust the cumulative info such that for each row count, the size includes any pages that span - // that row count. this is so that if we have this case: - // page row counts - // Column A: 0 <----> 100 <----> 200 - // Column B: 0 <---------------> 200 <--------> 400 - // | - // if we decide to split at row 100, we don't really know the actual amount of bytes in column B - // at that point. So we have to proceed as if we are taking the bytes from all 200 rows of that - // page. - // - rmm::device_uvector aggregated_info(c_info.size(), stream); - thrust::transform(rmm::exec_policy(stream), - c_info_sorted.begin(), - c_info_sorted.end(), - aggregated_info.begin(), - row_total_size{c_info.data(), key_offsets.data(), num_unique_keys}); - - // bring back to the cpu - std::vector h_aggregated_info(aggregated_info.size()); - CUDF_CUDA_TRY(cudaMemcpyAsync(h_aggregated_info.data(), - aggregated_info.data(), - sizeof(cumulative_row_info) * c_info.size(), - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); - - return find_splits(h_aggregated_info, num_rows, chunk_read_limit); -} - struct get_page_chunk_idx { - __device__ size_type operator()(gpu::PageInfo const& page) { return page.chunk_idx; } + __device__ size_type operator()(PageInfo const& page) { return page.chunk_idx; } }; struct get_page_num_rows { - __device__ size_type operator()(gpu::PageInfo const& page) { return page.num_rows; } + __device__ size_type operator()(PageInfo const& page) { return page.num_rows; } }; struct get_page_column_index { - gpu::ColumnChunkDesc const* chunks; - __device__ size_type operator()(gpu::PageInfo const& page) + ColumnChunkDesc const* chunks; + __device__ size_type operator()(PageInfo const& page) { return chunks[page.chunk_idx].src_col_index; } @@ -1441,7 +874,7 @@ struct get_page_nesting_size { input_col_info const* const input_cols; size_type const max_depth; size_t const num_pages; - gpu::PageInfo const* const pages; + PageInfo const* const pages; int const* page_indices; __device__ size_type operator()(size_t index) const @@ -1450,7 +883,7 @@ struct get_page_nesting_size { auto const& page = pages[page_indices[indices.page_idx]]; if (page.src_col_schema != input_cols[indices.col_idx].schema_idx || - page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY || + page.flags & PAGEINFO_FLAGS_DICTIONARY || indices.depth_idx >= input_cols[indices.col_idx].nesting_depth) { return 0; } @@ -1468,7 +901,7 @@ struct get_reduction_key { * @brief Writes to the chunk_row field of the PageInfo struct. */ struct chunk_row_output_iter { - gpu::PageInfo* p; + PageInfo* p; using value_type = size_type; using difference_type = size_type; using pointer = size_type*; @@ -1490,7 +923,7 @@ struct chunk_row_output_iter { * @brief Writes to the page_start_value field of the PageNestingInfo struct, keyed by schema. 
*/ struct start_offset_output_iterator { - gpu::PageInfo const* pages; + PageInfo const* pages; int const* page_indices; size_t cur_index; input_col_info const* input_cols; @@ -1529,9 +962,9 @@ struct start_offset_output_iterator { { auto const indices = reduction_indices{index, max_depth, num_pages}; - gpu::PageInfo const& p = pages[page_indices[indices.page_idx]]; + PageInfo const& p = pages[page_indices[indices.page_idx]]; if (p.src_col_schema != input_cols[indices.col_idx].schema_idx || - p.flags & gpu::PAGEINFO_FLAGS_DICTIONARY || + p.flags & PAGEINFO_FLAGS_DICTIONARY || indices.depth_idx >= input_cols[indices.col_idx].nesting_depth) { return empty; } @@ -1540,15 +973,15 @@ struct start_offset_output_iterator { }; struct flat_column_num_rows { - gpu::PageInfo const* pages; - gpu::ColumnChunkDesc const* chunks; + PageInfo const* pages; + ColumnChunkDesc const* chunks; __device__ size_type operator()(size_type pindex) const { - gpu::PageInfo const& page = pages[pindex]; + PageInfo const& page = pages[pindex]; // ignore dictionary pages and pages belonging to any column containing repetition (lists) - if ((page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) || - (chunks[page.chunk_idx].max_level[gpu::level_type::REPETITION] > 0)) { + if ((page.flags & PAGEINFO_FLAGS_DICTIONARY) || + (chunks[page.chunk_idx].max_level[level_type::REPETITION] > 0)) { return 0; } return page.num_rows; @@ -1581,8 +1014,8 @@ struct row_counts_different { * @param expected_row_count Expected row count, if applicable * @param stream CUDA stream used for device memory operations and kernel launches */ -void detect_malformed_pages(cudf::detail::hostdevice_vector& pages, - cudf::detail::hostdevice_vector const& chunks, +void detect_malformed_pages(cudf::detail::hostdevice_vector& pages, + cudf::detail::hostdevice_vector const& chunks, device_span page_keys, device_span page_index, std::optional expected_row_count, @@ -1631,23 +1064,21 @@ void detect_malformed_pages(cudf::detail::hostdevice_vector& page } struct page_to_string_size { - gpu::PageInfo* pages; - gpu::ColumnChunkDesc const* chunks; + PageInfo* pages; + ColumnChunkDesc const* chunks; __device__ size_t operator()(size_type page_idx) const { auto const page = pages[page_idx]; auto const chunk = chunks[page.chunk_idx]; - if (not is_string_col(chunk) || (page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) != 0) { - return 0; - } + if (not is_string_col(chunk) || (page.flags & PAGEINFO_FLAGS_DICTIONARY) != 0) { return 0; } return pages[page_idx].str_bytes; } }; struct page_offset_output_iter { - gpu::PageInfo* p; + PageInfo* p; size_type const* index; using value_type = size_type; @@ -1738,7 +1169,7 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re cols = &out_buf.children; // if this has a list parent, we have to get column sizes from the - // data computed during gpu::ComputePageSizes + // data computed during ComputePageSizes if (out_buf.user_data & PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT) { has_lists = true; break; @@ -1749,7 +1180,7 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re // generate string dict indices if necessary { - auto is_dict_chunk = [](gpu::ColumnChunkDesc const& chunk) { + auto is_dict_chunk = [](ColumnChunkDesc const& chunk) { return (chunk.data_type & 0x7) == BYTE_ARRAY && chunk.num_dict_pages > 0; }; @@ -1785,7 +1216,7 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re if (total_str_dict_indexes > 0) { chunks.host_to_device_async(_stream); - 
gpu::BuildStringDictionaryIndex(chunks.device_ptr(), chunks.size(), _stream); + BuildStringDictionaryIndex(chunks.device_ptr(), chunks.size(), _stream); } } @@ -1800,14 +1231,14 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re // if: // - user has passed custom row bounds // - we will be doing a chunked read - gpu::ComputePageSizes(pages, - chunks, - 0, // 0-max size_t. process all possible rows - std::numeric_limits::max(), - true, // compute num_rows - chunk_read_limit > 0, // compute string sizes - _pass_itm_data->level_type_size, - _stream); + ComputePageSizes(pages, + chunks, + 0, // 0-max size_t. process all possible rows + std::numeric_limits::max(), + true, // compute num_rows + chunk_read_limit > 0, // compute string sizes + _pass_itm_data->level_type_size, + _stream); // computes: // PageInfo::chunk_row (the absolute start row index) for all pages @@ -1831,12 +1262,8 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re _pass_itm_data->page_keys = std::move(page_keys); _pass_itm_data->page_index = std::move(page_index); - // compute splits if necessary. otherwise return a single split representing - // the whole file. - _pass_itm_data->output_chunk_read_info = - _output_chunk_read_limit > 0 - ? compute_splits(pages, *_pass_itm_data, num_rows, chunk_read_limit, _stream) - : std::vector{{skip_rows, num_rows}}; + // compute splits for the pass + compute_splits_for_pass(); } void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses_custom_row_bounds) @@ -1853,14 +1280,14 @@ void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses // respect the user bounds. It is only necessary to do this second pass if uses_custom_row_bounds // is set (if the user has specified artificial bounds). 
if (uses_custom_row_bounds) { - gpu::ComputePageSizes(pages, - chunks, - skip_rows, - num_rows, - false, // num_rows is already computed - false, // no need to compute string sizes - _pass_itm_data->level_type_size, - _stream); + ComputePageSizes(pages, + chunks, + skip_rows, + num_rows, + false, // num_rows is already computed + false, // no need to compute string sizes + _pass_itm_data->level_type_size, + _stream); // print_pages(pages, _stream); } @@ -1879,7 +1306,7 @@ void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses cols = &out_buf.children; // if this has a list parent, we have to get column sizes from the - // data computed during gpu::ComputePageSizes + // data computed during ComputePageSizes if (out_buf.user_data & PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT) { has_lists = true; } @@ -1989,7 +1416,7 @@ std::vector reader::impl::calculate_page_string_offsets() page_index.begin(), page_to_string_size{pages.device_ptr(), chunks.device_ptr()}); // do scan by key to calculate string offsets for each page - thrust::exclusive_scan_by_key(rmm::exec_policy(_stream), + thrust::exclusive_scan_by_key(rmm::exec_policy_nosync(_stream), page_keys.begin(), page_keys.end(), val_iter, @@ -1997,7 +1424,7 @@ std::vector reader::impl::calculate_page_string_offsets() // now sum up page sizes rmm::device_uvector reduce_keys(col_sizes.size(), _stream); - thrust::reduce_by_key(rmm::exec_policy(_stream), + thrust::reduce_by_key(rmm::exec_policy_nosync(_stream), page_keys.begin(), page_keys.end(), val_iter, @@ -2014,4 +1441,4 @@ std::vector reader::impl::calculate_page_string_offsets() return col_sizes; } -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/rle_stream.cuh b/cpp/src/io/parquet/rle_stream.cuh index 2545a074a38..799d6d9fd64 100644 --- a/cpp/src/io/parquet/rle_stream.cuh +++ b/cpp/src/io/parquet/rle_stream.cuh @@ -20,7 +20,7 @@ #include #include -namespace cudf::io::parquet::gpu { +namespace cudf::io::parquet::detail { template constexpr int rle_stream_required_run_buffer_size() @@ -362,4 +362,4 @@ struct rle_stream { } }; -} // namespace cudf::io::parquet::gpu +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index a124f352ee4..c2b10e09b1a 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -54,12 +54,9 @@ #include #include -namespace cudf { -namespace io { -namespace detail { -namespace parquet { -using namespace cudf::io::parquet; -using namespace cudf::io; +namespace cudf::io::parquet::detail { + +using namespace cudf::io::detail; struct aggregate_writer_metadata { aggregate_writer_metadata(host_span partitions, @@ -185,13 +182,13 @@ namespace { * @param compression The compression type * @return The supported Parquet compression */ -parquet::Compression to_parquet_compression(compression_type compression) +Compression to_parquet_compression(compression_type compression) { switch (compression) { case compression_type::AUTO: - case compression_type::SNAPPY: return parquet::Compression::SNAPPY; - case compression_type::ZSTD: return parquet::Compression::ZSTD; - case compression_type::NONE: return parquet::Compression::UNCOMPRESSED; + case compression_type::SNAPPY: return Compression::SNAPPY; + case compression_type::ZSTD: return Compression::ZSTD; + case compression_type::NONE: return Compression::UNCOMPRESSED; default: CUDF_FAIL("Unsupported compression type"); } } @@ -206,7 +203,7 @@ 
void update_chunk_encodings(std::vector& encodings, uint32_t enc_mask) { for (uint8_t enc = 0; enc < static_cast(Encoding::NUM_ENCODINGS); enc++) { auto const enc_enum = static_cast(enc); - if ((enc_mask & gpu::encoding_to_mask(enc_enum)) != 0) { encodings.push_back(enc_enum); } + if ((enc_mask & encoding_to_mask(enc_enum)) != 0) { encodings.push_back(enc_enum); } } } @@ -281,12 +278,14 @@ struct leaf_schema_fn { cudf::detail::LinkedColPtr const& col; column_in_metadata const& col_meta; bool timestamp_is_int96; + bool timestamp_is_utc; template std::enable_if_t, void> operator()() { col_schema.type = Type::BOOLEAN; col_schema.stats_dtype = statistics_dtype::dtype_bool; + // BOOLEAN needs no converted or logical type } template @@ -295,6 +294,7 @@ struct leaf_schema_fn { col_schema.type = Type::INT32; col_schema.converted_type = ConvertedType::INT_8; col_schema.stats_dtype = statistics_dtype::dtype_int8; + col_schema.logical_type = LogicalType{IntType{8, true}}; } template @@ -303,6 +303,7 @@ struct leaf_schema_fn { col_schema.type = Type::INT32; col_schema.converted_type = ConvertedType::INT_16; col_schema.stats_dtype = statistics_dtype::dtype_int16; + col_schema.logical_type = LogicalType{IntType{16, true}}; } template @@ -310,6 +311,7 @@ struct leaf_schema_fn { { col_schema.type = Type::INT32; col_schema.stats_dtype = statistics_dtype::dtype_int32; + // INT32 needs no converted or logical type } template @@ -317,6 +319,7 @@ struct leaf_schema_fn { { col_schema.type = Type::INT64; col_schema.stats_dtype = statistics_dtype::dtype_int64; + // INT64 needs no converted or logical type } template @@ -325,6 +328,7 @@ struct leaf_schema_fn { col_schema.type = Type::INT32; col_schema.converted_type = ConvertedType::UINT_8; col_schema.stats_dtype = statistics_dtype::dtype_int8; + col_schema.logical_type = LogicalType{IntType{8, false}}; } template @@ -333,6 +337,7 @@ struct leaf_schema_fn { col_schema.type = Type::INT32; col_schema.converted_type = ConvertedType::UINT_16; col_schema.stats_dtype = statistics_dtype::dtype_int16; + col_schema.logical_type = LogicalType{IntType{16, false}}; } template @@ -341,6 +346,7 @@ struct leaf_schema_fn { col_schema.type = Type::INT32; col_schema.converted_type = ConvertedType::UINT_32; col_schema.stats_dtype = statistics_dtype::dtype_int32; + col_schema.logical_type = LogicalType{IntType{32, false}}; } template @@ -349,6 +355,7 @@ struct leaf_schema_fn { col_schema.type = Type::INT64; col_schema.converted_type = ConvertedType::UINT_64; col_schema.stats_dtype = statistics_dtype::dtype_int64; + col_schema.logical_type = LogicalType{IntType{64, false}}; } template @@ -356,6 +363,7 @@ struct leaf_schema_fn { { col_schema.type = Type::FLOAT; col_schema.stats_dtype = statistics_dtype::dtype_float32; + // FLOAT needs no converted or logical type } template @@ -363,6 +371,7 @@ struct leaf_schema_fn { { col_schema.type = Type::DOUBLE; col_schema.stats_dtype = statistics_dtype::dtype_float64; + // DOUBLE needs no converted or logical type } template @@ -370,11 +379,12 @@ struct leaf_schema_fn { { col_schema.type = Type::BYTE_ARRAY; if (col_meta.is_enabled_output_as_binary()) { - col_schema.converted_type = ConvertedType::UNKNOWN; - col_schema.stats_dtype = statistics_dtype::dtype_byte_array; + col_schema.stats_dtype = statistics_dtype::dtype_byte_array; + // BYTE_ARRAY needs no converted or logical type } else { col_schema.converted_type = ConvertedType::UTF8; col_schema.stats_dtype = statistics_dtype::dtype_string; + col_schema.logical_type = 
LogicalType{LogicalType::STRING}; } } @@ -384,49 +394,55 @@ struct leaf_schema_fn { col_schema.type = Type::INT32; col_schema.converted_type = ConvertedType::DATE; col_schema.stats_dtype = statistics_dtype::dtype_int32; + col_schema.logical_type = LogicalType{LogicalType::DATE}; } template std::enable_if_t, void> operator()() { - col_schema.type = (timestamp_is_int96) ? Type::INT96 : Type::INT64; - col_schema.converted_type = - (timestamp_is_int96) ? ConvertedType::UNKNOWN : ConvertedType::TIMESTAMP_MILLIS; + col_schema.type = (timestamp_is_int96) ? Type::INT96 : Type::INT64; col_schema.stats_dtype = statistics_dtype::dtype_timestamp64; col_schema.ts_scale = 1000; + if (not timestamp_is_int96) { + col_schema.converted_type = ConvertedType::TIMESTAMP_MILLIS; + col_schema.logical_type = LogicalType{TimestampType{timestamp_is_utc, TimeUnit::MILLIS}}; + } } template std::enable_if_t, void> operator()() { - col_schema.type = (timestamp_is_int96) ? Type::INT96 : Type::INT64; - col_schema.converted_type = - (timestamp_is_int96) ? ConvertedType::UNKNOWN : ConvertedType::TIMESTAMP_MILLIS; + col_schema.type = (timestamp_is_int96) ? Type::INT96 : Type::INT64; col_schema.stats_dtype = statistics_dtype::dtype_timestamp64; + if (not timestamp_is_int96) { + col_schema.converted_type = ConvertedType::TIMESTAMP_MILLIS; + col_schema.logical_type = LogicalType{TimestampType{timestamp_is_utc, TimeUnit::MILLIS}}; + } } template std::enable_if_t, void> operator()() { - col_schema.type = (timestamp_is_int96) ? Type::INT96 : Type::INT64; - col_schema.converted_type = - (timestamp_is_int96) ? ConvertedType::UNKNOWN : ConvertedType::TIMESTAMP_MICROS; + col_schema.type = (timestamp_is_int96) ? Type::INT96 : Type::INT64; col_schema.stats_dtype = statistics_dtype::dtype_timestamp64; + if (not timestamp_is_int96) { + col_schema.converted_type = ConvertedType::TIMESTAMP_MICROS; + col_schema.logical_type = LogicalType{TimestampType{timestamp_is_utc, TimeUnit::MICROS}}; + } } template std::enable_if_t, void> operator()() { col_schema.type = (timestamp_is_int96) ? 
Type::INT96 : Type::INT64; - col_schema.converted_type = ConvertedType::UNKNOWN; + col_schema.converted_type = thrust::nullopt; col_schema.stats_dtype = statistics_dtype::dtype_timestamp64; if (timestamp_is_int96) { col_schema.ts_scale = -1000; // negative value indicates division by absolute value } // set logical type if it's not int96 else { - col_schema.logical_type.isset.TIMESTAMP = true; - col_schema.logical_type.TIMESTAMP.unit.isset.NANOS = true; + col_schema.logical_type = LogicalType{TimestampType{timestamp_is_utc, TimeUnit::NANOS}}; } } @@ -434,53 +450,48 @@ struct leaf_schema_fn { template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT32; - col_schema.converted_type = ConvertedType::TIME_MILLIS; - col_schema.stats_dtype = statistics_dtype::dtype_int32; - col_schema.ts_scale = 24 * 60 * 60 * 1000; - col_schema.logical_type.isset.TIME = true; - col_schema.logical_type.TIME.unit.isset.MILLIS = true; + col_schema.type = Type::INT32; + col_schema.converted_type = ConvertedType::TIME_MILLIS; + col_schema.stats_dtype = statistics_dtype::dtype_int32; + col_schema.ts_scale = 24 * 60 * 60 * 1000; + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; } template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT32; - col_schema.converted_type = ConvertedType::TIME_MILLIS; - col_schema.stats_dtype = statistics_dtype::dtype_int32; - col_schema.ts_scale = 1000; - col_schema.logical_type.isset.TIME = true; - col_schema.logical_type.TIME.unit.isset.MILLIS = true; + col_schema.type = Type::INT32; + col_schema.converted_type = ConvertedType::TIME_MILLIS; + col_schema.stats_dtype = statistics_dtype::dtype_int32; + col_schema.ts_scale = 1000; + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; } template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT32; - col_schema.converted_type = ConvertedType::TIME_MILLIS; - col_schema.stats_dtype = statistics_dtype::dtype_int32; - col_schema.logical_type.isset.TIME = true; - col_schema.logical_type.TIME.unit.isset.MILLIS = true; + col_schema.type = Type::INT32; + col_schema.converted_type = ConvertedType::TIME_MILLIS; + col_schema.stats_dtype = statistics_dtype::dtype_int32; + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}}; } template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT64; - col_schema.converted_type = ConvertedType::TIME_MICROS; - col_schema.stats_dtype = statistics_dtype::dtype_int64; - col_schema.logical_type.isset.TIME = true; - col_schema.logical_type.TIME.unit.isset.MICROS = true; + col_schema.type = Type::INT64; + col_schema.converted_type = ConvertedType::TIME_MICROS; + col_schema.stats_dtype = statistics_dtype::dtype_int64; + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MICROS}}; } // unsupported outside cudf for parquet 1.0. 
template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT64; - col_schema.stats_dtype = statistics_dtype::dtype_int64; - col_schema.logical_type.isset.TIME = true; - col_schema.logical_type.TIME.unit.isset.NANOS = true; + col_schema.type = Type::INT64; + col_schema.stats_dtype = statistics_dtype::dtype_int64; + col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::NANOS}}; } template @@ -490,27 +501,32 @@ struct leaf_schema_fn { col_schema.type = Type::INT32; col_schema.stats_dtype = statistics_dtype::dtype_int32; col_schema.decimal_precision = MAX_DECIMAL32_PRECISION; + col_schema.logical_type = LogicalType{DecimalType{0, MAX_DECIMAL32_PRECISION}}; } else if (std::is_same_v) { col_schema.type = Type::INT64; col_schema.stats_dtype = statistics_dtype::dtype_decimal64; col_schema.decimal_precision = MAX_DECIMAL64_PRECISION; + col_schema.logical_type = LogicalType{DecimalType{0, MAX_DECIMAL64_PRECISION}}; } else if (std::is_same_v) { col_schema.type = Type::FIXED_LEN_BYTE_ARRAY; col_schema.type_length = sizeof(__int128_t); col_schema.stats_dtype = statistics_dtype::dtype_decimal128; col_schema.decimal_precision = MAX_DECIMAL128_PRECISION; + col_schema.logical_type = LogicalType{DecimalType{0, MAX_DECIMAL128_PRECISION}}; } else { CUDF_FAIL("Unsupported fixed point type for parquet writer"); } col_schema.converted_type = ConvertedType::DECIMAL; col_schema.decimal_scale = -col->type().scale(); // parquet and cudf disagree about scale signs + col_schema.logical_type->decimal_type->scale = -col->type().scale(); if (col_meta.is_decimal_precision_set()) { CUDF_EXPECTS(col_meta.get_decimal_precision() >= col_schema.decimal_scale, "Precision must be equal to or greater than scale!"); if (col_schema.type == Type::INT64 and col_meta.get_decimal_precision() < 10) { CUDF_LOG_WARN("Parquet writer: writing a decimal column with precision < 10 as int64"); } - col_schema.decimal_precision = col_meta.get_decimal_precision(); + col_schema.decimal_precision = col_meta.get_decimal_precision(); + col_schema.logical_type->decimal_type->precision = col_meta.get_decimal_precision(); } } @@ -552,7 +568,8 @@ std::vector construct_schema_tree( cudf::detail::LinkedColVector const& linked_columns, table_input_metadata& metadata, single_write_mode write_mode, - bool int96_timestamps) + bool int96_timestamps, + bool utc_timestamps) { std::vector schema; schema_tree_node root{}; @@ -596,7 +613,7 @@ std::vector construct_schema_tree( schema_tree_node col_schema{}; col_schema.type = Type::BYTE_ARRAY; - col_schema.converted_type = ConvertedType::UNKNOWN; + col_schema.converted_type = thrust::nullopt; col_schema.stats_dtype = statistics_dtype::dtype_byte_array; col_schema.repetition_type = col_nullable ? OPTIONAL : REQUIRED; col_schema.name = (schema[parent_idx].name == "list") ? "element" : col_meta.get_name(); @@ -724,8 +741,9 @@ std::vector construct_schema_tree( bool timestamp_is_int96 = int96_timestamps or col_meta.is_enabled_int96_timestamps(); - cudf::type_dispatcher(col->type(), - leaf_schema_fn{col_schema, col, col_meta, timestamp_is_int96}); + cudf::type_dispatcher( + col->type(), + leaf_schema_fn{col_schema, col, col_meta, timestamp_is_int96, utc_timestamps}); col_schema.repetition_type = col_nullable ? OPTIONAL : REQUIRED; col_schema.name = (schema[parent_idx].name == "list") ? 
"element" : col_meta.get_name(); @@ -761,11 +779,14 @@ struct parquet_column_view { std::vector const& schema_tree, rmm::cuda_stream_view stream); - [[nodiscard]] gpu::parquet_column_device_view get_device_view(rmm::cuda_stream_view stream) const; + [[nodiscard]] parquet_column_device_view get_device_view(rmm::cuda_stream_view stream) const; [[nodiscard]] column_view cudf_column_view() const { return cudf_col; } - [[nodiscard]] parquet::Type physical_type() const { return schema_node.type; } - [[nodiscard]] parquet::ConvertedType converted_type() const { return schema_node.converted_type; } + [[nodiscard]] Type physical_type() const { return schema_node.type; } + [[nodiscard]] ConvertedType converted_type() const + { + return schema_node.converted_type.value_or(UNKNOWN); + } std::vector const& get_path_in_schema() { return path_in_schema; } @@ -846,11 +867,11 @@ parquet_column_view::parquet_column_view(schema_tree_node const& schema_node, uint16_t max_rep_level = 0; curr_schema_node = schema_node; while (curr_schema_node.parent_idx != -1) { - if (curr_schema_node.repetition_type == parquet::REPEATED or - curr_schema_node.repetition_type == parquet::OPTIONAL) { + if (curr_schema_node.repetition_type == REPEATED or + curr_schema_node.repetition_type == OPTIONAL) { ++max_def_level; } - if (curr_schema_node.repetition_type == parquet::REPEATED) { ++max_rep_level; } + if (curr_schema_node.repetition_type == REPEATED) { ++max_rep_level; } curr_schema_node = schema_tree[curr_schema_node.parent_idx]; } CUDF_EXPECTS(max_def_level < 256, "Definition levels above 255 are not supported"); @@ -897,9 +918,9 @@ parquet_column_view::parquet_column_view(schema_tree_node const& schema_node, } } -gpu::parquet_column_device_view parquet_column_view::get_device_view(rmm::cuda_stream_view) const +parquet_column_device_view parquet_column_view::get_device_view(rmm::cuda_stream_view) const { - auto desc = gpu::parquet_column_device_view{}; // Zero out all fields + auto desc = parquet_column_device_view{}; // Zero out all fields desc.stats_dtype = schema_node.stats_dtype; desc.ts_scale = schema_node.ts_scale; @@ -931,8 +952,8 @@ gpu::parquet_column_device_view parquet_column_view::get_device_view(rmm::cuda_s * @param fragment_size Number of rows per fragment * @param stream CUDA stream used for device memory operations and kernel launches */ -void init_row_group_fragments(cudf::detail::hostdevice_2dvector& frag, - device_span col_desc, +void init_row_group_fragments(cudf::detail::hostdevice_2dvector& frag, + device_span col_desc, host_span partitions, device_span part_frag_offset, uint32_t fragment_size, @@ -940,7 +961,7 @@ void init_row_group_fragments(cudf::detail::hostdevice_2dvector frag, +void calculate_page_fragments(device_span frag, host_span frag_sizes, rmm::cuda_stream_view stream) { auto d_frag_sz = cudf::detail::make_device_uvector_async( frag_sizes, stream, rmm::mr::get_current_device_resource()); - gpu::CalculatePageFragments(frag, d_frag_sz, stream); + CalculatePageFragments(frag, d_frag_sz, stream); } /** @@ -972,13 +993,13 @@ void calculate_page_fragments(device_span frag, * @param stream CUDA stream used for device memory operations and kernel launches */ void gather_fragment_statistics(device_span frag_stats, - device_span frags, + device_span frags, bool int96_timestamps, rmm::cuda_stream_view stream) { rmm::device_uvector frag_stats_group(frag_stats.size(), stream); - gpu::InitFragmentStatistics(frag_stats_group, frags, stream); + InitFragmentStatistics(frag_stats_group, frags, stream); 
detail::calculate_group_statistics( frag_stats.data(), frag_stats_group.data(), frag_stats.size(), stream, int96_timestamps); stream.synchronize(); @@ -1008,8 +1029,8 @@ size_t max_compression_output_size(Compression codec, uint32_t compression_block return compress_max_output_chunk_size(to_nvcomp_compression_type(codec), compression_blocksize); } -auto init_page_sizes(hostdevice_2dvector& chunks, - device_span col_desc, +auto init_page_sizes(hostdevice_2dvector& chunks, + device_span col_desc, uint32_t num_columns, size_t max_page_size_bytes, size_type max_page_size_rows, @@ -1021,19 +1042,19 @@ auto init_page_sizes(hostdevice_2dvector& chunks, chunks.host_to_device_async(stream); // Calculate number of pages and store in respective chunks - gpu::InitEncoderPages(chunks, - {}, - {}, - {}, - col_desc, - num_columns, - max_page_size_bytes, - max_page_size_rows, - page_alignment(compression_codec), - write_v2_headers, - nullptr, - nullptr, - stream); + InitEncoderPages(chunks, + {}, + {}, + {}, + col_desc, + num_columns, + max_page_size_bytes, + max_page_size_rows, + page_alignment(compression_codec), + write_v2_headers, + nullptr, + nullptr, + stream); chunks.device_to_host_sync(stream); int num_pages = 0; @@ -1046,19 +1067,19 @@ auto init_page_sizes(hostdevice_2dvector& chunks, // Now that we know the number of pages, allocate an array to hold per page size and get it // populated cudf::detail::hostdevice_vector page_sizes(num_pages, stream); - gpu::InitEncoderPages(chunks, - {}, - page_sizes, - {}, - col_desc, - num_columns, - max_page_size_bytes, - max_page_size_rows, - page_alignment(compression_codec), - write_v2_headers, - nullptr, - nullptr, - stream); + InitEncoderPages(chunks, + {}, + page_sizes, + {}, + col_desc, + num_columns, + max_page_size_bytes, + max_page_size_rows, + page_alignment(compression_codec), + write_v2_headers, + nullptr, + nullptr, + stream); page_sizes.device_to_host_sync(stream); // Get per-page max compressed size @@ -1072,26 +1093,26 @@ auto init_page_sizes(hostdevice_2dvector& chunks, comp_page_sizes.host_to_device_async(stream); // Use per-page max compressed size to calculate chunk.compressed_size - gpu::InitEncoderPages(chunks, - {}, - {}, - comp_page_sizes, - col_desc, - num_columns, - max_page_size_bytes, - max_page_size_rows, - page_alignment(compression_codec), - write_v2_headers, - nullptr, - nullptr, - stream); + InitEncoderPages(chunks, + {}, + {}, + comp_page_sizes, + col_desc, + num_columns, + max_page_size_bytes, + max_page_size_rows, + page_alignment(compression_codec), + write_v2_headers, + nullptr, + nullptr, + stream); chunks.device_to_host_sync(stream); return comp_page_sizes; } size_t max_page_bytes(Compression compression, size_t max_page_size_bytes) { - if (compression == parquet::Compression::UNCOMPRESSED) { return max_page_size_bytes; } + if (compression == Compression::UNCOMPRESSED) { return max_page_size_bytes; } auto const ncomp_type = to_nvcomp_compression_type(compression); auto const nvcomp_limit = nvcomp::is_compression_disabled(ncomp_type) @@ -1104,9 +1125,9 @@ size_t max_page_bytes(Compression compression, size_t max_page_size_bytes) } std::pair>, std::vector>> -build_chunk_dictionaries(hostdevice_2dvector& chunks, - host_span col_desc, - device_2dspan frags, +build_chunk_dictionaries(hostdevice_2dvector& chunks, + host_span col_desc, + device_2dspan frags, Compression compression, dictionary_policy dict_policy, size_t max_dict_size, @@ -1130,7 +1151,7 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, } // Allocate 
slots for each chunk - std::vector> hash_maps_storage; + std::vector> hash_maps_storage; hash_maps_storage.reserve(h_chunks.size()); for (auto& chunk : h_chunks) { if (col_desc[chunk.col_desc_id].physical_type == Type::BOOLEAN || @@ -1149,8 +1170,8 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, chunks.host_to_device_async(stream); - gpu::initialize_chunk_hash_maps(chunks.device_view().flat_view(), stream); - gpu::populate_chunk_hash_maps(frags, stream); + initialize_chunk_hash_maps(chunks.device_view().flat_view(), stream); + populate_chunk_hash_maps(frags, stream); chunks.device_to_host_sync(stream); @@ -1197,8 +1218,8 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, chunk.dict_index = inserted_dict_index.data(); } chunks.host_to_device_async(stream); - gpu::collect_map_entries(chunks.device_view().flat_view(), stream); - gpu::get_dictionary_indices(frags, stream); + collect_map_entries(chunks.device_view().flat_view(), stream); + get_dictionary_indices(frags, stream); return std::pair(std::move(dict_data), std::move(dict_index)); } @@ -1221,9 +1242,9 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, * @param write_v2_headers True if version 2 page headers are to be written * @param stream CUDA stream used for device memory operations and kernel launches */ -void init_encoder_pages(hostdevice_2dvector& chunks, - device_span col_desc, - device_span pages, +void init_encoder_pages(hostdevice_2dvector& chunks, + device_span col_desc, + device_span pages, cudf::detail::hostdevice_vector& comp_page_sizes, statistics_chunk* page_stats, statistics_chunk* frag_stats, @@ -1286,8 +1307,8 @@ void init_encoder_pages(hostdevice_2dvector& chunks, * @param write_v2_headers True if V2 page headers should be written * @param stream CUDA stream used for device memory operations and kernel launches */ -void encode_pages(hostdevice_2dvector& chunks, - device_span pages, +void encode_pages(hostdevice_2dvector& chunks, + device_span pages, uint32_t pages_in_batch, uint32_t first_page_in_batch, uint32_t rowgroups_in_batch, @@ -1308,8 +1329,7 @@ void encode_pages(hostdevice_2dvector& chunks, ? device_span(page_stats + first_page_in_batch, pages_in_batch) : device_span(); - uint32_t max_comp_pages = - (compression != parquet::Compression::UNCOMPRESSED) ? pages_in_batch : 0; + uint32_t max_comp_pages = (compression != Compression::UNCOMPRESSED) ? 
pages_in_batch : 0; rmm::device_uvector> comp_in(max_comp_pages, stream); rmm::device_uvector> comp_out(max_comp_pages, stream); @@ -1319,9 +1339,9 @@ void encode_pages(hostdevice_2dvector& chunks, comp_res.end(), compression_result{0, compression_status::FAILURE}); - gpu::EncodePages(batch_pages, write_v2_headers, comp_in, comp_out, comp_res, stream); + EncodePages(batch_pages, write_v2_headers, comp_in, comp_out, comp_res, stream); switch (compression) { - case parquet::Compression::SNAPPY: + case Compression::SNAPPY: if (nvcomp::is_compression_disabled(nvcomp::compression_type::SNAPPY)) { gpu_snap(comp_in, comp_out, comp_res, stream); } else { @@ -1329,7 +1349,7 @@ void encode_pages(hostdevice_2dvector& chunks, nvcomp::compression_type::SNAPPY, comp_in, comp_out, comp_res, stream); } break; - case parquet::Compression::ZSTD: { + case Compression::ZSTD: { if (auto const reason = nvcomp::is_compression_disabled(nvcomp::compression_type::ZSTD); reason) { CUDF_FAIL("Compression error: " + reason.value()); @@ -1338,7 +1358,7 @@ void encode_pages(hostdevice_2dvector& chunks, break; } - case parquet::Compression::UNCOMPRESSED: break; + case Compression::UNCOMPRESSED: break; default: CUDF_FAIL("invalid compression type"); } @@ -1378,7 +1398,7 @@ void encode_pages(hostdevice_2dvector& chunks, * @param column_index_truncate_length maximum length of min or max values in column index, in bytes * @return Computed buffer size needed to encode the column index */ -size_t column_index_buffer_size(gpu::EncColumnChunk* ck, int32_t column_index_truncate_length) +size_t column_index_buffer_size(EncColumnChunk* ck, int32_t column_index_truncate_length) { // encoding the column index for a given chunk requires: // each list (4 of them) requires 6 bytes of overhead @@ -1450,6 +1470,7 @@ void fill_table_meta(std::unique_ptr const& table_meta) * @param max_dictionary_size Maximum dictionary size, in bytes * @param single_write_mode Flag to indicate that we are guaranteeing a single table write * @param int96_timestamps Flag to indicate if timestamps will be written as INT96 + * @param utc_timestamps Flag to indicate if timestamps are UTC * @param write_v2_headers True if V2 page headers are to be written * @param out_sink Sink for checking if device write is supported, should not be used to write any * data in this function @@ -1474,12 +1495,14 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, size_t max_dictionary_size, single_write_mode write_mode, bool int96_timestamps, + bool utc_timestamps, bool write_v2_headers, host_span const> out_sink, rmm::cuda_stream_view stream) { - auto vec = table_to_linked_columns(input); - auto schema_tree = construct_schema_tree(vec, table_meta, write_mode, int96_timestamps); + auto vec = table_to_linked_columns(input); + auto schema_tree = + construct_schema_tree(vec, table_meta, write_mode, int96_timestamps, utc_timestamps); // Construct parquet_column_views from the schema tree leaf nodes. 
std::vector parquet_columns; @@ -1499,8 +1522,8 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, std::vector this_table_schema(schema_tree.begin(), schema_tree.end()); // Initialize column description - cudf::detail::hostdevice_vector col_desc(parquet_columns.size(), - stream); + cudf::detail::hostdevice_vector col_desc(parquet_columns.size(), + stream); std::transform( parquet_columns.begin(), parquet_columns.end(), col_desc.host_ptr(), [&](auto const& pcol) { return pcol.get_device_view(stream); @@ -1576,7 +1599,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, auto d_part_frag_offset = cudf::detail::make_device_uvector_async( part_frag_offset, stream, rmm::mr::get_current_device_resource()); - cudf::detail::hostdevice_2dvector row_group_fragments( + cudf::detail::hostdevice_2dvector row_group_fragments( num_columns, num_fragments, stream); // Create table_device_view so that corresponding column_device_view data @@ -1588,7 +1611,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, if (num_fragments != 0) { // Move column info to device col_desc.host_to_device_async(stream); - leaf_column_views = create_leaf_column_device_views( + leaf_column_views = create_leaf_column_device_views( col_desc, *parent_column_table_device_view, stream); init_row_group_fragments(row_group_fragments, @@ -1662,7 +1685,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, // Initialize row groups and column chunks auto const num_chunks = num_rowgroups * num_columns; - hostdevice_2dvector chunks(num_rowgroups, num_columns, stream); + hostdevice_2dvector chunks(num_rowgroups, num_columns, stream); // total fragments per column (in case they are non-uniform) std::vector frags_per_column(num_columns, 0); @@ -1678,7 +1701,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, row_group.total_byte_size = 0; row_group.columns.resize(num_columns); for (int c = 0; c < num_columns; c++) { - gpu::EncColumnChunk& ck = chunks[r + first_rg_in_part[p]][c]; + EncColumnChunk& ck = chunks[r + first_rg_in_part[p]][c]; ck = {}; ck.col_desc = col_desc.device_ptr() + c; @@ -1700,7 +1723,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, return l + r.num_values; }); ck.plain_data_size = std::accumulate( - chunk_fragments.begin(), chunk_fragments.end(), 0, [](int sum, gpu::PageFragment frag) { + chunk_fragments.begin(), chunk_fragments.end(), 0, [](int sum, PageFragment frag) { return sum + frag.fragment_data_size; }); auto& column_chunk_meta = row_group.columns[c].meta_data; @@ -1731,7 +1754,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, frags_per_column.empty() ? 
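The chunk and fragment setup above relies on cudf's internal mirrored host/device containers: values are filled on the host, pushed with host_to_device_async before any kernel launch, and pulled back with device_to_host_sync once the kernels have run. A minimal sketch of that round trip, using only the calls visible in these hunks (the int element type and the include path are placeholders, not the writer's real types):

#include <rmm/cuda_stream_view.hpp>
#include <io/utilities/hostdevice_vector.hpp>  // internal cudf header; exact path is an assumption

void round_trip_sketch(rmm::cuda_stream_view stream)
{
  // rows x cols table mirrored on host and device
  cudf::detail::hostdevice_2dvector<int> tbl(4, 3, stream);

  tbl[0][0] = 42;                    // fill the host mirror via row-wise operator[]
  tbl.host_to_device_async(stream);  // copy host -> device before launching kernels

  // ... kernels would consume tbl.device_view() here ...

  tbl.device_to_host_sync(stream);   // copy device -> host and synchronize the stream
  int const round_tripped = tbl[0][0];
  (void)round_tripped;
}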
0 : frag_offsets.back() + frags_per_column.back(); rmm::device_uvector frag_stats(0, stream); - cudf::detail::hostdevice_vector page_fragments(total_frags, stream); + cudf::detail::hostdevice_vector page_fragments(total_frags, stream); // update fragments and/or prepare for fragment statistics calculation if necessary if (total_frags != 0) { @@ -1749,9 +1772,9 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, auto const& row_group = agg_meta->file(p).row_groups[global_r]; uint32_t const fragments_in_chunk = util::div_rounding_up_unsafe(row_group.num_rows, frag_size); - gpu::EncColumnChunk& ck = chunks[r + first_rg_in_part[p]][c]; - ck.fragments = page_fragments.device_ptr(frag_offset); - ck.first_fragment = frag_offset; + EncColumnChunk& ck = chunks[r + first_rg_in_part[p]][c]; + ck.fragments = page_fragments.device_ptr(frag_offset); + ck.first_fragment = frag_offset; // update the chunk pointer here for each fragment in chunk.fragments for (uint32_t i = 0; i < fragments_in_chunk; i++) { @@ -1817,8 +1840,8 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, size_t comp_rowgroup_size = 0; if (r < num_rowgroups) { for (int i = 0; i < num_columns; i++) { - gpu::EncColumnChunk* ck = &chunks[r][i]; - ck->first_page = num_pages; + EncColumnChunk* ck = &chunks[r][i]; + ck->first_page = num_pages; num_pages += ck->num_pages; pages_in_batch += ck->num_pages; rowgroup_size += ck->bfr_size; @@ -1850,7 +1873,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, } // Clear compressed buffer size if compression has been turned off - if (compression == parquet::Compression::UNCOMPRESSED) { max_comp_bfr_size = 0; } + if (compression == Compression::UNCOMPRESSED) { max_comp_bfr_size = 0; } // Initialize data pointers in batch uint32_t const num_stats_bfr = @@ -1864,7 +1887,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, stream); rmm::device_buffer col_idx_bfr(column_index_bfr_size, stream); - rmm::device_uvector pages(num_pages, stream); + rmm::device_uvector pages(num_pages, stream); // This contains stats for both the pages and the rowgroups. TODO: make them separate. 
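The total-fragment computation spanning the hunk above recovers the grand total as "last offset plus last count", with a guard for the empty case. A standalone sketch of that offsets-plus-total idiom in plain standard C++ (how the offsets themselves are produced is off-screen here; an exclusive scan is assumed):

#include <numeric>
#include <vector>

int offsets_plus_total_sketch()
{
  std::vector<int> counts{3, 1, 4};  // e.g. fragments per column
  std::vector<int> offsets(counts.size());
  std::exclusive_scan(counts.begin(), counts.end(), offsets.begin(), 0);  // {0, 3, 4}

  // total = last offset + last count, guarding the empty case as the writer does
  return counts.empty() ? 0 : offsets.back() + counts.back();  // 8
}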
rmm::device_uvector page_stats(num_stats_bfr, stream); @@ -1874,10 +1897,10 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, auto bfr_c = static_cast(comp_bfr.data()); for (auto j = 0; j < batch_list[b]; j++, r++) { for (auto i = 0; i < num_columns; i++) { - gpu::EncColumnChunk& ck = chunks[r][i]; - ck.uncompressed_bfr = bfr; - ck.compressed_bfr = bfr_c; - ck.column_index_blob = bfr_i; + EncColumnChunk& ck = chunks[r][i]; + ck.uncompressed_bfr = bfr; + ck.compressed_bfr = bfr_c; + ck.column_index_blob = bfr_i; bfr += ck.bfr_size; bfr_c += ck.compressed_size; if (stats_granularity == statistics_freq::STATISTICS_COLUMN) { @@ -1960,7 +1983,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, if (ck.ck_stat_size != 0) { std::vector const stats_blob = cudf::detail::make_std_vector_sync( device_span(dev_bfr, ck.ck_stat_size), stream); - cudf::io::parquet::CompactProtocolReader cp(stats_blob.data(), stats_blob.size()); + CompactProtocolReader cp(stats_blob.data(), stats_blob.size()); cp.read(&column_chunk_meta.statistics); need_sync = true; } @@ -2009,6 +2032,7 @@ writer::impl::impl(std::vector> sinks, _max_dictionary_size(options.get_max_dictionary_size()), _max_page_fragment_size(options.get_max_page_fragment_size()), _int96_timestamps(options.is_enabled_int96_timestamps()), + _utc_timestamps(options.is_enabled_utc_timestamps()), _write_v2_headers(options.is_enabled_write_v2_headers()), _column_index_truncate_length(options.get_column_index_truncate_length()), _kv_meta(options.get_key_value_metadata()), @@ -2037,6 +2061,7 @@ writer::impl::impl(std::vector> sinks, _max_dictionary_size(options.get_max_dictionary_size()), _max_page_fragment_size(options.get_max_page_fragment_size()), _int96_timestamps(options.is_enabled_int96_timestamps()), + _utc_timestamps(options.is_enabled_utc_timestamps()), _write_v2_headers(options.is_enabled_write_v2_headers()), _column_index_truncate_length(options.get_column_index_truncate_length()), _kv_meta(options.get_key_value_metadata()), @@ -2114,6 +2139,7 @@ void writer::impl::write(table_view const& input, std::vector co _max_dictionary_size, _single_write_mode, _int96_timestamps, + _utc_timestamps, _write_v2_headers, _out_sink, _stream); @@ -2142,8 +2168,8 @@ void writer::impl::write(table_view const& input, std::vector co void writer::impl::write_parquet_data_to_sink( std::unique_ptr& updated_agg_meta, - device_span pages, - host_2dspan chunks, + device_span pages, + host_2dspan chunks, host_span global_rowgroup_base, host_span first_rg_in_part, host_span batch_list, @@ -2209,7 +2235,7 @@ void writer::impl::write_parquet_data_to_sink( int const global_r = global_rowgroup_base[p] + r - first_rg_in_part[p]; auto const& row_group = _agg_meta->file(p).row_groups[global_r]; for (std::size_t i = 0; i < num_columns; i++) { - gpu::EncColumnChunk const& ck = chunks[r][i]; + EncColumnChunk const& ck = chunks[r][i]; auto const& column_chunk_meta = row_group.columns[i].meta_data; // start transfer of the column index @@ -2377,6 +2403,15 @@ std::unique_ptr> writer::merge_row_group_metadata( } } + // Remove any LogicalType::UNKNOWN annotations that were passed in as they can confuse + // column type inferencing. 
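Both writer::impl constructors above now capture the new flag via options.is_enabled_utc_timestamps() and thread it through write() into convert_table_to_parquet_data. A hedged caller-side sketch of enabling it; the builder setter name utc_timestamps(bool) is an assumption inferred from the getter and is not part of this diff:

#include <cudf/io/parquet.hpp>
#include <cudf/table/table_view.hpp>

void write_with_utc_annotation(cudf::table_view const& tbl)
{
  auto sink = cudf::io::sink_info{"out.parquet"};
  auto opts = cudf::io::parquet_writer_options::builder(sink, tbl)
                .utc_timestamps(true)  // hypothetical setter matching is_enabled_utc_timestamps()
                .build();
  cudf::io::write_parquet(opts);
}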
+ // See https://github.com/rapidsai/cudf/pull/14264#issuecomment-1778311615 + for (auto& se : md.schema) { + if (se.logical_type.has_value() && se.logical_type.value().type == LogicalType::UNKNOWN) { + se.logical_type = thrust::nullopt; + } + } + // Thrift-encode the resulting output file_header_s fhdr; file_ender_s fendr; @@ -2392,7 +2427,4 @@ std::unique_ptr> writer::merge_row_group_metadata( return std::make_unique>(std::move(output)); } -} // namespace parquet -} // namespace detail -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp index 89ef85ba2bd..3415205d179 100644 --- a/cpp/src/io/parquet/writer_impl.hpp +++ b/cpp/src/io/parquet/writer_impl.hpp @@ -38,15 +38,11 @@ #include #include -namespace cudf { -namespace io { -namespace detail { -namespace parquet { +namespace cudf::io::parquet::detail { + // Forward internal classes struct aggregate_writer_metadata; -using namespace cudf::io::parquet; -using namespace cudf::io; using cudf::detail::device_2dspan; using cudf::detail::host_2dspan; using cudf::detail::hostdevice_2dvector; @@ -66,7 +62,7 @@ class writer::impl { */ explicit impl(std::vector> sinks, parquet_writer_options const& options, - single_write_mode mode, + cudf::io::detail::single_write_mode mode, rmm::cuda_stream_view stream); /** @@ -79,7 +75,7 @@ class writer::impl { */ explicit impl(std::vector> sinks, chunked_parquet_writer_options const& options, - single_write_mode mode, + cudf::io::detail::single_write_mode mode, rmm::cuda_stream_view stream); /** @@ -139,8 +135,8 @@ class writer::impl { * @param[out] bounce_buffer Temporary host output buffer */ void write_parquet_data_to_sink(std::unique_ptr& updated_agg_meta, - device_span pages, - host_2dspan chunks, + device_span pages, + host_2dspan chunks, host_span global_rowgroup_base, host_span first_rg_in_part, host_span batch_list, @@ -161,12 +157,14 @@ class writer::impl { size_t const _max_dictionary_size; std::optional const _max_page_fragment_size; bool const _int96_timestamps; + bool const _utc_timestamps; bool const _write_v2_headers; int32_t const _column_index_truncate_length; std::vector> const _kv_meta; // Optional user metadata. - single_write_mode const _single_write_mode; // Special parameter only used by `write()` to - // indicate that we are guaranteeing a single table - // write. This enables some internal optimizations. + cudf::io::detail::single_write_mode const + _single_write_mode; // Special parameter only used by `write()` to + // indicate that we are guaranteeing a single table + // write. This enables some internal optimizations. std::vector> const _out_sink; // Internal states, filled during `write()` and written to sink during `write` and `close()`. @@ -180,7 +178,4 @@ class writer::impl { bool _closed = false; // To track if the output has been written to sink. 
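Both writer files above also replace the four separate namespace blocks (and their matching end comments) with a single C++17 nested namespace definition, folding the old cudf::io::detail::parquet into cudf::io::parquet::detail. The two spellings declare the same scope; a minimal illustration with placeholder types:

// Single-line form used after this change:
namespace cudf::io::parquet::detail {
struct example_tag {};  // placeholder, for illustration only
}  // namespace cudf::io::parquet::detail

// Equivalent pre-C++17 nesting:
namespace cudf { namespace io { namespace parquet { namespace detail {
struct another_tag {};  // placeholder
}  // namespace detail
}  // namespace parquet
}  // namespace io
}  // namespace cudf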
}; -} // namespace parquet -} // namespace detail -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp index f3a43cbc63c..dd049d401cf 100644 --- a/cpp/src/io/utilities/column_buffer.cpp +++ b/cpp/src/io/utilities/column_buffer.cpp @@ -51,19 +51,21 @@ std::unique_ptr gather_column_buffer::make_string_column_impl(rmm::cuda_ return make_strings_column(*_strings, stream, _mr); } -void inline_column_buffer::allocate_strings_data(rmm::cuda_stream_view stream) +void cudf::io::detail::inline_column_buffer::allocate_strings_data(rmm::cuda_stream_view stream) { CUDF_EXPECTS(type.id() == type_id::STRING, "allocate_strings_data called for non-string column"); // size + 1 for final offset. _string_data will be initialized later. _data = create_data(data_type{type_id::INT32}, size + 1, stream, _mr); } -void inline_column_buffer::create_string_data(size_t num_bytes, rmm::cuda_stream_view stream) +void cudf::io::detail::inline_column_buffer::create_string_data(size_t num_bytes, + rmm::cuda_stream_view stream) { _string_data = rmm::device_buffer(num_bytes, stream, _mr); } -std::unique_ptr inline_column_buffer::make_string_column_impl(rmm::cuda_stream_view stream) +std::unique_ptr cudf::io::detail::inline_column_buffer::make_string_column_impl( + rmm::cuda_stream_view stream) { // no need for copies, just transfer ownership of the data_buffers to the columns auto const state = mask_state::UNALLOCATED; @@ -324,7 +326,7 @@ std::unique_ptr empty_like(column_buffer_base& buffer, } using pointer_type = gather_column_buffer; -using string_type = inline_column_buffer; +using string_type = cudf::io::detail::inline_column_buffer; using pointer_column_buffer = column_buffer_base; using string_column_buffer = column_buffer_base; diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 7a7121aa91d..a466ef84133 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -360,6 +360,11 @@ class user_datasource_wrapper : public datasource { return source->supports_device_read(); } + [[nodiscard]] bool is_device_read_preferred(size_t size) const override + { + return source->is_device_read_preferred(size); + } + size_t device_read(size_t offset, size_t size, uint8_t* dst, @@ -375,8 +380,18 @@ class user_datasource_wrapper : public datasource { return source->device_read(offset, size, stream); } + std::future device_read_async(size_t offset, + size_t size, + uint8_t* dst, + rmm::cuda_stream_view stream) override + { + return source->device_read_async(offset, size, dst, stream); + } + [[nodiscard]] size_t size() const override { return source->size(); } + [[nodiscard]] bool is_empty() const override { return source->is_empty(); } + private: datasource* const source; ///< A non-owning pointer to the user-implemented datasource }; diff --git a/cpp/src/jit/parser.cpp b/cpp/src/jit/parser.cpp index 1bc126d3be9..e59c1089318 100644 --- a/cpp/src/jit/parser.cpp +++ b/cpp/src/jit/parser.cpp @@ -114,6 +114,7 @@ std::string ptx_parser::parse_instruction(std::string const& src) size_t start = 0; size_t stop = 0; bool is_instruction = true; + bool is_pragma_instruction = false; bool is_param_loading_instruction = false; std::string constraint; std::string register_type; @@ -181,6 +182,9 @@ std::string ptx_parser::parse_instruction(std::string const& src) "value through the first function parameter. 
Thus the `st.param.***` instructions " "are not processed. *** */" + "\");" + original_code; // Our port does not support return value; + } else if (piece.find(".pragma") != std::string::npos) { + is_pragma_instruction = true; + output += " " + piece; } else if (piece[0] == '@') { output += " @" + remove_nonalphanumeric(piece.substr(1, piece.size() - 1)); } else { @@ -200,6 +204,17 @@ std::string ptx_parser::parse_instruction(std::string const& src) } // Here we get to see the actual type of the input arguments. input_arg_list[remove_nonalphanumeric(piece)] = register_type_to_cpp_type(register_type); + } else if (is_pragma_instruction) { + // quote any string + std::string transformed_piece; + for (const auto& c : piece) { + if (c == '"') { + transformed_piece += "\\\""; + } else { + transformed_piece += c; + } + } + output += transformed_piece; } else { output += escape_percent(std::string(src, start, stop - start)); } diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/json/json_path.cu similarity index 98% rename from cpp/src/strings/json/json_path.cu rename to cpp/src/json/json_path.cu index c56752f5429..8217e34723c 100644 --- a/cpp/src/strings/json/json_path.cu +++ b/cpp/src/json/json_path.cu @@ -20,9 +20,9 @@ #include #include #include +#include #include #include -#include #include #include #include @@ -41,7 +41,6 @@ #include namespace cudf { -namespace strings { namespace detail { namespace { @@ -224,7 +223,9 @@ enum json_element_type { NONE, OBJECT, ARRAY, VALUE }; class json_state : private parser { public: __device__ json_state() : parser() {} - __device__ json_state(char const* _input, int64_t _input_len, get_json_object_options _options) + __device__ json_state(char const* _input, + int64_t _input_len, + cudf::get_json_object_options _options) : parser(_input, _input_len), options(_options) @@ -956,9 +957,6 @@ __launch_bounds__(block_size) __global__ } } -/** - * @copydoc cudf::strings::detail::get_json_object - */ std::unique_ptr get_json_object(cudf::strings_column_view const& col, cudf::string_scalar const& json_path, get_json_object_options options, @@ -1011,7 +1009,7 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c cudf::detail::get_value(offsets_view, col.size(), stream); // allocate output string column - auto chars = create_chars_child_column(output_size, stream, mr); + auto chars = cudf::strings::detail::create_chars_child_column(output_size, stream, mr); // potential optimization : if we know that all outputs are valid, we could skip creating // the validity mask altogether @@ -1041,17 +1039,14 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c } // namespace } // namespace detail -/** - * @copydoc cudf::strings::get_json_object - */ std::unique_ptr get_json_object(cudf::strings_column_view const& col, cudf::string_scalar const& json_path, get_json_object_options options, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::get_json_object(col, json_path, options, cudf::get_default_stream(), mr); + return detail::get_json_object(col, json_path, options, stream, mr); } -} // namespace strings } // namespace cudf diff --git a/cpp/src/lists/combine/concatenate_list_elements.cu b/cpp/src/lists/combine/concatenate_list_elements.cu index fbe297765f8..99dbd55678b 100644 --- a/cpp/src/lists/combine/concatenate_list_elements.cu +++ b/cpp/src/lists/combine/concatenate_list_elements.cu @@ -271,10 +271,11 @@ std::unique_ptr concatenate_list_elements(column_view const& input, */ 
std::unique_ptr concatenate_list_elements(column_view const& input, concatenate_null_policy null_policy, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::concatenate_list_elements(input, null_policy, cudf::get_default_stream(), mr); + return detail::concatenate_list_elements(input, null_policy, stream, mr); } } // namespace lists diff --git a/cpp/src/lists/combine/concatenate_rows.cu b/cpp/src/lists/combine/concatenate_rows.cu index 658538b0195..49be7b5ff17 100644 --- a/cpp/src/lists/combine/concatenate_rows.cu +++ b/cpp/src/lists/combine/concatenate_rows.cu @@ -305,10 +305,11 @@ std::unique_ptr concatenate_rows(table_view const& input, */ std::unique_ptr concatenate_rows(table_view const& input, concatenate_null_policy null_policy, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::concatenate_rows(input, null_policy, cudf::get_default_stream(), mr); + return detail::concatenate_rows(input, null_policy, stream, mr); } } // namespace lists diff --git a/cpp/src/lists/contains.cu b/cpp/src/lists/contains.cu index df1d043bdb6..cd2bc493bc7 100644 --- a/cpp/src/lists/contains.cu +++ b/cpp/src/lists/contains.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -274,12 +275,13 @@ std::unique_ptr index_of(lists_column_view const& lists, rmm::mr::device_memory_resource* mr) { if (!search_key.is_valid(stream)) { - return make_numeric_column(data_type{cudf::type_to_id()}, - lists.size(), - cudf::create_null_mask(lists.size(), mask_state::ALL_NULL, mr), - lists.size(), - stream, - mr); + return make_numeric_column( + data_type{cudf::type_to_id()}, + lists.size(), + cudf::detail::create_null_mask(lists.size(), mask_state::ALL_NULL, stream, mr), + lists.size(), + stream, + mr); } if (lists.size() == 0) { return make_numeric_column( @@ -287,7 +289,7 @@ std::unique_ptr index_of(lists_column_view const& lists, } auto search_key_col = cudf::make_column_from_scalar(search_key, lists.size(), stream, mr); - return index_of(lists, search_key_col->view(), find_option, stream, mr); + return detail::index_of(lists, search_key_col->view(), find_option, stream, mr); } std::unique_ptr index_of(lists_column_view const& lists, @@ -306,11 +308,11 @@ std::unique_ptr contains(lists_column_view const& lists, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto key_indices = index_of(lists, - search_key, - duplicate_find_option::FIND_FIRST, - stream, - rmm::mr::get_current_device_resource()); + auto key_indices = detail::index_of(lists, + search_key, + duplicate_find_option::FIND_FIRST, + stream, + rmm::mr::get_current_device_resource()); return to_contains(std::move(key_indices), stream, mr); } @@ -322,11 +324,11 @@ std::unique_ptr contains(lists_column_view const& lists, CUDF_EXPECTS(search_keys.size() == lists.size(), "Number of search keys must match list column size."); - auto key_indices = index_of(lists, - search_keys, - duplicate_find_option::FIND_FIRST, - stream, - rmm::mr::get_current_device_resource()); + auto key_indices = detail::index_of(lists, + search_keys, + duplicate_find_option::FIND_FIRST, + stream, + rmm::mr::get_current_device_resource()); return to_contains(std::move(key_indices), stream, mr); } @@ -337,7 +339,7 @@ std::unique_ptr contains_nulls(lists_column_view const& lists, auto const lists_cv = lists.parent(); auto output = make_numeric_column(data_type{type_to_id()}, lists.size(), - copy_bitmask(lists_cv, stream, mr), + 
cudf::detail::copy_bitmask(lists_cv, stream, mr), lists_cv.null_count(), stream, mr); @@ -364,43 +366,48 @@ std::unique_ptr contains_nulls(lists_column_view const& lists, std::unique_ptr contains(lists_column_view const& lists, cudf::scalar const& search_key, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::contains(lists, search_key, cudf::get_default_stream(), mr); + return detail::contains(lists, search_key, stream, mr); } std::unique_ptr contains(lists_column_view const& lists, column_view const& search_keys, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::contains(lists, search_keys, cudf::get_default_stream(), mr); + return detail::contains(lists, search_keys, stream, mr); } std::unique_ptr contains_nulls(lists_column_view const& lists, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::contains_nulls(lists, cudf::get_default_stream(), mr); + return detail::contains_nulls(lists, stream, mr); } std::unique_ptr index_of(lists_column_view const& lists, cudf::scalar const& search_key, duplicate_find_option find_option, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::index_of(lists, search_key, find_option, cudf::get_default_stream(), mr); + return detail::index_of(lists, search_key, find_option, stream, mr); } std::unique_ptr index_of(lists_column_view const& lists, column_view const& search_keys, duplicate_find_option find_option, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::index_of(lists, search_keys, find_option, cudf::get_default_stream(), mr); + return detail::index_of(lists, search_keys, find_option, stream, mr); } } // namespace cudf::lists diff --git a/cpp/src/lists/copying/concatenate.cu b/cpp/src/lists/copying/concatenate.cu index ddd0dfbe084..5407b88236f 100644 --- a/cpp/src/lists/copying/concatenate.cu +++ b/cpp/src/lists/copying/concatenate.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -123,8 +124,8 @@ std::unique_ptr concatenate(host_span columns, // if any of the input columns have nulls, construct the output mask bool const has_nulls = std::any_of(columns.begin(), columns.end(), [](auto const& col) { return col.has_nulls(); }); - rmm::device_buffer null_mask = create_null_mask( - total_list_count, has_nulls ? mask_state::UNINITIALIZED : mask_state::UNALLOCATED); + rmm::device_buffer null_mask = cudf::detail::create_null_mask( + total_list_count, has_nulls ? mask_state::UNINITIALIZED : mask_state::UNALLOCATED, stream, mr); auto null_mask_data = static_cast(null_mask.data()); auto const null_count = has_nulls ? 
cudf::detail::concatenate_masks(columns, null_mask_data, stream) : size_type{0}; diff --git a/cpp/src/lists/copying/segmented_gather.cu b/cpp/src/lists/copying/segmented_gather.cu index 79d33e7c17d..855ceadf33f 100644 --- a/cpp/src/lists/copying/segmented_gather.cu +++ b/cpp/src/lists/copying/segmented_gather.cu @@ -116,11 +116,11 @@ std::unique_ptr segmented_gather(lists_column_view const& value_column, std::unique_ptr segmented_gather(lists_column_view const& source_column, lists_column_view const& gather_map_list, out_of_bounds_policy bounds_policy, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::segmented_gather( - source_column, gather_map_list, bounds_policy, cudf::get_default_stream(), mr); + return detail::segmented_gather(source_column, gather_map_list, bounds_policy, stream, mr); } } // namespace lists diff --git a/cpp/src/lists/count_elements.cu b/cpp/src/lists/count_elements.cu index 40a14d805e1..2fd0851067a 100644 --- a/cpp/src/lists/count_elements.cu +++ b/cpp/src/lists/count_elements.cu @@ -73,10 +73,11 @@ std::unique_ptr count_elements(lists_column_view const& input, // external APIS std::unique_ptr count_elements(lists_column_view const& input, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::count_elements(input, cudf::get_default_stream(), mr); + return detail::count_elements(input, stream, mr); } } // namespace lists diff --git a/cpp/src/lists/extract.cu b/cpp/src/lists/extract.cu index 5d4a20d1cb8..365e9ef8255 100644 --- a/cpp/src/lists/extract.cu +++ b/cpp/src/lists/extract.cu @@ -196,10 +196,11 @@ std::unique_ptr extract_list_element(lists_column_view lists_column, */ std::unique_ptr extract_list_element(lists_column_view const& lists_column, size_type index, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::extract_list_element(lists_column, index, cudf::get_default_stream(), mr); + return detail::extract_list_element(lists_column, index, stream, mr); } /** @@ -209,12 +210,13 @@ std::unique_ptr extract_list_element(lists_column_view const& lists_colu */ std::unique_ptr extract_list_element(lists_column_view const& lists_column, column_view const& indices, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); CUDF_EXPECTS(indices.size() == lists_column.size(), "Index column must have as many elements as lists column."); - return detail::extract_list_element(lists_column, indices, cudf::get_default_stream(), mr); + return detail::extract_list_element(lists_column, indices, stream, mr); } } // namespace lists diff --git a/cpp/src/lists/reverse.cu b/cpp/src/lists/reverse.cu index a2af85b5dad..6c00f8b64b4 100644 --- a/cpp/src/lists/reverse.cu +++ b/cpp/src/lists/reverse.cu @@ -86,10 +86,12 @@ std::unique_ptr reverse(lists_column_view const& input, } // namespace detail -std::unique_ptr reverse(lists_column_view const& input, rmm::mr::device_memory_resource* mr) +std::unique_ptr reverse(lists_column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::reverse(input, cudf::get_default_stream(), mr); + return detail::reverse(input, stream, mr); } } // namespace cudf::lists diff --git a/cpp/src/lists/segmented_sort.cu b/cpp/src/lists/segmented_sort.cu index 49054ebb046..0b70773f4b2 100644 --- a/cpp/src/lists/segmented_sort.cu +++ b/cpp/src/lists/segmented_sort.cu @@ -119,20 +119,21 @@ 
std::unique_ptr stable_sort_lists(lists_column_view const& input, std::unique_ptr sort_lists(lists_column_view const& input, order column_order, null_order null_precedence, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::sort_lists(input, column_order, null_precedence, cudf::get_default_stream(), mr); + return detail::sort_lists(input, column_order, null_precedence, stream, mr); } std::unique_ptr stable_sort_lists(lists_column_view const& input, order column_order, null_order null_precedence, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::stable_sort_lists( - input, column_order, null_precedence, cudf::get_default_stream(), mr); + return detail::stable_sort_lists(input, column_order, null_precedence, stream, mr); } } // namespace lists diff --git a/cpp/src/lists/sequences.cu b/cpp/src/lists/sequences.cu index aaee5608cc3..f92ba782da7 100644 --- a/cpp/src/lists/sequences.cu +++ b/cpp/src/lists/sequences.cu @@ -208,19 +208,21 @@ std::unique_ptr sequences(column_view const& starts, std::unique_ptr sequences(column_view const& starts, column_view const& sizes, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::sequences(starts, sizes, cudf::get_default_stream(), mr); + return detail::sequences(starts, sizes, stream, mr); } std::unique_ptr sequences(column_view const& starts, column_view const& steps, column_view const& sizes, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::sequences(starts, steps, sizes, cudf::get_default_stream(), mr); + return detail::sequences(starts, steps, sizes, stream, mr); } } // namespace cudf::lists diff --git a/cpp/src/lists/set_operations.cu b/cpp/src/lists/set_operations.cu index 5687a491363..5647b503cf7 100644 --- a/cpp/src/lists/set_operations.cu +++ b/cpp/src/lists/set_operations.cu @@ -278,42 +278,44 @@ std::unique_ptr have_overlap(lists_column_view const& lhs, lists_column_view const& rhs, null_equality nulls_equal, nan_equality nans_equal, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::have_overlap(lhs, rhs, nulls_equal, nans_equal, cudf::get_default_stream(), mr); + return detail::have_overlap(lhs, rhs, nulls_equal, nans_equal, stream, mr); } std::unique_ptr intersect_distinct(lists_column_view const& lhs, lists_column_view const& rhs, null_equality nulls_equal, nan_equality nans_equal, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::intersect_distinct( - lhs, rhs, nulls_equal, nans_equal, cudf::get_default_stream(), mr); + return detail::intersect_distinct(lhs, rhs, nulls_equal, nans_equal, stream, mr); } std::unique_ptr union_distinct(lists_column_view const& lhs, lists_column_view const& rhs, null_equality nulls_equal, nan_equality nans_equal, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::union_distinct(lhs, rhs, nulls_equal, nans_equal, cudf::get_default_stream(), mr); + return detail::union_distinct(lhs, rhs, nulls_equal, nans_equal, stream, mr); } std::unique_ptr difference_distinct(lists_column_view const& lhs, lists_column_view const& rhs, null_equality nulls_equal, nan_equality nans_equal, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::difference_distinct( - lhs, rhs, nulls_equal, 
nans_equal, cudf::get_default_stream(), mr); + return detail::difference_distinct(lhs, rhs, nulls_equal, nans_equal, stream, mr); } } // namespace cudf::lists diff --git a/cpp/src/lists/stream_compaction/apply_boolean_mask.cu b/cpp/src/lists/stream_compaction/apply_boolean_mask.cu index ad43fbd5b00..ce972d89150 100644 --- a/cpp/src/lists/stream_compaction/apply_boolean_mask.cu +++ b/cpp/src/lists/stream_compaction/apply_boolean_mask.cu @@ -101,10 +101,11 @@ std::unique_ptr apply_boolean_mask(lists_column_view const& input, std::unique_ptr apply_boolean_mask(lists_column_view const& input, lists_column_view const& boolean_mask, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::apply_boolean_mask(input, boolean_mask, cudf::get_default_stream(), mr); + return detail::apply_boolean_mask(input, boolean_mask, stream, mr); } } // namespace cudf::lists diff --git a/cpp/src/lists/stream_compaction/distinct.cu b/cpp/src/lists/stream_compaction/distinct.cu index 48d8babb4fa..eb21787b3fa 100644 --- a/cpp/src/lists/stream_compaction/distinct.cu +++ b/cpp/src/lists/stream_compaction/distinct.cu @@ -76,10 +76,11 @@ std::unique_ptr distinct(lists_column_view const& input, std::unique_ptr distinct(lists_column_view const& input, null_equality nulls_equal, nan_equality nans_equal, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::distinct(input, nulls_equal, nans_equal, cudf::get_default_stream(), mr); + return detail::distinct(input, nulls_equal, nans_equal, stream, mr); } } // namespace cudf::lists diff --git a/cpp/src/merge/merge.cu b/cpp/src/merge/merge.cu index c0765b48205..ee29c207cf1 100644 --- a/cpp/src/merge/merge.cu +++ b/cpp/src/merge/merge.cu @@ -13,30 +13,40 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + #include #include +#include #include -#include +#include #include +#include #include #include #include #include +#include +#include #include #include +#include #include #include #include #include +#include +#include +#include #include -#include #include +#include #include #include #include #include +#include #include #include @@ -45,8 +55,47 @@ namespace cudf { namespace detail { + namespace { +template +struct row_lexicographic_tagged_comparator { + row_lexicographic_tagged_comparator(table_device_view const lhs, + table_device_view const rhs, + device_span const column_order, + device_span const null_precedence) + : _lhs{lhs}, _rhs{rhs}, _column_order{column_order}, _null_precedence{null_precedence} + { + } + + __device__ bool operator()(index_type lhs_tagged_index, + index_type rhs_tagged_index) const noexcept + { + auto const [l_side, l_indx] = lhs_tagged_index; + auto const [r_side, r_indx] = rhs_tagged_index; + + table_device_view const* ptr_left_dview{l_side == side::LEFT ? &_lhs : &_rhs}; + table_device_view const* ptr_right_dview{r_side == side::LEFT ? 
&_lhs : &_rhs}; + auto const comparator = [&]() { + if constexpr (has_nulls) { + return cudf::experimental::row::lexicographic::device_row_comparator{ + has_nulls, *ptr_left_dview, *ptr_right_dview, _column_order, _null_precedence}; + } else { + return cudf::experimental::row::lexicographic::device_row_comparator{ + has_nulls, *ptr_left_dview, *ptr_right_dview, _column_order}; + } + }(); + + return comparator(l_indx, r_indx) == weak_ordering::LESS; + } + + private: + table_device_view const _lhs; + table_device_view const _rhs; + device_span const _null_precedence; + device_span const _column_order; +}; + using detail::side; using index_type = detail::index_type; @@ -187,18 +236,31 @@ index_vector generate_merged_indices(table_view const& left_table, index_vector merged_indices(total_size, stream); + auto const has_nulls = + nullate::DYNAMIC{cudf::has_nulls(left_table) or cudf::has_nulls(right_table)}; + auto lhs_device_view = table_device_view::create(left_table, stream); auto rhs_device_view = table_device_view::create(right_table, stream); auto d_column_order = cudf::detail::make_device_uvector_async( column_order, stream, rmm::mr::get_current_device_resource()); - if (nullable) { + if (has_nulls) { + auto const new_null_precedence = [&]() { + if (null_precedence.size() > 0) { + CUDF_EXPECTS(static_cast(null_precedence.size()) == left_table.num_columns(), + "Null precedence vector size mismatched"); + return null_precedence; + } else { + return std::vector(left_table.num_columns(), null_order::BEFORE); + } + }(); + auto d_null_precedence = cudf::detail::make_device_uvector_async( - null_precedence, stream, rmm::mr::get_current_device_resource()); + new_null_precedence, stream, rmm::mr::get_current_device_resource()); auto ineq_op = detail::row_lexicographic_tagged_comparator( - *lhs_device_view, *rhs_device_view, d_column_order.data(), d_null_precedence.data()); + *lhs_device_view, *rhs_device_view, d_column_order, d_null_precedence); thrust::merge(rmm::exec_policy(stream), left_begin, left_begin + left_size, @@ -208,7 +270,7 @@ index_vector generate_merged_indices(table_view const& left_table, ineq_op); } else { auto ineq_op = detail::row_lexicographic_tagged_comparator( - *lhs_device_view, *rhs_device_view, d_column_order.data()); + *lhs_device_view, *rhs_device_view, d_column_order, {}); thrust::merge(rmm::exec_policy(stream), left_begin, left_begin + left_size, @@ -223,6 +285,56 @@ index_vector generate_merged_indices(table_view const& left_table, return merged_indices; } +index_vector generate_merged_indices_nested(table_view const& left_table, + table_view const& right_table, + std::vector const& column_order, + std::vector const& null_precedence, + bool nullable, + rmm::cuda_stream_view stream) +{ + size_type const left_size = left_table.num_rows(); + size_type const right_size = right_table.num_rows(); + size_type const total_size = left_size + right_size; + + index_vector merged_indices(total_size, stream); + + auto const left_indices_col = cudf::detail::lower_bound(right_table, + left_table, + column_order, + null_precedence, + stream, + rmm::mr::get_current_device_resource()); + auto const left_indices = left_indices_col->view(); + auto left_indices_mutable = left_indices_col->mutable_view(); + auto const left_indices_begin = left_indices.begin(); + auto const left_indices_end = left_indices.end(); + auto left_indices_mutable_begin = left_indices_mutable.begin(); + + auto const total_counter = thrust::make_counting_iterator(0); + thrust::for_each( + 
rmm::exec_policy_nosync(stream), + total_counter, + total_counter + total_size, + [merged = merged_indices.data(), left = left_indices_begin, left_size, right_size] __device__( + auto const idx) { + // We split threads into two groups, so only one kernel is needed. + // Threads in [0, right_size) will insert right indices in sorted order. + // Threads in [right_size, total_size) will insert left indices in sorted order. + if (idx < right_size) { + // this tells us between which segments of left elements a right element + // would fall + auto const r_bound = thrust::upper_bound(thrust::seq, left, left + left_size, idx); + auto const r_segment = thrust::distance(left, r_bound); + merged[r_segment + idx] = thrust::make_pair(side::RIGHT, idx); + } else { + auto const left_idx = idx - right_size; + merged[left[left_idx] + left_idx] = thrust::make_pair(side::LEFT, left_idx); + } + }); + + return merged_indices; +} + /** * @brief Generate merged column given row-order of merged tables * (ordered according to indices of key_cols) and the 2 columns to merge. @@ -353,6 +465,32 @@ std::unique_ptr column_merger::operator()( return result; } +// specialization for lists +template <> +std::unique_ptr column_merger::operator()( + column_view const& lcol, + column_view const& rcol, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const +{ + std::vector columns{lcol, rcol}; + auto concatenated_list = cudf::lists::detail::concatenate(columns, stream, mr); + + auto const iter_gather = cudf::detail::make_counting_transform_iterator( + 0, [row_order = row_order_.data(), lsize = lcol.size()] __device__(auto const idx) { + auto const [side, index] = row_order[idx]; + return side == side::LEFT ? index : lsize + index; + }); + + auto result = cudf::detail::gather(table_view{{concatenated_list->view()}}, + iter_gather, + iter_gather + concatenated_list->size(), + out_of_bounds_policy::DONT_CHECK, + stream, + mr); + return std::move(result->release()[0]); +} + // specialization for structs template <> std::unique_ptr column_merger::operator()( @@ -381,7 +519,7 @@ std::unique_ptr column_merger::operator()( // materialize the output buffer rmm::device_buffer validity = lcol.has_nulls() || rcol.has_nulls() - ? create_null_mask(merged_size, mask_state::UNINITIALIZED, stream, mr) + ? 
detail::create_null_mask(merged_size, mask_state::UNINITIALIZED, stream, mr) : rmm::device_buffer{}; if (lcol.has_nulls() || rcol.has_nulls()) { materialize_bitmask(lcol, @@ -418,9 +556,16 @@ table_ptr_type merge(cudf::table_view const& left_table, // extract merged row order according to indices: // - auto const merged_indices = generate_merged_indices( - index_left_view, index_right_view, column_order, null_precedence, nullable, stream); - + auto const merged_indices = [&]() { + if (cudf::detail::has_nested_columns(left_table) or + cudf::detail::has_nested_columns(right_table)) { + return generate_merged_indices_nested( + index_left_view, index_right_view, column_order, null_precedence, nullable, stream); + } else { + return generate_merged_indices( + index_left_view, index_right_view, column_order, null_precedence, nullable, stream); + } + }(); // create merged table: // auto const n_cols = left_table.num_columns(); @@ -493,6 +638,14 @@ table_ptr_type merge(std::vector const& tables_to_merge, CUDF_EXPECTS(key_cols.size() == column_order.size(), "Mismatched size between key_cols and column_order"); + CUDF_EXPECTS( + std::accumulate(tables_to_merge.cbegin(), + tables_to_merge.cend(), + std::size_t{0}, + [](auto const& running_sum, auto const& tbl) { + return running_sum + static_cast(tbl.num_rows()); + }) <= static_cast(std::numeric_limits::max()), + "Total number of merged rows exceeds row limit"); // This utility will ensure all corresponding dictionary columns have matching keys. // It will return any new dictionary columns created as well as updated table_views. diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu index 9e8b75ae3b6..44a13c450ab 100644 --- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu +++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/cpp/src/reductions/scan/scan_inclusive.cu b/cpp/src/reductions/scan/scan_inclusive.cu index e74fce62caf..91aa1cac487 100644 --- a/cpp/src/reductions/scan/scan_inclusive.cu +++ b/cpp/src/reductions/scan/scan_inclusive.cu @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include @@ -25,9 +24,10 @@ #include #include #include +#include +#include #include -#include #include #include @@ -68,43 +68,6 @@ std::pair mask_scan(column_view const& input_view namespace { -/** - * @brief Min/Max inclusive scan operator - * - * This operator will accept index values, check them and then - * run the `Op` operation on the individual element objects. - * The returned result is the appropriate index value. - * - * This was specifically created to workaround a thrust issue - * https://github.com/NVIDIA/thrust/issues/1479 - * where invalid values are passed to the operator. 
- */ -template -struct min_max_scan_operator { - column_device_view const col; ///< strings column device view - Element const null_replacement{}; ///< value used when element is null - bool const has_nulls; ///< true if col has null elements - - min_max_scan_operator(column_device_view const& col, bool has_nulls = true) - : col{col}, null_replacement{Op::template identity()}, has_nulls{has_nulls} - { - // verify validity bitmask is non-null, otherwise, is_null_nocheck() will crash - if (has_nulls) CUDF_EXPECTS(col.nullable(), "column with nulls must have a validity bitmask"); - } - - __device__ inline size_type operator()(size_type lhs, size_type rhs) const - { - // thrust::inclusive_scan may pass us garbage values so we need to protect ourselves; - // in these cases the return value does not matter since the result is not used - if (lhs < 0 || rhs < 0 || lhs >= col.size() || rhs >= col.size()) return 0; - Element d_lhs = - has_nulls && col.is_null_nocheck(lhs) ? null_replacement : col.element(lhs); - Element d_rhs = - has_nulls && col.is_null_nocheck(rhs) ? null_replacement : col.element(rhs); - return Op{}(d_lhs, d_rhs) == d_lhs ? lhs : rhs; - } -}; - template struct scan_functor { static std::unique_ptr invoke(column_view const& input_view, @@ -127,11 +90,6 @@ struct scan_functor { } }; -struct null_iterator { - bitmask_type const* mask; - __device__ bool operator()(size_type idx) const { return !bit_is_set(mask, idx); } -}; - template struct scan_functor { static std::unique_ptr invoke(column_view const& input_view, @@ -139,38 +97,7 @@ struct scan_functor { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto d_input = column_device_view::create(input_view, stream); - - // build indices of the scan operation results - rmm::device_uvector result_map(input_view.size(), stream); - thrust::inclusive_scan( - rmm::exec_policy(stream), - thrust::counting_iterator(0), - thrust::counting_iterator(input_view.size()), - result_map.begin(), - min_max_scan_operator{*d_input, input_view.has_nulls()}); - - if (input_view.has_nulls()) { - // fill the null rows with out-of-bounds values so gather records them as null; - // this prevents un-sanitized null entries in the output - auto null_itr = detail::make_counting_transform_iterator(0, null_iterator{mask}); - auto oob_val = thrust::constant_iterator(input_view.size()); - thrust::scatter_if(rmm::exec_policy(stream), - oob_val, - oob_val + input_view.size(), - thrust::counting_iterator(0), - null_itr, - result_map.data()); - } - - // call gather using the indices to build the output column - auto result_table = cudf::detail::gather(cudf::table_view({input_view}), - result_map, - out_of_bounds_policy::NULLIFY, - negative_index_policy::NOT_ALLOWED, - stream, - mr); - return std::move(result_table->release().front()); + return cudf::strings::detail::scan_inclusive(input_view, mask, stream, mr); } }; @@ -181,38 +108,7 @@ struct scan_functor { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - // Create a gather map containing indices of the prefix min/max elements. - auto gather_map = rmm::device_uvector(input.size(), stream); - auto const binop_generator = - cudf::reduction::detail::comparison_binop_generator::create(input, stream); - thrust::inclusive_scan(rmm::exec_policy(stream), - thrust::counting_iterator(0), - thrust::counting_iterator(input.size()), - gather_map.begin(), - binop_generator.binop()); - - // Gather the children columns of the input column. 
Must use `get_sliced_child` to properly - // handle input in case it is a sliced view. - auto const input_children = [&] { - auto const it = cudf::detail::make_counting_transform_iterator( - 0, [structs_view = structs_column_view{input}, &stream](auto const child_idx) { - return structs_view.get_sliced_child(child_idx, stream); - }); - return std::vector(it, it + input.num_children()); - }(); - - // Gather the children elements of the prefix min/max struct elements for the output. - auto scanned_children = cudf::detail::gather(table_view{input_children}, - gather_map, - out_of_bounds_policy::DONT_CHECK, - negative_index_policy::NOT_ALLOWED, - stream, - mr) - ->release(); - - // Don't need to set a null mask because that will be handled at the caller. - return make_structs_column( - input.size(), std::move(scanned_children), 0, rmm::device_buffer{0, stream, mr}, stream, mr); + return cudf::structs::detail::scan_inclusive(input, stream, mr); } }; diff --git a/cpp/src/round/round.cu b/cpp/src/round/round.cu index 41cce57d55b..8a6367a1f87 100644 --- a/cpp/src/round/round.cu +++ b/cpp/src/round/round.cu @@ -219,8 +219,12 @@ std::unique_ptr round_with(column_view const& input, if (decimal_places >= 0 && std::is_integral_v) return std::make_unique(input, stream, mr); - auto result = cudf::make_fixed_width_column( - input.type(), input.size(), copy_bitmask(input, stream, mr), input.null_count(), stream, mr); + auto result = cudf::make_fixed_width_column(input.type(), + input.size(), + detail::copy_bitmask(input, stream, mr), + input.null_count(), + stream, + mr); auto out_view = result->mutable_view(); T const n = std::pow(10, std::abs(decimal_places)); @@ -256,8 +260,12 @@ std::unique_ptr round_with(column_view const& input, if (input.type().scale() > -decimal_places) return cudf::detail::cast(input, result_type, stream, mr); - auto result = cudf::make_fixed_width_column( - result_type, input.size(), copy_bitmask(input, stream, mr), input.null_count(), stream, mr); + auto result = cudf::make_fixed_width_column(result_type, + input.size(), + detail::copy_bitmask(input, stream, mr), + input.null_count(), + stream, + mr); auto out_view = result->mutable_view(); diff --git a/cpp/src/search/contains_column.cu b/cpp/src/search/contains_column.cu index 4363bd212fe..b8c7d058535 100644 --- a/cpp/src/search/contains_column.cu +++ b/cpp/src/search/contains_column.cu @@ -14,23 +14,14 @@ * limitations under the License. */ -#include - -#include #include #include #include #include #include #include -#include #include -#include - -#include -#include -#include namespace cudf { namespace detail { @@ -38,61 +29,7 @@ namespace detail { namespace { struct contains_column_dispatch { - template - struct contains_fn { - bool __device__ operator()(size_type const idx) const - { - if (needles_have_nulls && needles.is_null_nocheck(idx)) { - // Exit early. The value doesn't matter, and will be masked as a null element. 
- return true; - } - - return haystack.contains(needles.template element(idx)); - } - - Haystack const haystack; - column_device_view const needles; - bool const needles_have_nulls; - }; - - template ())> - std::unique_ptr operator()(column_view const& haystack, - column_view const& needles, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const - { - auto result = make_numeric_column(data_type{type_to_id()}, - needles.size(), - copy_bitmask(needles, stream, mr), - needles.null_count(), - stream, - mr); - if (needles.is_empty()) { return result; } - - auto const out_begin = result->mutable_view().template begin(); - if (haystack.is_empty()) { - thrust::uninitialized_fill( - rmm::exec_policy(stream), out_begin, out_begin + needles.size(), false); - return result; - } - - auto const haystack_set = cudf::detail::unordered_multiset::create(haystack, stream); - auto const haystack_set_dv = haystack_set.to_device(); - auto const needles_cdv_ptr = column_device_view::create(needles, stream); - - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(needles.size()), - out_begin, - contains_fn{ - haystack_set_dv, *needles_cdv_ptr, needles.has_nulls()}); - - result->set_null_count(needles.null_count()); - - return result; - } - - template ())> + template std::unique_ptr operator()(column_view const& haystack, column_view const& needles, rmm::cuda_stream_view stream, @@ -105,7 +42,7 @@ struct contains_column_dispatch { stream, mr); return std::make_unique( - std::move(result_v), copy_bitmask(needles, stream, mr), needles.null_count()); + std::move(result_v), detail::copy_bitmask(needles, stream, mr), needles.null_count()); } }; @@ -144,8 +81,6 @@ std::unique_ptr contains(column_view const& haystack, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - CUDF_EXPECTS(haystack.type() == needles.type(), "DTYPE mismatch"); - return cudf::type_dispatcher( haystack.type(), contains_column_dispatch{}, haystack, needles, stream, mr); } diff --git a/cpp/src/strings/char_types/char_types.cu b/cpp/src/strings/char_types/char_types.cu index 0c0ad0ad29e..35b0c0a2690 100644 --- a/cpp/src/strings/char_types/char_types.cu +++ b/cpp/src/strings/char_types/char_types.cu @@ -214,25 +214,26 @@ std::unique_ptr filter_characters_of_type(strings_column_view const& str // external API -std::unique_ptr all_characters_of_type(strings_column_view const& strings, +std::unique_ptr all_characters_of_type(strings_column_view const& input, string_character_types types, string_character_types verify_types, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::all_characters_of_type( - strings, types, verify_types, cudf::get_default_stream(), mr); + return detail::all_characters_of_type(input, types, verify_types, stream, mr); } -std::unique_ptr filter_characters_of_type(strings_column_view const& strings, +std::unique_ptr filter_characters_of_type(strings_column_view const& input, string_character_types types_to_remove, string_scalar const& replacement, string_character_types types_to_keep, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::filter_characters_of_type( - strings, types_to_remove, replacement, types_to_keep, cudf::get_default_stream(), mr); + input, types_to_remove, replacement, types_to_keep, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/combine/concatenate.cu 
b/cpp/src/strings/combine/concatenate.cu index ba8acd23467..0a11b6dc460 100644 --- a/cpp/src/strings/combine/concatenate.cu +++ b/cpp/src/strings/combine/concatenate.cu @@ -267,11 +267,11 @@ std::unique_ptr concatenate(table_view const& strings_columns, string_scalar const& separator, string_scalar const& narep, separator_on_nulls separate_nulls, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::concatenate( - strings_columns, separator, narep, separate_nulls, cudf::get_default_stream(), mr); + return detail::concatenate(strings_columns, separator, narep, separate_nulls, stream, mr); } std::unique_ptr concatenate(table_view const& strings_columns, @@ -279,16 +279,12 @@ std::unique_ptr concatenate(table_view const& strings_columns, string_scalar const& separator_narep, string_scalar const& col_narep, separator_on_nulls separate_nulls, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::concatenate(strings_columns, - separators, - separator_narep, - col_narep, - separate_nulls, - cudf::get_default_stream(), - mr); + return detail::concatenate( + strings_columns, separators, separator_narep, col_narep, separate_nulls, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/combine/join.cu b/cpp/src/strings/combine/join.cu index faf1be6a26f..9ab527feaf8 100644 --- a/cpp/src/strings/combine/join.cu +++ b/cpp/src/strings/combine/join.cu @@ -180,10 +180,11 @@ std::unique_ptr join_strings(strings_column_view const& input, std::unique_ptr join_strings(strings_column_view const& strings, string_scalar const& separator, string_scalar const& narep, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::join_strings(strings, separator, narep, cudf::get_default_stream(), mr); + return detail::join_strings(strings, separator, narep, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/combine/join_list_elements.cu b/cpp/src/strings/combine/join_list_elements.cu index eee59e37478..372b49fb0ee 100644 --- a/cpp/src/strings/combine/join_list_elements.cu +++ b/cpp/src/strings/combine/join_list_elements.cu @@ -301,16 +301,12 @@ std::unique_ptr join_list_elements(lists_column_view const& lists_string string_scalar const& narep, separator_on_nulls separate_nulls, output_if_empty_list empty_list_policy, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::join_list_elements(lists_strings_column, - separator, - narep, - separate_nulls, - empty_list_policy, - cudf::get_default_stream(), - mr); + return detail::join_list_elements( + lists_strings_column, separator, narep, separate_nulls, empty_list_policy, stream, mr); } std::unique_ptr join_list_elements(lists_column_view const& lists_strings_column, @@ -319,6 +315,7 @@ std::unique_ptr join_list_elements(lists_column_view const& lists_string string_scalar const& string_narep, separator_on_nulls separate_nulls, output_if_empty_list empty_list_policy, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); @@ -328,7 +325,7 @@ std::unique_ptr join_list_elements(lists_column_view const& lists_string string_narep, separate_nulls, empty_list_policy, - cudf::get_default_stream(), + stream, mr); } diff --git a/cpp/src/strings/contains.cu b/cpp/src/strings/contains.cu index 22534870409..4383f358a33 100644 --- a/cpp/src/strings/contains.cu +++ b/cpp/src/strings/contains.cu @@ -123,28 +123,31 @@ 
std::unique_ptr count_re(strings_column_view const& input, // external APIs -std::unique_ptr contains_re(strings_column_view const& strings, +std::unique_ptr contains_re(strings_column_view const& input, regex_program const& prog, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::contains_re(strings, prog, cudf::get_default_stream(), mr); + return detail::contains_re(input, prog, stream, mr); } -std::unique_ptr matches_re(strings_column_view const& strings, +std::unique_ptr matches_re(strings_column_view const& input, regex_program const& prog, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::matches_re(strings, prog, cudf::get_default_stream(), mr); + return detail::matches_re(input, prog, stream, mr); } -std::unique_ptr count_re(strings_column_view const& strings, +std::unique_ptr count_re(strings_column_view const& input, regex_program const& prog, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::count_re(strings, prog, cudf::get_default_stream(), mr); + return detail::count_re(input, prog, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/convert/convert_booleans.cu b/cpp/src/strings/convert/convert_booleans.cu index 0d04fc74b0c..8196e1d90fb 100644 --- a/cpp/src/strings/convert/convert_booleans.cu +++ b/cpp/src/strings/convert/convert_booleans.cu @@ -39,25 +39,25 @@ namespace cudf { namespace strings { namespace detail { // Convert strings column to boolean column -std::unique_ptr to_booleans(strings_column_view const& strings, +std::unique_ptr to_booleans(strings_column_view const& input, string_scalar const& true_string, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - size_type strings_count = strings.size(); + size_type strings_count = input.size(); if (strings_count == 0) return make_numeric_column(data_type{type_id::BOOL8}, 0); CUDF_EXPECTS(true_string.is_valid(stream) && true_string.size() > 0, "Parameter true_string must not be empty."); auto d_true = string_view(true_string.data(), true_string.size()); - auto strings_column = column_device_view::create(strings.parent(), stream); + auto strings_column = column_device_view::create(input.parent(), stream); auto d_strings = *strings_column; // create output column copying the strings' null-mask auto results = make_numeric_column(data_type{type_id::BOOL8}, strings_count, - cudf::detail::copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count(), stream, mr); auto results_view = results->mutable_view(); @@ -73,19 +73,20 @@ std::unique_ptr to_booleans(strings_column_view const& strings, result = d_strings.element(idx).compare(d_true) == 0; return result; }); - results->set_null_count(strings.null_count()); + results->set_null_count(input.null_count()); return results; } } // namespace detail // external API -std::unique_ptr to_booleans(strings_column_view const& strings, +std::unique_ptr to_booleans(strings_column_view const& input, string_scalar const& true_string, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::to_booleans(strings, true_string, cudf::get_default_stream(), mr); + return detail::to_booleans(input, true_string, stream, mr); } namespace detail { @@ -156,10 +157,11 @@ std::unique_ptr from_booleans(column_view const& booleans, std::unique_ptr from_booleans(column_view 
const& booleans, string_scalar const& true_string, string_scalar const& false_string, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::from_booleans(booleans, true_string, false_string, cudf::get_default_stream(), mr); + return detail::from_booleans(booleans, true_string, false_string, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu index 8a953d778ed..d2609441d72 100644 --- a/cpp/src/strings/convert/convert_datetime.cu +++ b/cpp/src/strings/convert/convert_datetime.cu @@ -710,18 +710,20 @@ std::unique_ptr is_timestamp(strings_column_view const& input, std::unique_ptr to_timestamps(strings_column_view const& input, data_type timestamp_type, std::string_view format, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::to_timestamps(input, timestamp_type, format, cudf::get_default_stream(), mr); + return detail::to_timestamps(input, timestamp_type, format, stream, mr); } std::unique_ptr is_timestamp(strings_column_view const& input, std::string_view format, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::is_timestamp(input, format, cudf::get_default_stream(), mr); + return detail::is_timestamp(input, format, stream, mr); } namespace detail { @@ -1168,10 +1170,11 @@ std::unique_ptr from_timestamps(column_view const& timestamps, std::unique_ptr from_timestamps(column_view const& timestamps, std::string_view format, strings_column_view const& names, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::from_timestamps(timestamps, format, names, cudf::get_default_stream(), mr); + return detail::from_timestamps(timestamps, format, names, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/convert/convert_durations.cu b/cpp/src/strings/convert/convert_durations.cu index 6ab70825a6b..e781581b378 100644 --- a/cpp/src/strings/convert/convert_durations.cu +++ b/cpp/src/strings/convert/convert_durations.cu @@ -690,30 +690,30 @@ std::unique_ptr from_durations(column_view const& durations, durations.type(), dispatch_from_durations_fn{}, durations, format, stream, mr); } -std::unique_ptr to_durations(strings_column_view const& strings, +std::unique_ptr to_durations(strings_column_view const& input, data_type duration_type, std::string_view format, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - size_type strings_count = strings.size(); + size_type strings_count = input.size(); if (strings_count == 0) return make_duration_column(duration_type, 0); CUDF_EXPECTS(!format.empty(), "Format parameter must not be empty."); - auto strings_column = column_device_view::create(strings.parent(), stream); + auto strings_column = column_device_view::create(input.parent(), stream); auto d_column = *strings_column; auto results = make_duration_column(duration_type, strings_count, - cudf::detail::copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count(), stream, mr); auto results_view = results->mutable_view(); cudf::type_dispatcher( duration_type, dispatch_to_durations_fn(), d_column, format, results_view, stream); - results->set_null_count(strings.null_count()); + results->set_null_count(input.null_count()); return results; } @@ -721,19 +721,21 @@ std::unique_ptr 
to_durations(strings_column_view const& strings, std::unique_ptr from_durations(column_view const& durations, std::string_view format, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::from_durations(durations, format, cudf::get_default_stream(), mr); + return detail::from_durations(durations, format, stream, mr); } -std::unique_ptr to_durations(strings_column_view const& strings, +std::unique_ptr to_durations(strings_column_view const& input, data_type duration_type, std::string_view format, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::to_durations(strings, duration_type, format, cudf::get_default_stream(), mr); + return detail::to_durations(input, duration_type, format, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/convert/convert_fixed_point.cu b/cpp/src/strings/convert/convert_fixed_point.cu index 51aab9faeba..2c59f6dcd29 100644 --- a/cpp/src/strings/convert/convert_fixed_point.cu +++ b/cpp/src/strings/convert/convert_fixed_point.cu @@ -184,12 +184,13 @@ std::unique_ptr to_fixed_point(strings_column_view const& input, } // namespace detail // external API -std::unique_ptr to_fixed_point(strings_column_view const& strings, +std::unique_ptr to_fixed_point(strings_column_view const& input, data_type output_type, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::to_fixed_point(strings, output_type, cudf::get_default_stream(), mr); + return detail::to_fixed_point(input, output_type, stream, mr); } namespace detail { @@ -277,10 +278,11 @@ std::unique_ptr from_fixed_point(column_view const& input, // external API std::unique_ptr from_fixed_point(column_view const& input, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::from_fixed_point(input, cudf::get_default_stream(), mr); + return detail::from_fixed_point(input, stream, mr); } namespace detail { @@ -341,10 +343,11 @@ std::unique_ptr is_fixed_point(strings_column_view const& input, std::unique_ptr is_fixed_point(strings_column_view const& input, data_type decimal_type, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::is_fixed_point(input, decimal_type, cudf::get_default_stream(), mr); + return detail::is_fixed_point(input, decimal_type, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/convert/convert_floats.cu b/cpp/src/strings/convert/convert_floats.cu index 32167589ab4..81d686d690c 100644 --- a/cpp/src/strings/convert/convert_floats.cu +++ b/cpp/src/strings/convert/convert_floats.cu @@ -91,26 +91,26 @@ struct dispatch_to_floats_fn { } // namespace // This will convert a strings column into any float column type. 
-std::unique_ptr to_floats(strings_column_view const& strings, +std::unique_ptr to_floats(strings_column_view const& input, data_type output_type, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - size_type strings_count = strings.size(); + size_type strings_count = input.size(); if (strings_count == 0) return make_numeric_column(output_type, 0); - auto strings_column = column_device_view::create(strings.parent(), stream); + auto strings_column = column_device_view::create(input.parent(), stream); auto d_strings = *strings_column; // create float output column copying the strings null-mask auto results = make_numeric_column(output_type, strings_count, - cudf::detail::copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count(), stream, mr); auto results_view = results->mutable_view(); // fill output column with floats type_dispatcher(output_type, dispatch_to_floats_fn{}, d_strings, results_view, stream); - results->set_null_count(strings.null_count()); + results->set_null_count(input.null_count()); return results; } @@ -118,12 +118,13 @@ std::unique_ptr to_floats(strings_column_view const& strings, // external API -std::unique_ptr to_floats(strings_column_view const& strings, +std::unique_ptr to_floats(strings_column_view const& input, data_type output_type, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::to_floats(strings, output_type, cudf::get_default_stream(), mr); + return detail::to_floats(input, output_type, stream, mr); } namespace detail { @@ -436,48 +437,51 @@ std::unique_ptr from_floats(column_view const& floats, } // namespace detail // external API -std::unique_ptr from_floats(column_view const& floats, rmm::mr::device_memory_resource* mr) +std::unique_ptr from_floats(column_view const& floats, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::from_floats(floats, cudf::get_default_stream(), mr); + return detail::from_floats(floats, stream, mr); } namespace detail { -std::unique_ptr is_float(strings_column_view const& strings, +std::unique_ptr is_float(strings_column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto strings_column = column_device_view::create(strings.parent(), stream); + auto strings_column = column_device_view::create(input.parent(), stream); auto d_column = *strings_column; // create output column auto results = make_numeric_column(data_type{type_id::BOOL8}, - strings.size(), - cudf::detail::copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), + input.size(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count(), stream, mr); auto d_results = results->mutable_view().data(); // check strings for valid float chars thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings.size()), + thrust::make_counting_iterator(input.size()), d_results, [d_column] __device__(size_type idx) { if (d_column.is_null(idx)) return false; return is_float(d_column.element(idx)); }); - results->set_null_count(strings.null_count()); + results->set_null_count(input.null_count()); return results; } } // namespace detail // external API -std::unique_ptr is_float(strings_column_view const& strings, +std::unique_ptr is_float(strings_column_view const& input, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) 
{ CUDF_FUNC_RANGE(); - return detail::is_float(strings, cudf::get_default_stream(), mr); + return detail::is_float(input, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/convert/convert_hex.cu b/cpp/src/strings/convert/convert_hex.cu index bed682aba71..8f656b149a5 100644 --- a/cpp/src/strings/convert/convert_hex.cu +++ b/cpp/src/strings/convert/convert_hex.cu @@ -93,7 +93,8 @@ struct hex_to_integer_fn { * The output_column is expected to be one of the integer types only. */ struct dispatch_hex_to_integers_fn { - template >* = nullptr> + template ()>* = nullptr> void operator()(column_device_view const& strings_column, mutable_column_view& output_column, rmm::cuda_stream_view stream) const @@ -105,22 +106,14 @@ struct dispatch_hex_to_integers_fn { d_results, hex_to_integer_fn{strings_column}); } - // non-integral types throw an exception + // non-integer types throw an exception template - std::enable_if_t, void> operator()(Args&&...) const + std::enable_if_t(), void> operator()(Args&&...) const { - CUDF_FAIL("Output for hex_to_integers must be an integral type."); + CUDF_FAIL("Output for hex_to_integers must be an integer type."); } }; -template <> -void dispatch_hex_to_integers_fn::operator()(column_device_view const&, - mutable_column_view&, - rmm::cuda_stream_view) const -{ - CUDF_FAIL("Output for hex_to_integers must not be a boolean type."); -} - /** * @brief Functor to convert integers to hexadecimal strings * @@ -179,7 +172,8 @@ struct integer_to_hex_fn { }; struct dispatch_integers_to_hex_fn { - template >* = nullptr> + template ()>* = nullptr> std::unique_ptr operator()(column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const @@ -195,11 +189,12 @@ struct dispatch_integers_to_hex_fn { input.null_count(), cudf::detail::copy_bitmask(input, stream, mr)); } - // non-integral types throw an exception + // non-integer types throw an exception template - std::enable_if_t, std::unique_ptr> operator()(Args...) const + std::enable_if_t(), std::unique_ptr> operator()( + Args...) 
const { - CUDF_FAIL("integers_to_hex only supports integral type columns"); + CUDF_FAIL("integers_to_hex only supports integer type columns"); } }; @@ -280,24 +275,27 @@ std::unique_ptr integers_to_hex(column_view const& input, // external API std::unique_ptr hex_to_integers(strings_column_view const& strings, data_type output_type, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::hex_to_integers(strings, output_type, cudf::get_default_stream(), mr); + return detail::hex_to_integers(strings, output_type, stream, mr); } std::unique_ptr is_hex(strings_column_view const& strings, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::is_hex(strings, cudf::get_default_stream(), mr); + return detail::is_hex(strings, stream, mr); } std::unique_ptr integers_to_hex(column_view const& input, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::integers_to_hex(input, cudf::get_default_stream(), mr); + return detail::integers_to_hex(input, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/convert/convert_integers.cu b/cpp/src/strings/convert/convert_integers.cu index 5597d2831c0..4839e83d5dd 100644 --- a/cpp/src/strings/convert/convert_integers.cu +++ b/cpp/src/strings/convert/convert_integers.cu @@ -111,21 +111,21 @@ inline __device__ bool is_integer(string_view const& d_str) * @brief The dispatch functions for checking if strings are valid integers. */ struct dispatch_is_integer_fn { - template >* = nullptr> - std::unique_ptr operator()(strings_column_view const& strings, + template ()>* = nullptr> + std::unique_ptr operator()(strings_column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { - auto const d_column = column_device_view::create(strings.parent(), stream); + auto const d_column = column_device_view::create(input.parent(), stream); auto results = make_numeric_column(data_type{type_id::BOOL8}, - strings.size(), - cudf::detail::copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), + input.size(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count(), stream, mr); auto d_results = results->mutable_view().data(); - if (strings.has_nulls()) { + if (input.has_nulls()) { thrust::transform(rmm::exec_policy(stream), d_column->pair_begin(), d_column->pair_end(), @@ -140,12 +140,12 @@ struct dispatch_is_integer_fn { } // Calling mutable_view() on a column invalidates it's null count so we need to set it back - results->set_null_count(strings.null_count()); + results->set_null_count(input.null_count()); return results; } - template >* = nullptr> + template ()>* = nullptr> std::unique_ptr operator()(strings_column_view const&, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) const @@ -156,20 +156,20 @@ struct dispatch_is_integer_fn { } // namespace -std::unique_ptr is_integer(strings_column_view const& strings, +std::unique_ptr is_integer(strings_column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto const d_column = column_device_view::create(strings.parent(), stream); + auto const d_column = column_device_view::create(input.parent(), stream); auto results = make_numeric_column(data_type{type_id::BOOL8}, - strings.size(), - cudf::detail::copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), + input.size(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + 
input.null_count(), stream, mr); auto d_results = results->mutable_view().data(); - if (strings.has_nulls()) { + if (input.has_nulls()) { thrust::transform( rmm::exec_policy(stream), d_column->pair_begin(), @@ -185,36 +185,38 @@ std::unique_ptr is_integer(strings_column_view const& strings, } // Calling mutable_view() on a column invalidates it's null count so we need to set it back - results->set_null_count(strings.null_count()); + results->set_null_count(input.null_count()); return results; } -std::unique_ptr is_integer(strings_column_view const& strings, +std::unique_ptr is_integer(strings_column_view const& input, data_type int_type, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (strings.is_empty()) { return cudf::make_empty_column(type_id::BOOL8); } - return type_dispatcher(int_type, dispatch_is_integer_fn{}, strings, stream, mr); + if (input.is_empty()) { return cudf::make_empty_column(type_id::BOOL8); } + return type_dispatcher(int_type, dispatch_is_integer_fn{}, input, stream, mr); } } // namespace detail // external APIs -std::unique_ptr is_integer(strings_column_view const& strings, +std::unique_ptr is_integer(strings_column_view const& input, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::is_integer(strings, cudf::get_default_stream(), mr); + return detail::is_integer(input, stream, mr); } -std::unique_ptr is_integer(strings_column_view const& strings, +std::unique_ptr is_integer(strings_column_view const& input, data_type int_type, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::is_integer(strings, int_type, cudf::get_default_stream(), mr); + return detail::is_integer(input, int_type, stream, mr); } namespace detail { @@ -243,7 +245,8 @@ struct string_to_integer_fn { * The output_column is expected to be one of the integer types only. */ struct dispatch_to_integers_fn { - template >* = nullptr> + template ()>* = nullptr> void operator()(column_device_view const& strings_column, mutable_column_view& output_column, rmm::cuda_stream_view stream) const @@ -254,47 +257,39 @@ struct dispatch_to_integers_fn { output_column.data(), string_to_integer_fn{strings_column}); } - // non-integral types throw an exception - template >* = nullptr> + // non-integer types throw an exception + template ()>* = nullptr> void operator()(column_device_view const&, mutable_column_view&, rmm::cuda_stream_view) const { - CUDF_FAIL("Output for to_integers must be an integral type."); + CUDF_FAIL("Output for to_integers must be an integer type."); } }; -template <> -void dispatch_to_integers_fn::operator()(column_device_view const&, - mutable_column_view&, - rmm::cuda_stream_view) const -{ - CUDF_FAIL("Output for to_integers must not be a boolean type."); -} - } // namespace // This will convert a strings column into any integer column type. 
-std::unique_ptr to_integers(strings_column_view const& strings, +std::unique_ptr to_integers(strings_column_view const& input, data_type output_type, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - size_type strings_count = strings.size(); + size_type strings_count = input.size(); if (strings_count == 0) return make_numeric_column(output_type, 0); // Create integer output column copying the strings null-mask auto results = make_numeric_column(output_type, strings_count, - cudf::detail::copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count(), stream, mr); // Fill output column with integers - auto const strings_dev_view = column_device_view::create(strings.parent(), stream); + auto const strings_dev_view = column_device_view::create(input.parent(), stream); auto results_view = results->mutable_view(); type_dispatcher(output_type, dispatch_to_integers_fn{}, *strings_dev_view, results_view, stream); // Calling mutable_view() on a column invalidates it's null count so we need to set it back - results->set_null_count(strings.null_count()); + results->set_null_count(input.null_count()); return results; } @@ -302,12 +297,13 @@ std::unique_ptr to_integers(strings_column_view const& strings, } // namespace detail // external API -std::unique_ptr to_integers(strings_column_view const& strings, +std::unique_ptr to_integers(strings_column_view const& input, data_type output_type, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::to_integers(strings, output_type, cudf::get_default_stream(), mr); + return detail::to_integers(input, output_type, stream, mr); } namespace detail { @@ -351,7 +347,8 @@ struct from_integers_fn { * The template function declaration ensures only integer types are used. */ struct dispatch_from_integers_fn { - template >* = nullptr> + template ()>* = nullptr> std::unique_ptr operator()(column_view const& integers, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const @@ -373,23 +370,15 @@ struct dispatch_from_integers_fn { std::move(null_mask)); } - // non-integral types throw an exception - template >* = nullptr> + // non-integer types throw an exception + template ()>* = nullptr> std::unique_ptr operator()(column_view const&, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) const { - CUDF_FAIL("Values for from_integers function must be an integral type."); + CUDF_FAIL("Values for from_integers function must be an integer type."); } }; - -template <> -std::unique_ptr dispatch_from_integers_fn::operator()( - column_view const&, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) const -{ - CUDF_FAIL("Input for from_integers must not be a boolean type."); -} - } // namespace // This will convert all integer column types into a strings column. 
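The hunks above route the public cudf::strings::to_integers and cudf::strings::is_integer overloads through a caller-supplied rmm::cuda_stream_view instead of hard-coding cudf::get_default_stream(). A minimal caller-side sketch of the new signatures follows; the parse_ids wrapper, the INT32 choice, and the use of the current device resource are illustrative assumptions and not part of this patch.

#include <cudf/column/column.hpp>
#include <cudf/strings/convert/convert_integers.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/types.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

#include <memory>

// Sketch: parse a strings column as INT32 on a caller-provided stream.
std::unique_ptr<cudf::column> parse_ids(cudf::strings_column_view const& input,
                                        rmm::cuda_stream_view stream)
{
  auto const int32 = cudf::data_type{cudf::type_id::INT32};
  auto* mr         = rmm::mr::get_current_device_resource();

  // Rows that do not parse as INT32 come back as false here; both calls are
  // enqueued on the same user-supplied stream rather than the default stream.
  [[maybe_unused]] auto valid = cudf::strings::is_integer(input, int32, stream, mr);

  return cudf::strings::to_integers(input, int32, stream, mr);
}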
@@ -407,10 +396,11 @@ std::unique_ptr from_integers(column_view const& integers, // external API std::unique_ptr from_integers(column_view const& integers, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::from_integers(integers, cudf::get_default_stream(), mr); + return detail::from_integers(integers, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/convert/convert_ipv4.cu b/cpp/src/strings/convert/convert_ipv4.cu index adb72cb0263..07e4b3e5b17 100644 --- a/cpp/src/strings/convert/convert_ipv4.cu +++ b/cpp/src/strings/convert/convert_ipv4.cu @@ -72,19 +72,19 @@ struct ipv4_to_integers_fn { } // namespace // Convert strings column of IPv4 addresses to integers column -std::unique_ptr ipv4_to_integers(strings_column_view const& strings, +std::unique_ptr ipv4_to_integers(strings_column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - size_type strings_count = strings.size(); + size_type strings_count = input.size(); if (strings_count == 0) return make_numeric_column(data_type{type_id::INT64}, 0); - auto strings_column = column_device_view::create(strings.parent(), stream); + auto strings_column = column_device_view::create(input.parent(), stream); // create output column copying the strings' null-mask auto results = make_numeric_column(data_type{type_id::INT64}, strings_count, - cudf::detail::copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count(), stream, mr); auto d_results = results->mutable_view().data(); @@ -95,18 +95,19 @@ std::unique_ptr ipv4_to_integers(strings_column_view const& strings, d_results, ipv4_to_integers_fn{*strings_column}); // done - results->set_null_count(strings.null_count()); + results->set_null_count(input.null_count()); return results; } } // namespace detail // external API -std::unique_ptr ipv4_to_integers(strings_column_view const& strings, +std::unique_ptr ipv4_to_integers(strings_column_view const& input, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::ipv4_to_integers(strings, cudf::get_default_stream(), mr); + return detail::ipv4_to_integers(input, stream, mr); } namespace detail { @@ -173,23 +174,23 @@ std::unique_ptr integers_to_ipv4(column_view const& integers, cudf::detail::copy_bitmask(integers, stream, mr)); } -std::unique_ptr is_ipv4(strings_column_view const& strings, +std::unique_ptr is_ipv4(strings_column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto strings_column = column_device_view::create(strings.parent(), stream); + auto strings_column = column_device_view::create(input.parent(), stream); auto d_column = *strings_column; // create output column auto results = make_numeric_column(data_type{type_id::BOOL8}, - strings.size(), - cudf::detail::copy_bitmask(strings.parent(), stream, mr), - strings.null_count(), + input.size(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count(), stream, mr); auto d_results = results->mutable_view().data(); thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings.size()), + thrust::make_counting_iterator(input.size()), d_results, [d_column] __device__(size_type idx) { if (d_column.is_null(idx)) return false; @@ -214,7 +215,7 @@ std::unique_ptr is_ipv4(strings_column_view const& strings, return ip_vals[0] >= 0 && 
ip_vals[1] >= 0 && ip_vals[2] >= 0 && ip_vals[3] >= 0; }); - results->set_null_count(strings.null_count()); + results->set_null_count(input.null_count()); return results; } @@ -223,17 +224,19 @@ std::unique_ptr is_ipv4(strings_column_view const& strings, // external API std::unique_ptr integers_to_ipv4(column_view const& integers, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::integers_to_ipv4(integers, cudf::get_default_stream(), mr); + return detail::integers_to_ipv4(integers, stream, mr); } -std::unique_ptr is_ipv4(strings_column_view const& strings, +std::unique_ptr is_ipv4(strings_column_view const& input, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::is_ipv4(strings, cudf::get_default_stream(), mr); + return detail::is_ipv4(input, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/convert/convert_lists.cu b/cpp/src/strings/convert/convert_lists.cu index 3aef37914fd..f9f2b91eb12 100644 --- a/cpp/src/strings/convert/convert_lists.cu +++ b/cpp/src/strings/convert/convert_lists.cu @@ -233,10 +233,11 @@ std::unique_ptr format_list_column(lists_column_view const& input, std::unique_ptr format_list_column(lists_column_view const& input, string_scalar const& na_rep, strings_column_view const& separators, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::format_list_column(input, na_rep, separators, cudf::get_default_stream(), mr); + return detail::format_list_column(input, na_rep, separators, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/convert/convert_urls.cu b/cpp/src/strings/convert/convert_urls.cu index 9efa148cfd2..511acc38d75 100644 --- a/cpp/src/strings/convert/convert_urls.cu +++ b/cpp/src/strings/convert/convert_urls.cu @@ -148,11 +148,12 @@ std::unique_ptr url_encode(strings_column_view const& input, } // namespace detail // external API -std::unique_ptr url_encode(strings_column_view const& strings, +std::unique_ptr url_encode(strings_column_view const& input, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::url_encode(strings, cudf::get_default_stream(), mr); + return detail::url_encode(input, stream, mr); } namespace detail { @@ -211,7 +212,8 @@ __global__ void url_decode_char_counter(column_device_view const in_strings, char* in_chars_shared = temporary_buffer[local_warp_id]; // Loop through strings, and assign each string to a warp. 
- for (size_type row_idx = global_warp_id; row_idx < in_strings.size(); row_idx += nwarps) { + for (thread_index_type tidx = global_warp_id; tidx < in_strings.size(); tidx += nwarps) { + auto const row_idx = static_cast(tidx); if (in_strings.is_null(row_idx)) { out_counts[row_idx] = 0; continue; @@ -295,7 +297,8 @@ __global__ void url_decode_char_replacer(column_device_view const in_strings, char* in_chars_shared = temporary_buffer[local_warp_id]; // Loop through strings, and assign each string to a warp - for (size_type row_idx = global_warp_id; row_idx < in_strings.size(); row_idx += nwarps) { + for (thread_index_type tidx = global_warp_id; tidx < in_strings.size(); tidx += nwarps) { + auto const row_idx = static_cast(tidx); if (in_strings.is_null(row_idx)) continue; auto const in_string = in_strings.element(row_idx); @@ -428,11 +431,12 @@ std::unique_ptr url_decode(strings_column_view const& strings, // external API -std::unique_ptr url_decode(strings_column_view const& strings, +std::unique_ptr url_decode(strings_column_view const& input, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::url_decode(strings, cudf::get_default_stream(), mr); + return detail::url_decode(input, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/extract/extract.cu b/cpp/src/strings/extract/extract.cu index 532053e750e..8edcd167e5c 100644 --- a/cpp/src/strings/extract/extract.cu +++ b/cpp/src/strings/extract/extract.cu @@ -131,12 +131,13 @@ std::unique_ptr
<table> extract(strings_column_view const& input, // external API -std::unique_ptr<table>
extract(strings_column_view const& strings, +std::unique_ptr<table>
extract(strings_column_view const& input, regex_program const& prog, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::extract(strings, prog, cudf::get_default_stream(), mr); + return detail::extract(input, prog, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/extract/extract_all.cu b/cpp/src/strings/extract/extract_all.cu index 8a2f8f0cbfc..0c0d4ae4fbf 100644 --- a/cpp/src/strings/extract/extract_all.cu +++ b/cpp/src/strings/extract/extract_all.cu @@ -164,12 +164,13 @@ std::unique_ptr extract_all_record(strings_column_view const& input, // external API -std::unique_ptr extract_all_record(strings_column_view const& strings, +std::unique_ptr extract_all_record(strings_column_view const& input, regex_program const& prog, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::extract_all_record(strings, prog, cudf::get_default_stream(), mr); + return detail::extract_all_record(input, prog, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/filter_chars.cu b/cpp/src/strings/filter_chars.cu index 3e38b5fa775..9f95fedfe0b 100644 --- a/cpp/src/strings/filter_chars.cu +++ b/cpp/src/strings/filter_chars.cu @@ -154,15 +154,16 @@ std::unique_ptr filter_characters( * @copydoc cudf::strings::filter_characters */ std::unique_ptr filter_characters( - strings_column_view const& strings, + strings_column_view const& input, std::vector> characters_to_filter, filter_type keep_characters, string_scalar const& replacement, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); return detail::filter_characters( - strings, characters_to_filter, keep_characters, replacement, cudf::get_default_stream(), mr); + input, characters_to_filter, keep_characters, replacement, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/like.cu b/cpp/src/strings/like.cu index 5b91f295efb..93e00592ef2 100644 --- a/cpp/src/strings/like.cu +++ b/cpp/src/strings/like.cu @@ -185,19 +185,21 @@ std::unique_ptr like(strings_column_view const& input, std::unique_ptr like(strings_column_view const& input, string_scalar const& pattern, string_scalar const& escape_character, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::like(input, pattern, escape_character, cudf::get_default_stream(), mr); + return detail::like(input, pattern, escape_character, stream, mr); } std::unique_ptr like(strings_column_view const& input, strings_column_view const& patterns, string_scalar const& escape_character, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::like(input, patterns, escape_character, cudf::get_default_stream(), mr); + return detail::like(input, patterns, escape_character, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/padding.cu b/cpp/src/strings/padding.cu index c501a8bf7b4..850ccaa4535 100644 --- a/cpp/src/strings/padding.cu +++ b/cpp/src/strings/padding.cu @@ -168,18 +168,20 @@ std::unique_ptr pad(strings_column_view const& input, size_type width, side_type side, std::string_view fill_char, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::pad(input, width, side, fill_char, cudf::get_default_stream(), mr); + return detail::pad(input, width, side, fill_char, stream, mr); } std::unique_ptr zfill(strings_column_view const& input, size_type width, + 
rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::zfill(input, width, cudf::get_default_stream(), mr); + return detail::zfill(input, width, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/repeat_strings.cu b/cpp/src/strings/repeat_strings.cu index 396e1e6a2ac..847a64f5602 100644 --- a/cpp/src/strings/repeat_strings.cu +++ b/cpp/src/strings/repeat_strings.cu @@ -67,7 +67,7 @@ std::unique_ptr repeat_string(string_scalar const& input, return in_ptr[idx % str_size]; }); - return std::make_unique(std::move(buff)); + return std::make_unique(std::move(buff), true, stream, mr); } namespace { @@ -260,26 +260,29 @@ std::unique_ptr repeat_strings(strings_column_view const& input, std::unique_ptr repeat_string(string_scalar const& input, size_type repeat_times, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::repeat_string(input, repeat_times, cudf::get_default_stream(), mr); + return detail::repeat_string(input, repeat_times, stream, mr); } std::unique_ptr repeat_strings(strings_column_view const& input, size_type repeat_times, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::repeat_strings(input, repeat_times, cudf::get_default_stream(), mr); + return detail::repeat_strings(input, repeat_times, stream, mr); } std::unique_ptr repeat_strings(strings_column_view const& input, column_view const& repeat_times, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::repeat_strings(input, repeat_times, cudf::get_default_stream(), mr); + return detail::repeat_strings(input, repeat_times, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/replace/backref_re.cu b/cpp/src/strings/replace/backref_re.cu index 31e06aac72b..74f38cbcc20 100644 --- a/cpp/src/strings/replace/backref_re.cu +++ b/cpp/src/strings/replace/backref_re.cu @@ -148,10 +148,11 @@ std::unique_ptr replace_with_backrefs(strings_column_view const& input, std::unique_ptr replace_with_backrefs(strings_column_view const& strings, regex_program const& prog, std::string_view replacement, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::replace_with_backrefs(strings, prog, replacement, cudf::get_default_stream(), mr); + return detail::replace_with_backrefs(strings, prog, replacement, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu index 92ace4e7bc7..f80ace57c69 100644 --- a/cpp/src/strings/replace/multi.cu +++ b/cpp/src/strings/replace/multi.cu @@ -383,7 +383,7 @@ std::unique_ptr replace_character_parallel(strings_column_view const& in std::move(offsets), std::move(chars->release().children.back()), input.null_count(), - copy_bitmask(input.parent(), stream, mr)); + cudf::detail::copy_bitmask(input.parent(), stream, mr)); } /** @@ -490,10 +490,11 @@ std::unique_ptr replace(strings_column_view const& input, std::unique_ptr replace(strings_column_view const& strings, strings_column_view const& targets, strings_column_view const& repls, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::replace(strings, targets, repls, cudf::get_default_stream(), mr); + return detail::replace(strings, targets, repls, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/replace/multi_re.cu b/cpp/src/strings/replace/multi_re.cu 
index 867b443c036..3375cb7a789 100644 --- a/cpp/src/strings/replace/multi_re.cu +++ b/cpp/src/strings/replace/multi_re.cu @@ -206,10 +206,11 @@ std::unique_ptr replace_re(strings_column_view const& strings, std::vector const& patterns, strings_column_view const& replacements, regex_flags const flags, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::replace_re(strings, patterns, replacements, flags, cudf::get_default_stream(), mr); + return detail::replace_re(strings, patterns, replacements, flags, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/replace/replace.cu b/cpp/src/strings/replace/replace.cu index a622d1a742d..a6a14f27dec 100644 --- a/cpp/src/strings/replace/replace.cu +++ b/cpp/src/strings/replace/replace.cu @@ -97,7 +97,7 @@ struct replace_row_parallel_fn { } else { bytes += d_repl.size_bytes() - d_target.size_bytes(); } - position = d_str.find(d_target, position + d_target.size_bytes()); + position = d_str.find(d_target, position + d_target.length()); --max_n; } if (out_ptr) // copy whats left (or right depending on your point of view) @@ -751,21 +751,23 @@ std::unique_ptr replace_nulls(strings_column_view const& strings, std::unique_ptr replace(strings_column_view const& strings, string_scalar const& target, string_scalar const& repl, - int32_t maxrepl, + cudf::size_type maxrepl, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::replace(strings, target, repl, maxrepl, cudf::get_default_stream(), mr); + return detail::replace(strings, target, repl, maxrepl, stream, mr); } std::unique_ptr replace_slice(strings_column_view const& strings, string_scalar const& repl, size_type start, size_type stop, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::replace_slice(strings, repl, start, stop, cudf::get_default_stream(), mr); + return detail::replace_slice(strings, repl, start, stop, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/replace/replace_re.cu b/cpp/src/strings/replace/replace_re.cu index 81ddb937be5..502d5f1a52e 100644 --- a/cpp/src/strings/replace/replace_re.cu +++ b/cpp/src/strings/replace/replace_re.cu @@ -134,11 +134,11 @@ std::unique_ptr replace_re(strings_column_view const& strings, regex_program const& prog, string_scalar const& replacement, std::optional max_replace_count, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::replace_re( - strings, prog, replacement, max_replace_count, cudf::get_default_stream(), mr); + return detail::replace_re(strings, prog, replacement, max_replace_count, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/reverse.cu b/cpp/src/strings/reverse.cu index 090705ac25d..2855bdbb827 100644 --- a/cpp/src/strings/reverse.cu +++ b/cpp/src/strings/reverse.cu @@ -79,10 +79,11 @@ std::unique_ptr reverse(strings_column_view const& input, } // namespace detail std::unique_ptr reverse(strings_column_view const& input, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::reverse(input, cudf::get_default_stream(), mr); + return detail::reverse(input, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/scan/scan_inclusive.cu b/cpp/src/strings/scan/scan_inclusive.cu new file mode 100644 index 00000000000..0cf492fa295 --- /dev/null +++ b/cpp/src/strings/scan/scan_inclusive.cu @@ -0,0 +1,132 @@ +/* + * Copyright 
(c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +namespace cudf { +namespace strings { +namespace detail { +namespace { + +/** + * @brief Min/Max inclusive scan operator + * + * This operator will accept index values, check them and then + * run the `Op` operation on the individual element objects. + * The returned result is the appropriate index value. + * + * This was specifically created to workaround a thrust issue + * https://github.com/NVIDIA/thrust/issues/1479 + * where invalid values are passed to the operator. + */ +template +struct min_max_scan_operator { + column_device_view const col; ///< strings column device view + Element const null_replacement{}; ///< value used when element is null + bool const has_nulls; ///< true if col has null elements + + min_max_scan_operator(column_device_view const& col, bool has_nulls = true) + : col{col}, null_replacement{Op::template identity()}, has_nulls{has_nulls} + { + // verify validity bitmask is non-null, otherwise, is_null_nocheck() will crash + if (has_nulls) CUDF_EXPECTS(col.nullable(), "column with nulls must have a validity bitmask"); + } + + __device__ inline size_type operator()(size_type lhs, size_type rhs) const + { + // thrust::inclusive_scan may pass us garbage values so we need to protect ourselves; + // in these cases the return value does not matter since the result is not used + if (lhs < 0 || rhs < 0 || lhs >= col.size() || rhs >= col.size()) return 0; + Element d_lhs = + has_nulls && col.is_null_nocheck(lhs) ? null_replacement : col.element(lhs); + Element d_rhs = + has_nulls && col.is_null_nocheck(rhs) ? null_replacement : col.element(rhs); + return Op{}(d_lhs, d_rhs) == d_lhs ? 
lhs : rhs; + } +}; + +struct null_iterator { + bitmask_type const* mask; + __device__ bool operator()(size_type idx) const { return !bit_is_set(mask, idx); } +}; + +} // namespace + +template +std::unique_ptr scan_inclusive(column_view const& input, + bitmask_type const* mask, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto d_input = column_device_view::create(input, stream); + + // build indices of the scan operation results + rmm::device_uvector result_map(input.size(), stream); + thrust::inclusive_scan(rmm::exec_policy(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(input.size()), + result_map.begin(), + min_max_scan_operator{*d_input, input.has_nulls()}); + + if (input.has_nulls()) { + // fill the null rows with out-of-bounds values so gather records them as null; + // this prevents un-sanitized null entries in the output + auto null_itr = cudf::detail::make_counting_transform_iterator(0, null_iterator{mask}); + auto oob_val = thrust::constant_iterator(input.size()); + thrust::scatter_if(rmm::exec_policy(stream), + oob_val, + oob_val + input.size(), + thrust::counting_iterator(0), + null_itr, + result_map.data()); + } + + // call gather using the indices to build the output column + auto result_table = cudf::detail::gather(cudf::table_view({input}), + result_map, + cudf::out_of_bounds_policy::NULLIFY, + cudf::detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); + return std::move(result_table->release().front()); +} + +template std::unique_ptr scan_inclusive(column_view const& input, + bitmask_type const* mask, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +template std::unique_ptr scan_inclusive(column_view const& input, + bitmask_type const* mask, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +} // namespace detail +} // namespace strings +} // namespace cudf diff --git a/cpp/src/strings/slice.cu b/cpp/src/strings/slice.cu index cce6a19a5a6..5a1fee92c7d 100644 --- a/cpp/src/strings/slice.cu +++ b/cpp/src/strings/slice.cu @@ -248,20 +248,21 @@ std::unique_ptr slice_strings(strings_column_view const& strings, numeric_scalar const& start, numeric_scalar const& stop, numeric_scalar const& step, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::slice_strings(strings, start, stop, step, cudf::get_default_stream(), mr); + return detail::slice_strings(strings, start, stop, step, stream, mr); } std::unique_ptr slice_strings(strings_column_view const& strings, column_view const& starts_column, column_view const& stops_column, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::slice_strings( - strings, starts_column, stops_column, cudf::get_default_stream(), mr); + return detail::slice_strings(strings, starts_column, stops_column, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/split/partition.cu b/cpp/src/strings/split/partition.cu index 0c7d119ea38..16e6402cfef 100644 --- a/cpp/src/strings/split/partition.cu +++ b/cpp/src/strings/split/partition.cu @@ -239,20 +239,22 @@ std::unique_ptr
<table> rpartition(strings_column_view const& strings, // external APIs -std::unique_ptr<table>
partition(strings_column_view const& strings, +std::unique_ptr<table>
partition(strings_column_view const& input, string_scalar const& delimiter, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::partition(strings, delimiter, cudf::get_default_stream(), mr); + return detail::partition(input, delimiter, stream, mr); } -std::unique_ptr
<table> rpartition(strings_column_view const& strings, +std::unique_ptr<table>
rpartition(strings_column_view const& input, string_scalar const& delimiter, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::rpartition(strings, delimiter, cudf::get_default_stream(), mr); + return detail::rpartition(input, delimiter, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu index 3be5937297f..045aac279e6 100644 --- a/cpp/src/strings/split/split_re.cu +++ b/cpp/src/strings/split/split_re.cu @@ -290,7 +290,7 @@ std::unique_ptr split_record_re(strings_column_view const& input, std::move(offsets), std::move(strings_output), input.null_count(), - copy_bitmask(input.parent(), stream, mr), + cudf::detail::copy_bitmask(input.parent(), stream, mr), stream, mr); } @@ -340,37 +340,41 @@ std::unique_ptr rsplit_record_re(strings_column_view const& input, std::unique_ptr
split_re(strings_column_view const& input, regex_program const& prog, size_type maxsplit, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::split_re(input, prog, maxsplit, cudf::get_default_stream(), mr); + return detail::split_re(input, prog, maxsplit, stream, mr); } std::unique_ptr split_record_re(strings_column_view const& input, regex_program const& prog, size_type maxsplit, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::split_record_re(input, prog, maxsplit, cudf::get_default_stream(), mr); + return detail::split_record_re(input, prog, maxsplit, stream, mr); } std::unique_ptr
rsplit_re(strings_column_view const& input, regex_program const& prog, size_type maxsplit, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::rsplit_re(input, prog, maxsplit, cudf::get_default_stream(), mr); + return detail::rsplit_re(input, prog, maxsplit, stream, mr); } std::unique_ptr rsplit_record_re(strings_column_view const& input, regex_program const& prog, size_type maxsplit, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::rsplit_record_re(input, prog, maxsplit, cudf::get_default_stream(), mr); + return detail::rsplit_record_re(input, prog, maxsplit, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/split/split_record.cu b/cpp/src/strings/split/split_record.cu index 52f27c68111..7a0cfb9ef41 100644 --- a/cpp/src/strings/split/split_record.cu +++ b/cpp/src/strings/split/split_record.cu @@ -57,7 +57,7 @@ std::unique_ptr split_record_fn(strings_column_view const& input, std::move(offsets), std::move(results), input.null_count(), - copy_bitmask(input.parent(), stream, mr), + cudf::detail::copy_bitmask(input.parent(), stream, mr), stream, mr); } @@ -72,7 +72,7 @@ std::unique_ptr split_record_fn(strings_column_view const& input, std::move(offsets), std::move(strings_child), input.null_count(), - copy_bitmask(input.parent(), stream, mr), + cudf::detail::copy_bitmask(input.parent(), stream, mr), stream, mr); } @@ -160,7 +160,7 @@ std::unique_ptr whitespace_split_record_fn(strings_column_view const& in std::move(offsets), std::move(strings_output), input.null_count(), - copy_bitmask(input.parent(), stream, mr), + cudf::detail::copy_bitmask(input.parent(), stream, mr), stream, mr); } diff --git a/cpp/src/strings/strip.cu b/cpp/src/strings/strip.cu index 6fb7c671a87..26df76850f7 100644 --- a/cpp/src/strings/strip.cu +++ b/cpp/src/strings/strip.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -86,10 +86,11 @@ std::unique_ptr strip(strings_column_view const& input, std::unique_ptr strip(strings_column_view const& input, side_type side, string_scalar const& to_strip, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::strip(input, side, to_strip, cudf::get_default_stream(), mr); + return detail::strip(input, side, to_strip, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/translate.cu b/cpp/src/strings/translate.cu index e7b637c52f3..0ca5e103d3d 100644 --- a/cpp/src/strings/translate.cu +++ b/cpp/src/strings/translate.cu @@ -124,12 +124,13 @@ std::unique_ptr translate(strings_column_view const& strings, // external APIs -std::unique_ptr translate(strings_column_view const& strings, +std::unique_ptr translate(strings_column_view const& input, std::vector> const& chars_table, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::translate(strings, chars_table, cudf::get_default_stream(), mr); + return detail::translate(input, chars_table, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/wrap.cu b/cpp/src/strings/wrap.cu index 335908d65d1..aa87a663964 100644 --- a/cpp/src/strings/wrap.cu +++ b/cpp/src/strings/wrap.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. 
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,10 +19,9 @@ #include #include #include -#include -#include #include #include +#include #include #include @@ -133,10 +132,11 @@ std::unique_ptr wrap(strings_column_view const& strings, std::unique_ptr wrap(strings_column_view const& strings, size_type width, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::wrap(strings, width, cudf::get_default_stream(), mr); + return detail::wrap(strings, width, stream, mr); } } // namespace strings diff --git a/cpp/src/structs/scan/scan_inclusive.cu b/cpp/src/structs/scan/scan_inclusive.cu new file mode 100644 index 00000000000..823e4472960 --- /dev/null +++ b/cpp/src/structs/scan/scan_inclusive.cu @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include + +namespace cudf { +namespace structs { +namespace detail { +namespace { + +} // namespace + +template +std::unique_ptr scan_inclusive(column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // Create a gather map containing indices of the prefix min/max elements. + auto gather_map = rmm::device_uvector(input.size(), stream); + auto const binop_generator = + cudf::reduction::detail::comparison_binop_generator::create(input, stream); + thrust::inclusive_scan(rmm::exec_policy(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(input.size()), + gather_map.begin(), + binop_generator.binop()); + + // Gather the children columns of the input column. Must use `get_sliced_child` to properly + // handle input in case it is a sliced view. + auto const input_children = [&] { + auto const it = cudf::detail::make_counting_transform_iterator( + 0, [structs_view = structs_column_view{input}, &stream](auto const child_idx) { + return structs_view.get_sliced_child(child_idx, stream); + }); + return std::vector(it, it + input.num_children()); + }(); + + // Gather the children elements of the prefix min/max struct elements for the output. + auto scanned_children = cudf::detail::gather(table_view{input_children}, + gather_map, + cudf::out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, + stream, + mr) + ->release(); + + // Don't need to set a null mask because that will be handled at the caller. 
+ return make_structs_column( + input.size(), std::move(scanned_children), 0, rmm::device_buffer{0, stream, mr}, stream, mr); +} + +template std::unique_ptr scan_inclusive(column_view const& input_view, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +template std::unique_ptr scan_inclusive(column_view const& input_view, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +} // namespace detail +} // namespace structs +} // namespace cudf diff --git a/cpp/src/text/bpe/byte_pair_encoding.cu b/cpp/src/text/bpe/byte_pair_encoding.cu new file mode 100644 index 00000000000..5be35119003 --- /dev/null +++ b/cpp/src/text/bpe/byte_pair_encoding.cu @@ -0,0 +1,458 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace nvtext { +namespace detail { +namespace { + +constexpr int block_size = 512; + +/** + * @brief Produces offsets to unpairable locations in the given chars array + * + * Launched as a thread per byte of the chars array. + * The output is non-zero offsets to locations of unpairable substrings. + * An unpairable substring does not exist in the given map and so will + * never be paired. Fortunately, this can be used as an artificial + * boundary providing increased parallelism in the BPE kernel. + * + * @tparam MapRefType The type of the map finder object + */ +template +struct bpe_unpairable_offsets_fn { + cudf::device_span d_chars; + cudf::size_type offset; + MapRefType const d_map; + __device__ cudf::size_type operator()(cudf::size_type idx) + { + if (!cudf::strings::detail::is_begin_utf8_char(d_chars[idx])) { return 0; } + + auto const itr = d_chars.data() + idx; + auto const end = d_chars.end(); + auto const lhs = cudf::string_view(itr, cudf::strings::detail::bytes_in_utf8_byte(*itr)); + auto const next = itr + lhs.size_bytes(); + auto output = 0; + if (next < end) { + auto const rhs = cudf::string_view(next, cudf::strings::detail::bytes_in_utf8_byte(*next)); + // see if both halves exist anywhere in the table, if not these are unpairable + if (d_map.find(lhs) == d_map.end() && d_map.find(rhs) == d_map.end()) { + output = idx + lhs.size_bytes() + offset; // offset for artificial boundary + } + } + return output; + } +}; + +/** + * @brief Performs byte-pair-encoding + * + * Computes the locations where the separator will be inserted in `d_spaces_data`. + * This is launched as a string per block. + * + * The process first initializes all characters to 1 per position in `d_spaces_data`. + * All pairs are realized and their ranks stored in `d_ranks_data`. + * + * Iteratively, the minimum rank is located, the corresponding `d_spaces_data` location + * is set to 0 resulting in new potential pairs. 
The process repeats accounting for + * the rank of the newly formed pairs. + * + * Once there are no more rankable pairs, the process finishes and the `d_spaces_data` + * values identify the location to insert the separator. + * + * @tparam MapRefType The type of the map finder object + * @param d_strings Input data + * @param d_map For looking up individual string candidates + * @param d_spaces_data Output the location where separator will be inserted + * @param d_ranks_data Working memory to hold pair ranks + * @param d_rerank_data Working memory to hold locations where reranking is required + */ +template +__global__ void bpe_parallel_fn(cudf::column_device_view const d_strings, + MapRefType const d_map, + int8_t* d_spaces_data, // working memory + cudf::size_type* d_ranks_data, // more working memory + int8_t* d_rerank_data // and one more working memory +) +{ + // string per block + auto const str_idx = + static_cast(cudf::detail::grid_1d::global_thread_id() / block_size); + auto const lane_idx = static_cast(threadIdx.x); + + auto const d_str = d_strings.element(str_idx); + auto const offsets = + d_strings.child(cudf::strings_column_view::offsets_column_index).data(); + auto const offset = offsets[str_idx + d_strings.offset()] - offsets[d_strings.offset()]; + + auto const d_spaces = d_spaces_data + offset; + auto const end_spaces = d_spaces + d_str.size_bytes(); + auto const d_ranks = d_ranks_data + offset; + auto const end_ranks = d_ranks + d_str.size_bytes(); + auto const d_rerank = d_rerank_data + offset; + auto const end_rerank = d_rerank + d_str.size_bytes(); + + auto constexpr max_rank = cuda::std::numeric_limits::max(); + + __shared__ cudf::size_type block_min_rank; + using block_reduce = cub::BlockReduce; + __shared__ typename block_reduce::TempStorage temp_storage; + auto const num_valid = block_size < d_str.size_bytes() ? 
block_size : d_str.size_bytes(); + + // init all the re-rank identifiers to zero + for (auto itr = d_rerank + lane_idx; itr < end_rerank; itr += block_size) { + *itr = 0; + } + // init all ranks to max + for (auto itr = d_ranks + lane_idx; itr < end_ranks; itr += block_size) { + *itr = max_rank; + } + // init all spaces to 1 as appropriate + for (auto itr = d_spaces + lane_idx; itr < end_spaces; itr += block_size) { + auto const index = thrust::distance(d_spaces, itr); + *itr = static_cast(cudf::strings::detail::is_begin_utf8_char(d_str.data()[index])); + } + __syncthreads(); + + // for finding the next half of a pair + auto next_substr = [d_str, d_spaces, end = end_spaces](int8_t* begin) { + auto const next = thrust::find(thrust::seq, begin + 1, end, 1); + auto const size = static_cast(thrust::distance(begin, next)); + return cudf::string_view(d_str.data() + thrust::distance(d_spaces, begin), size); + }; + // for locating adjacent pairs after merging a pair + auto find_prev = [begin = d_spaces](int8_t* ptr) { + while (ptr > begin && *ptr == 0) { + --ptr; + } + return ptr; + }; + + auto min_rank = max_rank; + + // store all the initial ranks for each pair + // every character but the first one will have a initial rank + // + // Example: + // string: abcdefghij + // spaces: 1111111111 + // ranks: *948516327 + for (auto itr = d_spaces + lane_idx; itr < end_spaces; itr += block_size) { + if (*itr == 0) { continue; } // skips any UTF-8 continuation bytes + // resolve pair and lookup its rank + auto const lhs = next_substr(itr); // retrieve lhs of the pair + auto const next_itr = itr + lhs.size_bytes(); + if (next_itr < end_spaces) { + auto const rhs = next_substr(next_itr); // retrieve rhs of the pair + if (!rhs.empty()) { + auto rank = max_rank; + auto const mp = merge_pair_type{lhs, rhs}; + auto const map_itr = d_map.find(mp); // lookup pair in merges table; + if (map_itr != d_map.end()) { rank = map_itr->second; } // found a match; + d_ranks[thrust::distance(d_spaces, next_itr)] = rank; // store the rank + if (rank < min_rank) { min_rank = rank; } + } + } + } + // compute the min rank across the block + auto const reduce_rank = block_reduce(temp_storage).Reduce(min_rank, cub::Min(), num_valid); + if (lane_idx == 0) { block_min_rank = reduce_rank; } + __syncthreads(); + + // loop through the ranks processing the current minimum until there are no more + while (block_min_rank < max_rank) { + // search the d_ranks for matches to block_min_rank + for (auto itr = d_ranks + lane_idx; itr < end_ranks; itr += block_size) { + if (*itr == block_min_rank) { + auto ptr = itr - 1; // check for adjacent min-rank (edge-case) + while (ptr > d_ranks && *ptr == max_rank) { + --ptr; + } + // set the output value to 0 at this position (erases separator, merges pair) + // using example string above, the min-rank is 1 at position 5 + // string: abcdefghij + // spaces: 1111101111 (set position 5 to 0) + if (*ptr != block_min_rank) { d_spaces[thrust::distance(d_ranks, itr)] = 0; } + } + } + __syncthreads(); + + // identify all the re-rank locations (logic above invalidated adjacent pairs) + // using example string above, the adjacent pairs have to be re-ranked + // string: abcdefghij + // spaces: 1111101111 (pair 'e,f' is now merged) + // rerank: 0000101000 ('ef' and 'fg' need re-ranking as 'd,ef' and 'ef,g' + for (auto itr = d_ranks + lane_idx; itr < end_ranks; itr += block_size) { + auto const index = thrust::distance(d_ranks, itr); + if (*itr == block_min_rank && d_spaces[index] == 0) { + // find previous pair 
mid-point + auto ptr = find_prev(d_spaces + index - 1); + if (ptr > d_spaces) { d_rerank[thrust::distance(d_spaces, ptr)] = 1; } + // find next pair mid-point + ptr = thrust::find(thrust::seq, d_spaces + index + 1, end_spaces, 1); + if (ptr < end_spaces) { d_rerank[thrust::distance(d_spaces, ptr)] = 1; } + *itr = max_rank; // reset this rank + } + } + __syncthreads(); + + // compute the ranks for the newly created pairs + min_rank = max_rank; // and record the new minimum along the way + for (auto itr = d_rerank + lane_idx; itr < end_rerank; itr += block_size) { + auto const index = thrust::distance(d_rerank, itr); + auto rank = d_ranks[index]; + if (*itr) { + *itr = 0; // reset re-rank + // build lhs of pair + auto const ptr = find_prev(d_spaces + index - 1); + auto const size = static_cast(thrust::distance(ptr, d_spaces + index)); + auto const lhs = cudf::string_view(d_str.data() + thrust::distance(d_spaces, ptr), size); + auto const rhs = next_substr(d_spaces + index); // retrieve rhs of pair + rank = max_rank; + if (!rhs.empty()) { + auto const mp = merge_pair_type{lhs, rhs}; + auto const map_itr = d_map.find(mp); // lookup rank for this pair; + if (map_itr != d_map.end()) { rank = map_itr->second; } // found a match + } + d_ranks[index] = rank; // store new rank + } + if (rank < min_rank) { min_rank = rank; } + } + + // re-compute the minimum rank across the block (since new pairs are created above) + auto const reduce_rank = block_reduce(temp_storage).Reduce(min_rank, cub::Min(), num_valid); + if (lane_idx == 0) { block_min_rank = reduce_rank; } + __syncthreads(); + } // if no min ranks are found we are done, otherwise start again +} + +/** + * @brief Computes the output size of each strings row + * + * This launches as a string per block. + * The non-zero values in `d_spaces_data` for each string is added to + * the current string size to produce the total output bytes. + * + * @param d_strings Input data + * @param d_spaces_data Output the location where separator will be inserted + * @param d_sizes Output sizes of each row + */ +__global__ void bpe_finalize(cudf::column_device_view const d_strings, + int8_t* d_spaces_data, // where separators are inserted + cudf::size_type* d_sizes // output sizes of encoded strings +) +{ + // string per block + auto const str_idx = + static_cast(cudf::detail::grid_1d::global_thread_id() / block_size); + auto const lane_idx = static_cast(threadIdx.x); + + if (d_strings.is_null(str_idx)) { + d_sizes[str_idx] = 0; + return; + } + auto const d_str = d_strings.element(str_idx); + if (d_str.empty()) { + d_sizes[str_idx] = 0; + return; + } + + auto const offsets = + d_strings.child(cudf::strings_column_view::offsets_column_index).data(); + auto const offset = offsets[str_idx + d_strings.offset()] - offsets[d_strings.offset()]; + + auto const d_spaces = d_spaces_data + offset; + auto const end_spaces = d_spaces + d_str.size_bytes(); + auto const num_valid = block_size < d_str.size_bytes() ? 
block_size : d_str.size_bytes(); + + using block_reduce = cub::BlockReduce; + __shared__ typename block_reduce::TempStorage temp_storage; + + // reset the first position -- no separator to be added here + if (lane_idx == 0) { *d_spaces = 0; } + + // compute the output size for this string by counting the resulting separator positions + auto bytes = 0; + for (auto itr = d_spaces + lane_idx; itr < end_spaces; itr += block_size) { + bytes += (*itr > 0); + } + auto const total_bytes = block_reduce(temp_storage).Sum(bytes, num_valid); + if (lane_idx == 0) { d_sizes[str_idx] = total_bytes + d_str.size_bytes(); } +} + +} // namespace + +std::unique_ptr byte_pair_encoding(cudf::strings_column_view const& input, + bpe_merge_pairs const& merge_pairs, + cudf::string_scalar const& separator, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + if (input.is_empty() || input.chars_size() == 0) { + return cudf::make_empty_column(cudf::type_id::STRING); + } + + CUDF_EXPECTS(separator.is_valid(stream), "separator parameter must be valid"); + auto const d_separator = separator.value(stream); + CUDF_EXPECTS(d_separator.size_bytes() == 1, "for now, separator must be a single-byte character"); + + auto const d_strings = cudf::column_device_view::create(input.parent(), stream); + + auto const first_offset = (input.offset() == 0) ? 0 + : cudf::detail::get_value( + input.offsets(), input.offset(), stream); + auto const last_offset = (input.offset() == 0 && input.size() == input.offsets().size() - 1) + ? input.chars().size() + : cudf::detail::get_value( + input.offsets(), input.size() + input.offset(), stream); + auto const chars_size = last_offset - first_offset; + auto const d_input_chars = input.chars().data() + first_offset; + + auto const offset_data_type = cudf::data_type{cudf::type_to_id()}; + auto offsets = cudf::make_numeric_column( + offset_data_type, input.size() + 1, cudf::mask_state::UNALLOCATED, stream, mr); + auto d_offsets = offsets->mutable_view().data(); + + rmm::device_uvector d_spaces(chars_size, stream); // identifies non-merged pairs + // used for various purposes below: unpairable-offsets, pair ranks, separator insert positions + rmm::device_uvector d_working(chars_size, stream); + + auto const chars_begin = thrust::counting_iterator(0); + auto const chars_end = thrust::counting_iterator(chars_size); + + { + // this kernel locates unpairable sections of strings to create artificial string row + // boundaries; the boundary values are recorded as offsets in d_up_offsets + auto const d_up_offsets = d_working.data(); // store unpairable offsets here + auto const mp_map = merge_pairs.impl->get_mp_table_ref(); // lookup table + auto const d_chars_span = cudf::device_span(d_input_chars, chars_size); + auto up_fn = bpe_unpairable_offsets_fn{d_chars_span, first_offset, mp_map}; + thrust::transform(rmm::exec_policy_nosync(stream), chars_begin, chars_end, d_up_offsets, up_fn); + auto const up_end = // remove all but the unpairable offsets + thrust::remove(rmm::exec_policy_nosync(stream), d_up_offsets, d_up_offsets + chars_size, 0); + auto const unpairables = thrust::distance(d_up_offsets, up_end); // number of unpairables + + // new string boundaries created by combining unpairable offsets with the existing offsets + auto tmp_offsets = rmm::device_uvector(unpairables + input.size() + 1, stream); + thrust::merge(rmm::exec_policy_nosync(stream), + input.offsets_begin(), + input.offsets_end(), + d_up_offsets, + up_end, + tmp_offsets.begin()); + // remove any adjacent duplicate 
offsets (i.e. empty or null rows) + auto const offsets_end = + thrust::unique(rmm::exec_policy_nosync(stream), tmp_offsets.begin(), tmp_offsets.end()); + auto const offsets_total = + static_cast(thrust::distance(tmp_offsets.begin(), offsets_end)); + tmp_offsets.resize(offsets_total, stream); + + // temp column created with the merged offsets and the original chars data + auto const col_offsets = + cudf::column_view(cudf::device_span(tmp_offsets)); + auto const tmp_size = offsets_total - 1; + auto const tmp_input = cudf::column_view( + input.parent().type(), tmp_size, nullptr, nullptr, 0, 0, {col_offsets, input.chars()}); + auto const d_tmp_strings = cudf::column_device_view::create(tmp_input, stream); + + // launch the byte-pair-encoding kernel on the temp column + rmm::device_uvector d_rerank(chars_size, stream); // more working memory; + auto const d_ranks = d_working.data(); // store pair ranks here + auto const pair_map = merge_pairs.impl->get_merge_pairs_ref(); + bpe_parallel_fn<<>>( + *d_tmp_strings, pair_map, d_spaces.data(), d_ranks, d_rerank.data()); + } + + // compute the output sizes and store them in the d_offsets vector + bpe_finalize<<>>( + *d_strings, d_spaces.data(), d_offsets); + + // convert sizes to offsets in-place + auto const bytes = + cudf::detail::sizes_to_offsets(d_offsets, d_offsets + input.size() + 1, d_offsets, stream); + CUDF_EXPECTS(bytes <= static_cast(std::numeric_limits::max()), + "Size of output exceeds the column size limit", + std::overflow_error); + + // build the output: inserting separators to the input character data + auto chars = cudf::strings::detail::create_chars_child_column(bytes, stream, mr); + auto d_chars = chars->mutable_view().data(); + + auto const d_inserts = d_working.data(); // stores the insert positions + auto offsets_at_non_zero = [d_spaces = d_spaces.data()] __device__(auto idx) { + return d_spaces[idx] > 0; // separator to be inserted here + }; + auto const copy_end = thrust::copy_if( + rmm::exec_policy_nosync(stream), chars_begin + 1, chars_end, d_inserts, offsets_at_non_zero); + + // this will insert the single-byte separator into positions specified in d_inserts + auto const sep_char = thrust::constant_iterator(separator.to_string(stream)[0]); + thrust::merge_by_key(rmm::exec_policy_nosync(stream), + d_inserts, // where to insert separator byte + copy_end, // + chars_begin, // all indices + chars_end, // + sep_char, // byte to insert + d_input_chars, // original data + thrust::make_discard_iterator(), + d_chars); // result + + return cudf::make_strings_column(input.size(), + std::move(offsets), + std::move(chars), + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr)); +} + +} // namespace detail + +std::unique_ptr byte_pair_encoding(cudf::strings_column_view const& input, + bpe_merge_pairs const& merges_table, + cudf::string_scalar const& separator, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::byte_pair_encoding(input, merges_table, separator, cudf::get_default_stream(), mr); +} + +} // namespace nvtext diff --git a/cpp/src/text/bpe/byte_pair_encoding.cuh b/cpp/src/text/bpe/byte_pair_encoding.cuh new file mode 100644 index 00000000000..2a170317909 --- /dev/null +++ b/cpp/src/text/bpe/byte_pair_encoding.cuh @@ -0,0 +1,195 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include +#include +#include +#include + +#include +#include + +namespace nvtext { +namespace detail { + +using string_hasher_type = cudf::hashing::detail::MurmurHash3_x86_32; +using hash_value_type = string_hasher_type::result_type; +using merge_pair_type = thrust::pair; + +using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor>; + +/** + * @brief Hasher function used for building and using the cuco static-map + * + * This takes advantage of heterogeneous lookup feature in cuco static-map which + * allows inserting with one type (index) and looking up with a different type (merge_pair_type). + * + * The merge-pairs are in adjacent rows so each index will access two rows of string values. + * The hash of each string is combined for the returned result. + */ +struct bpe_hasher { + cudf::column_device_view const d_strings; + string_hasher_type hasher{}; + // used by insert + __device__ hash_value_type operator()(cudf::size_type index) const + { + index *= 2; + auto const lhs = d_strings.element(index); + auto const rhs = d_strings.element(index + 1); + return cudf::hashing::detail::hash_combine(hasher(lhs), hasher(rhs)); + } + // used by find + __device__ hash_value_type operator()(merge_pair_type const& mp) const + { + return cudf::hashing::detail::hash_combine(hasher(mp.first), hasher(mp.second)); + } +}; + +/** + * @brief Equal function used for building and using the cuco static-map + * + * This takes advantage of heterogeneous lookup feature in cuco static-map which + * allows inserting with one type (index) and looking up with a different type (merge_pair_type). + * + * The merge-pairs are in adjacent rows so each index will access two rows of string values. + * All rows from the input merge-pairs are unique. + */ +struct bpe_equal { + cudf::column_device_view const d_strings; + // used by insert + __device__ bool operator()(cudf::size_type lhs, cudf::size_type rhs) const noexcept + { + return lhs == rhs; // all rows are unique + } + // used by find + __device__ bool operator()(cudf::size_type lhs, merge_pair_type const& rhs) const noexcept + { + lhs *= 2; + auto const left = d_strings.element(lhs); + auto const right = d_strings.element(lhs + 1); + return (left == rhs.first) && (right == rhs.second); + } +}; + +using bpe_probe_scheme = cuco::experimental::linear_probing<1, bpe_hasher>; + +using merge_pairs_map_type = cuco::experimental::static_map, + cuda::thread_scope_device, + bpe_equal, + bpe_probe_scheme, + hash_table_allocator_type>; + +/** + * @brief Hasher function used for building and using the cuco static-map + * + * This takes advantage of heterogeneous lookup feature in cuco static-map which + * allows inserting with one type (index) and looking up with a different type (merge_pair_type). + * + * Each component of the merge-pairs (left and right) are stored individually in the map. 
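Both hashers rely on the same trick: the map never stores strings, only row indices, and the hasher/equal functors know how to evaluate either an index (at insert time) or the value it denotes (at probe time), so both sides land in the same slots. A small host-side sketch of that heterogeneous insert/find pattern with a hand-rolled linear-probing table; hash_combine below follows the familiar boost-style mix (illustrative, not necessarily the exact cudf implementation), and every name is an assumption rather than the real cuco API:

#include <cstddef>
#include <functional>
#include <iostream>
#include <optional>
#include <string>
#include <vector>

std::size_t hash_combine(std::size_t lhs, std::size_t rhs)
{
  return lhs ^ (rhs + 0x9e3779b9 + (lhs << 6) + (lhs >> 2));
}

int main()
{
  // adjacent rows hold the two halves of each merge pair: (h,e) then (he,l)
  std::vector<std::string> rows{"h", "e", "he", "l"};
  auto hash_pair = [&rows](std::string const& a, std::string const& b) {
    return hash_combine(std::hash<std::string>{}(a), std::hash<std::string>{}(b));
  };

  // insert by index: the stored value is the pair's rank, the key is never materialized
  std::vector<std::optional<int>> slots(8);
  for (int idx = 0; idx * 2 + 1 < static_cast<int>(rows.size()); ++idx) {
    auto h = hash_pair(rows[2 * idx], rows[2 * idx + 1]) % slots.size();
    while (slots[h]) { h = (h + 1) % slots.size(); }  // linear probing
    slots[h] = idx;
  }

  // find with a pair of strings: same hash function, equality re-reads the rows
  auto find_rank = [&](std::string const& a, std::string const& b) {
    auto h = hash_pair(a, b) % slots.size();
    while (slots[h]) {
      auto const idx = *slots[h];
      if (rows[2 * idx] == a && rows[2 * idx + 1] == b) { return idx; }
      h = (h + 1) % slots.size();
    }
    return -1;
  };

  std::cout << find_rank("he", "l") << '\n';  // prints 1, the rank of ("he","l")
  return 0;
}

bpe_hasher/bpe_equal apply this with the pair stored at rows 2*i and 2*i+1, while mp_hasher/mp_equal below do the same with a single row per key.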
+ */ +struct mp_hasher { + cudf::column_device_view const d_strings; + string_hasher_type hasher{}; + // used by insert + __device__ hash_value_type operator()(cudf::size_type index) const + { + auto const d_str = d_strings.element(index); + return hasher(d_str); + } + // used by find + __device__ hash_value_type operator()(cudf::string_view const& d_str) const + { + return hasher(d_str); + } +}; + +/** + * @brief Equal function used for building and using the cuco static-map + * + * This takes advantage of heterogeneous lookup feature in cuco static-map which + * allows inserting with one type (index) and looking up with a different type (string). + */ +struct mp_equal { + cudf::column_device_view const d_strings; + // used by insert + __device__ bool operator()(cudf::size_type lhs, cudf::size_type rhs) const noexcept + { + auto const left = d_strings.element(lhs); + auto const right = d_strings.element(rhs); + return left == right; + } + // used by find + __device__ bool operator()(cudf::size_type lhs, cudf::string_view const& rhs) const noexcept + { + auto const left = d_strings.element(lhs); + return left == rhs; + } +}; + +using mp_probe_scheme = cuco::experimental::linear_probing<1, mp_hasher>; + +using mp_table_map_type = cuco::experimental::static_map, + cuda::thread_scope_device, + mp_equal, + mp_probe_scheme, + hash_table_allocator_type>; + +} // namespace detail + +// since column_device_view::create() returns is a little more than +// std::unique_ptr this helper simplifies the return type for us +using col_device_view = std::invoke_result_t; + +struct bpe_merge_pairs::bpe_merge_pairs_impl { + std::unique_ptr const merge_pairs; + col_device_view const d_merge_pairs; + std::unique_ptr merge_pairs_map; // for BPE + std::unique_ptr mp_table_map; // for locating unpairables + + bpe_merge_pairs_impl(std::unique_ptr&& merge_pairs, + col_device_view&& d_merge_pairs, + std::unique_ptr&& merge_pairs_map, + std::unique_ptr&& mp_table_map); + + auto const get_merge_pairs() const { return *d_merge_pairs; } + auto get_merge_pairs_ref() const { return merge_pairs_map->ref(cuco::experimental::op::find); } + auto get_mp_table_ref() const { return mp_table_map->ref(cuco::experimental::op::find); } +}; + +} // namespace nvtext diff --git a/cpp/src/text/subword/load_merges_file.cu b/cpp/src/text/bpe/load_merge_pairs.cu similarity index 67% rename from cpp/src/text/subword/load_merges_file.cu rename to cpp/src/text/bpe/load_merge_pairs.cu index db6ad2e2dd2..80073df5804 100644 --- a/cpp/src/text/subword/load_merges_file.cu +++ b/cpp/src/text/bpe/load_merge_pairs.cu @@ -14,22 +14,21 @@ * limitations under the License. 
*/ -#include +#include -#include +#include #include #include #include #include +#include #include #include #include #include -#include - #include #include #include @@ -88,32 +87,51 @@ std::unique_ptr load_file_to_column(std::string const& filename_me std::unique_ptr initialize_merge_pairs_map( cudf::column_device_view const& input, rmm::cuda_stream_view stream) { - // Ensure capacity is at least (size/0.7) as documented here: - // https://github.com/NVIDIA/cuCollections/blob/6ec8b6dcdeceea07ab4456d32461a05c18864411/include/cuco/static_map.cuh#L179-L182 auto merge_pairs_map = std::make_unique( - static_cast(input.size() * 2), // capacity is 2x; + static_cast(input.size()), cuco::empty_key{-1}, - cuco::empty_value{-1}, // empty value is not used + cuco::empty_value{-1}, bpe_equal{input}, - probe_scheme{bpe_hasher{input}}, + bpe_probe_scheme{bpe_hasher{input}}, hash_table_allocator_type{default_allocator{}, stream}, stream.value()); auto iter = cudf::detail::make_counting_transform_iterator( 0, [] __device__(cudf::size_type idx) { return cuco::make_pair(idx, idx); }); - merge_pairs_map->insert_async(iter, iter + input.size(), stream.value()); + merge_pairs_map->insert_async(iter, iter + (input.size() / 2), stream.value()); return merge_pairs_map; } +std::unique_ptr initialize_mp_table_map( + cudf::column_device_view const& input, rmm::cuda_stream_view stream) +{ + auto mp_table_map = std::make_unique( + static_cast(input.size()), + cuco::empty_key{-1}, + cuco::empty_value{-1}, + mp_equal{input}, + mp_probe_scheme{mp_hasher{input}}, + hash_table_allocator_type{default_allocator{}, stream}, + stream.value()); + + auto iter = cudf::detail::make_counting_transform_iterator( + 0, [] __device__(cudf::size_type idx) { return cuco::make_pair(idx, idx); }); + + mp_table_map->insert_async(iter, iter + input.size(), stream.value()); + + return mp_table_map; +} + std::unique_ptr create_bpe_merge_pairs_impl( std::unique_ptr&& input, rmm::cuda_stream_view stream) { - auto d_input = cudf::column_device_view::create(input->view(), stream); - auto merge_pairs = initialize_merge_pairs_map(*d_input, stream); + auto d_input = cudf::column_device_view::create(input->view(), stream); + auto merge_pairs = initialize_merge_pairs_map(*d_input, stream); + auto mp_table_map = initialize_mp_table_map(*d_input, stream); return std::make_unique( - std::move(input), std::move(d_input), std::move(merge_pairs)); + std::move(input), std::move(d_input), std::move(merge_pairs), std::move(mp_table_map)); } std::unique_ptr create_bpe_merge_pairs_impl( @@ -121,8 +139,9 @@ std::unique_ptr create_bpe_merge_pairs_im rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return create_bpe_merge_pairs_impl(std::make_unique(input.parent(), stream, mr), - stream); + auto pairs = cudf::strings::split_record(input, cudf::string_scalar(" "), 1, stream, mr); + auto content = pairs->release(); + return create_bpe_merge_pairs_impl(std::move(content.children.back()), stream); } } // namespace @@ -135,6 +154,15 @@ std::unique_ptr load_merge_pairs_file(std::string const& filena return std::make_unique(std::move(input_column), stream, mr); } +std::unique_ptr load_merge_pairs(cudf::strings_column_view const& merge_pairs, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(!merge_pairs.is_empty(), "Merge pairs must not be empty"); + CUDF_EXPECTS(!merge_pairs.has_nulls(), "Merge pairs may not contain nulls"); + return std::make_unique(merge_pairs, stream, mr); +} + } // namespace detail 
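The split step is what makes the index-keyed maps above work: each merges row such as "he l" is split once on the space by cudf::strings::split_record, so the resulting child column holds lhs/rhs in adjacent rows and row-pair i carries rank i. A small host-side sketch of that preprocessing and of the rank lookup it enables, using std::unordered_map keyed on the textual pair instead of the cuco table (names are illustrative only):

#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

int main()
{
  // raw merge-pair rows as they appear in a merges file, highest priority first
  std::vector<std::string> const merges{"h e", "he l", "hel lo"};

  // split each row once on the first space -> flattened lhs/rhs rows
  // (assumes every row contains exactly one separating space)
  std::vector<std::string> components;
  for (auto const& row : merges) {
    auto const pos = row.find(' ');
    components.push_back(row.substr(0, pos));
    components.push_back(row.substr(pos + 1));
  }

  // pair i -> rank i (the device code keeps only the index; here we key on the text)
  std::unordered_map<std::string, int> ranks;
  for (std::size_t i = 0; i * 2 + 1 < components.size(); ++i) {
    ranks.emplace(components[2 * i] + ' ' + components[2 * i + 1], static_cast<int>(i));
  }

  auto const it = ranks.find("he l");
  std::cout << (it != ranks.end() ? it->second : -1) << '\n';  // prints 1
  return 0;
}

A lower rank means a higher merge priority, which is why the encoding kernel always collapses the minimum rank found in a string before rescanning for new pairs.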
std::unique_ptr load_merge_pairs_file(std::string const& filename_merges, @@ -144,31 +172,42 @@ std::unique_ptr load_merge_pairs_file(std::string const& filena return detail::load_merge_pairs_file(filename_merges, cudf::get_default_stream(), mr); } +std::unique_ptr load_merge_pairs(cudf::strings_column_view const& merge_pairs, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::load_merge_pairs(merge_pairs, stream, mr); +} + bpe_merge_pairs::bpe_merge_pairs_impl::bpe_merge_pairs_impl( std::unique_ptr&& merge_pairs, std::unique_ptr>&& d_merge_pairs, - std::unique_ptr&& merge_pairs_map) + std::unique_ptr&& merge_pairs_map, + std::unique_ptr&& mp_table_map) : merge_pairs(std::move(merge_pairs)), d_merge_pairs(std::move(d_merge_pairs)), - merge_pairs_map(std::move(merge_pairs_map)) + merge_pairs_map(std::move(merge_pairs_map)), + mp_table_map(std::move(mp_table_map)) { } bpe_merge_pairs::bpe_merge_pairs(std::unique_ptr&& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource*) - : impl(detail::create_bpe_merge_pairs_impl(std::move(input), stream)) + : impl(detail::create_bpe_merge_pairs_impl(std::move(input), stream).release()) { } bpe_merge_pairs::bpe_merge_pairs(cudf::strings_column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) - : impl(detail::create_bpe_merge_pairs_impl(input, stream, mr)) + : impl(detail::create_bpe_merge_pairs_impl(input, stream, mr).release()) { } -bpe_merge_pairs::~bpe_merge_pairs() = default; +bpe_merge_pairs::bpe_merge_pairs() = default; +bpe_merge_pairs::~bpe_merge_pairs() { delete impl; } } // namespace nvtext diff --git a/cpp/src/text/detokenize.cu b/cpp/src/text/detokenize.cu index a17583cf649..38cb7dd6753 100644 --- a/cpp/src/text/detokenize.cu +++ b/cpp/src/text/detokenize.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -169,13 +169,14 @@ std::unique_ptr detokenize(cudf::strings_column_view const& string } // namespace detail -std::unique_ptr detokenize(cudf::strings_column_view const& strings, +std::unique_ptr detokenize(cudf::strings_column_view const& input, cudf::column_view const& row_indices, cudf::string_scalar const& separator, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::detokenize(strings, row_indices, separator, cudf::get_default_stream(), mr); + return detail::detokenize(input, row_indices, separator, stream, mr); } } // namespace nvtext diff --git a/cpp/src/text/edit_distance.cu b/cpp/src/text/edit_distance.cu index 1460be4fcf5..3d5f2d72e6f 100644 --- a/cpp/src/text/edit_distance.cu +++ b/cpp/src/text/edit_distance.cu @@ -224,7 +224,7 @@ std::unique_ptr edit_distance_matrix(cudf::strings_column_view con cudf::size_type n_upper = (strings_count * (strings_count - 1)) / 2; rmm::device_uvector offsets(n_upper, stream); auto d_offsets = offsets.data(); - CUDF_CUDA_TRY(cudaMemsetAsync(d_offsets, 0, n_upper * sizeof(cudf::size_type), stream.value())); + CUDF_CUDA_TRY(cudaMemsetAsync(d_offsets, 0, n_upper * sizeof(std::ptrdiff_t), stream.value())); thrust::for_each_n( rmm::exec_policy(stream), thrust::make_counting_iterator(0), diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index 1b07b0785f5..0fc1d221b15 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -242,22 +242,24 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con // external APIs -std::unique_ptr normalize_spaces(cudf::strings_column_view const& strings, +std::unique_ptr normalize_spaces(cudf::strings_column_view const& input, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::normalize_spaces(strings, cudf::get_default_stream(), mr); + return detail::normalize_spaces(input, stream, mr); } /** * @copydoc nvtext::normalize_characters */ -std::unique_ptr normalize_characters(cudf::strings_column_view const& strings, +std::unique_ptr normalize_characters(cudf::strings_column_view const& input, bool do_lower_case, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::normalize_characters(strings, do_lower_case, cudf::get_default_stream(), mr); + return detail::normalize_characters(input, do_lower_case, stream, mr); } } // namespace nvtext diff --git a/cpp/src/text/replace.cu b/cpp/src/text/replace.cu index 34916e121dc..a4b28fe2dab 100644 --- a/cpp/src/text/replace.cu +++ b/cpp/src/text/replace.cu @@ -274,26 +274,26 @@ std::unique_ptr filter_tokens(cudf::strings_column_view const& str // external APIs -std::unique_ptr replace_tokens(cudf::strings_column_view const& strings, +std::unique_ptr replace_tokens(cudf::strings_column_view const& input, cudf::strings_column_view const& targets, cudf::strings_column_view const& replacements, cudf::string_scalar const& delimiter, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::replace_tokens( - strings, targets, replacements, delimiter, cudf::get_default_stream(), mr); + return detail::replace_tokens(input, targets, replacements, delimiter, stream, mr); } -std::unique_ptr filter_tokens(cudf::strings_column_view const& strings, +std::unique_ptr filter_tokens(cudf::strings_column_view const& input, cudf::size_type min_token_length, cudf::string_scalar const& replacement, cudf::string_scalar const& delimiter, + 
rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::filter_tokens( - strings, min_token_length, replacement, delimiter, cudf::get_default_stream(), mr); + return detail::filter_tokens(input, min_token_length, replacement, delimiter, stream, mr); } } // namespace nvtext diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu deleted file mode 100644 index 13c744ac6bd..00000000000 --- a/cpp/src/text/subword/bpe_tokenizer.cu +++ /dev/null @@ -1,564 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace nvtext { -namespace detail { - -namespace { - -template -constexpr bool is_whitespace(CharType ch) -{ - return ch <= ' '; -} - -/** - * @brief Resolve a substring up to the first whitespace character. - * - * This will return a substring of the input starting with the first byte - * up to the first whitespace character found or the end of the string. - * Any whitespace is expected only at the end of the string. - * - * @param d_str Input string to resolve. - * @return Substring of the input excluding any trailing whitespace. - */ -__device__ cudf::string_view get_first_token(cudf::string_view const& d_str) -{ - auto const begin = d_str.data(); - auto const end = thrust::find_if( - thrust::seq, begin, begin + d_str.size_bytes(), [](auto ch) { return is_whitespace(ch); }); - auto const size = static_cast(thrust::distance(begin, end)); - return cudf::string_view(begin, size); -} - -/** - * @brief Main byte pair encoding algorithm function for each string. - * - * @see The byte_pair_encoding_fn::operator() function below for details. - */ -template -struct byte_pair_encoding_fn { - cudf::column_device_view const d_merges; - cudf::column_device_view const d_strings; - MapRefType const d_map; - cudf::size_type* d_sizes; // output size of encoded string - string_hasher_type const hasher; - cudf::size_type* d_byte_indices; - - /** - * @brief Parse the merge pair into components. - * - * The two substrings are separated by a single space. - * - * @param idx Index of merge pair to dissect. - * @return The left and right halves of the merge pair. 
- */ - __device__ thrust::pair dissect_merge_pair( - cudf::size_type idx) - { - auto const d_pair = d_merges.element(idx); - auto const lhs = d_pair.data(); - auto const end_str = d_pair.data() + d_pair.size_bytes(); - auto const rhs = thrust::find(thrust::seq, lhs, end_str, ' '); // space always expected - // check for malformed pair entry to prevent segfault - if (rhs == end_str) { return thrust::make_pair(cudf::string_view{}, cudf::string_view{}); } - auto const lhs_size = static_cast(thrust::distance(lhs, rhs)); - auto const rhs_size = static_cast(thrust::distance(rhs + 1, end_str)); - return thrust::make_pair(cudf::string_view(lhs, lhs_size), - cudf::string_view(rhs + 1, rhs_size)); - } - - /** - * @brief Get the next substring of the given string. - * - * This will find the next sequence of characters identified by the - * given byte indices iterator values. The beginning of the sequence - * starts at `begin` and the end of the sequence is the first non-zero - * index found between (begin,end) exclusive. - * - * @tparam Iterator The byte indices iterator type - * @param begin Start of indices to check - * @param end End of indices to check - * @param d_str String to substring - * @return The substring found. - */ - template - __device__ cudf::string_view next_substr(Iterator begin, - Iterator end, - cudf::string_view const& d_str) - { - auto const next = thrust::find_if(thrust::seq, begin + 1, end, [](auto v) { return v != 0; }); - auto const size = static_cast(thrust::distance(begin, next)); - return cudf::string_view(d_str.data() + *begin, size); - } - - /** - * @brief Look up the pair of strings in the d_map/d_merges - * - * @param lhs Left half of the string - * @param rhs Right half of the string - * @return Position of merge pair within d_map - */ - __device__ auto get_merge_pair(cudf::string_view const& lhs, cudf::string_view const& rhs) - { - __shared__ char shmem[48 * 1024]; // max for Pascal - auto const total_size = lhs.size_bytes() + rhs.size_bytes() + 1; - auto const thread_memory_size = static_cast(sizeof(shmem) / blockDim.x); - - // Edge case check. - // Empirically found only two merge pair strings that were greater than 70 bytes - // and they both looked like ignorable errors. - if (thread_memory_size < total_size) { return d_map.end(); } - - // build the target string in shared memory - char* ptr = &shmem[threadIdx.x * thread_memory_size]; - - // build a temp string like: temp = lhs + ' ' + rhs - memcpy(ptr, lhs.data(), lhs.size_bytes()); - memcpy(ptr + lhs.size_bytes(), " ", 1); - memcpy(ptr + lhs.size_bytes() + 1, rhs.data(), rhs.size_bytes()); - - auto const d_str = cudf::string_view(ptr, total_size); - return d_map.find(d_str); - } - - /** - * @brief Byte encode each string. - * - * Each string is iteratively scanned for the minimum rank of adjacent substring pairs - * as found within the `d_map` table. Once the minimum pair is located, that pair - * is removed -- virtually by zero-ing the index value between any matching adjacent pairs. - * - * The iteration ends once there are no more adjacent pairs or there are no more - * matches found in `d_map`. At the end, the indices for each string reflect the - * encoding pattern and can be used to build the output. - * - * This function also computes the size of the encoded output of each string - * by simply counting the number of non-zero indices values remaining. This saves - * an extra kernel launch normally required to compute the offsets of the output column. 
- * - * @param idx The index of the string in `d_strings` to encode - */ - __device__ void operator()(cudf::size_type idx) - { - if (d_strings.is_null(idx)) { - d_sizes[idx] = 0; - return; - } - auto const d_str = get_first_token(d_strings.element(idx)); - if (d_str.empty()) { - d_sizes[idx] = 0; - return; - } - - auto const offset = d_strings.child(cudf::strings_column_view::offsets_column_index) - .element(idx); - auto const d_indices = d_byte_indices + offset; - - // initialize the byte indices for this string; - // set the index value to 0 for any intermediate UTF-8 bytes - thrust::transform(thrust::seq, - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(d_str.size_bytes()), - d_indices, - [data = d_str.data()](auto idx) { - auto const byte = static_cast(data[idx]); - return cudf::strings::detail::is_begin_utf8_char(byte) ? idx : 0; - }); - - auto const begin = d_indices; - auto const end = d_indices + d_str.size_bytes(); - - // keep processing the string until there are no more adjacent pairs found in d_map - cudf::size_type min_rank = 0; - while (min_rank < cuda::std::numeric_limits::max()) { - // initialize working variables - min_rank = cuda::std::numeric_limits::max(); - - auto lhs = next_substr(begin, end, d_str); - auto itr = begin + lhs.size_bytes(); - - auto min_itr = itr; // these are set along with - auto min_size = lhs.size_bytes(); // the min_rank variable - - // check each adjacent pair against the d_map - while (itr < end) { - auto const rhs = next_substr(itr, end, d_str); - if (rhs.empty()) break; // no more adjacent pairs - - auto const map_itr = get_merge_pair(lhs, rhs); - if (map_itr != d_map.end()) { - // found a match; record the rank (and other min_ vars) - auto const rank = map_itr->second; - if (rank < min_rank) { - min_rank = rank; - min_itr = itr; - min_size = rhs.size_bytes(); - } - } - // next substring - lhs = rhs; - itr += rhs.size_bytes(); - } - - // if any pair matched, remove every occurrence from the string - if (min_rank < cuda::std::numeric_limits::max()) { - // remove the first pair we found - itr = min_itr; - *itr = 0; - - // continue scanning for other occurrences in the remainder of the string - itr += min_size; - if (itr < end) { - auto const d_pair = dissect_merge_pair(min_rank); - - lhs = next_substr(itr, end, d_str); - itr += lhs.size_bytes(); - while (itr < end) { - auto rhs = next_substr(itr, end, d_str); - if (d_pair.first == lhs && d_pair.second == rhs) { - *itr = 0; // removes the pair from this string - itr += rhs.size_bytes(); - if (itr >= end) { break; } // done checking for pairs - // skip to the next adjacent pair - rhs = next_substr(itr, end, d_str); - } - // next substring - lhs = rhs; - itr += rhs.size_bytes(); - } - } - } - } - - // compute and store the output size for this string's encoding - auto const encoded_size = d_str.size_bytes() + // number of original bytes + - thrust::count_if( // number of non-zero byte indices - thrust::seq, - d_indices, - d_indices + d_str.size_bytes(), - [](auto v) { return v != 0; }); - d_sizes[idx] = static_cast(encoded_size); - } -}; - -/** - * @brief Build the output string encoding. - * - * This copies each string to the output inserting a space at each non-zero byte index. 
- * - * @code{.txt} - * d_strings = ["helloworld", "testthis"] - * d_byte_indices = [ 0000050000 00004000] - * result is ["hello world", "test this"] - * @endcode - */ -struct build_encoding_fn { - cudf::column_device_view const d_strings; - cudf::size_type const* d_byte_indices; - cudf::size_type const* d_offsets; - char* d_chars{}; - - __device__ void operator()(cudf::size_type idx) - { - if (d_strings.is_null(idx)) { return; } - auto const d_str = get_first_token(d_strings.element(idx)); - if (d_str.empty()) { return; } - - auto const offset = d_strings.child(cudf::strings_column_view::offsets_column_index) - .element(idx); - auto const d_indices = d_byte_indices + offset; - auto d_output = d_chars ? d_chars + d_offsets[idx] : nullptr; - - // copy chars while indices[i]==0, - // insert space each time indices[i]!=0 - auto const begin = d_indices; - auto const end = d_indices + d_str.size_bytes(); - auto d_input = d_str.data(); - *d_output++ = *d_input++; - auto itr = begin + 1; - while (itr < end) { - if (*itr++) *d_output++ = ' '; - *d_output++ = *d_input++; - } - // https://github.com/rapidsai/cudf/pull/10270/files#r826319405 - } -}; - -/** - * @brief Perform byte pair encoding on each string in the input column. - * - * The result is a strings column of the same size where each string has been encoded. - * - * The encoding is performed iteratively. Each pass determines the string's lowest - * ranked merge pair as determined by the strings in `merges_table`. This pair - * is removed (virtually) from each string before starting the next iteration. - * - * Once all pairs have exhausted for all strings, the output is constructed from - * the results by adding spaces between each remaining pair in each string. - * - * @param input Strings to encode. - * @param merge_pairs Merge pairs data and map used for encoding. 
- * @param stream CUDA stream used for device memory operations and kernel launches - */ -std::unique_ptr byte_pair_encoding( - cudf::strings_column_view const& input, - bpe_merge_pairs::bpe_merge_pairs_impl const& merge_pairs, - rmm::cuda_stream_view stream) -{ - auto const d_merges = merge_pairs.get_merge_pairs(); - CUDF_EXPECTS(d_merges.size() > 0, "Merge pairs table must not be empty"); - - // build working vector to hold index values per byte - rmm::device_uvector d_byte_indices(input.chars().size(), stream); - - auto const d_strings = cudf::column_device_view::create(input.parent(), stream); - - auto offsets = cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, - static_cast(input.size() + 1), - cudf::mask_state::UNALLOCATED, - stream, - rmm::mr::get_current_device_resource()); - auto d_offsets = offsets->mutable_view().data(); - - auto map_ref = merge_pairs.get_merge_pairs_ref(); - byte_pair_encoding_fn fn{ - d_merges, *d_strings, map_ref, d_offsets, string_hasher_type{}, d_byte_indices.data()}; - thrust::for_each_n( - rmm::exec_policy(stream), thrust::make_counting_iterator(0), input.size(), fn); - - // build the output: add spaces between the remaining pairs in each string - thrust::exclusive_scan( - rmm::exec_policy(stream), d_offsets, d_offsets + input.size() + 1, d_offsets); - - auto const bytes = - cudf::detail::get_value(offsets->view(), input.size(), stream); - auto chars = cudf::strings::detail::create_chars_child_column( - bytes, stream, rmm::mr::get_current_device_resource()); - auto d_chars = chars->mutable_view().data(); - - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - input.size(), - build_encoding_fn{*d_strings, d_byte_indices.data(), d_offsets, d_chars}); - - return make_strings_column( - input.size(), std::move(offsets), std::move(chars), 0, rmm::device_buffer{}); -} - -/** - * @brief Detect space to not-space transitions inside each string. - * - * This handles sliced input and null strings as well. - * It is parallelized over bytes and returns true only for valid left edges - * -- non-space preceded by a space. - */ -struct edge_of_space_fn { - cudf::column_device_view const d_strings; - __device__ bool operator()(cudf::size_type offset) - { - auto const d_chars = - d_strings.child(cudf::strings_column_view::chars_column_index).data(); - if (is_whitespace(d_chars[offset]) || !is_whitespace(d_chars[offset - 1])) { return false; } - - auto const offsets = d_strings.child(cudf::strings_column_view::offsets_column_index); - auto const d_offsets = offsets.data() + d_strings.offset(); - // ignore offsets outside sliced range - if (offset < d_offsets[0] || offset >= d_offsets[d_strings.size()]) { return false; } - - auto itr = - thrust::lower_bound(thrust::seq, d_offsets, d_offsets + d_strings.size() + 1, offset); - // ignore offsets at existing string boundaries - if (*itr == offset) { return false; } - - // count only edges for valid strings - auto const index = static_cast(thrust::distance(d_offsets, itr)) - 1; - return d_strings.is_valid(index); - } -}; - -/** - * @brief Create new offsets by identifying substrings by whitespace. - * - * This is similar to cudf::strings::split_record but does not fully split - * and only returns new offsets. The behavior is more like a view-only slice - * of the chars child with the result still including trailing delimiters. - * - * The encoding algorithm ignores the trailing whitespace of each string. - * - * @param input Strings to tokenize. 
- * @param stream CUDA stream used for device memory operations and kernel launches - * @return New offsets including those at the edge of each space. - */ -std::unique_ptr space_offsets(cudf::strings_column_view const& input, - cudf::column_device_view const& d_strings, - rmm::cuda_stream_view stream) -{ - // count space offsets - auto const begin = thrust::make_counting_iterator(1); - auto const end = thrust::make_counting_iterator(input.chars().size()); - edge_of_space_fn edge_of_space{d_strings}; - auto const space_count = thrust::count_if(rmm::exec_policy(stream), begin, end, edge_of_space); - - // copy space offsets - rmm::device_uvector space_offsets(space_count, stream); - thrust::copy_if(rmm::exec_policy(stream), begin, end, space_offsets.data(), edge_of_space); - - // create output offsets - auto result = - cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, - static_cast(space_count + input.size() + 1), - cudf::mask_state::UNALLOCATED, - stream, - rmm::mr::get_current_device_resource()); - - // combine current offsets with space offsets - thrust::merge(rmm::exec_policy(stream), - input.offsets_begin(), - input.offsets_end(), - space_offsets.begin(), - space_offsets.end(), - result->mutable_view().begin()); - - return result; -} - -/** - * @brief Build new offsets that can be used to build a list column for calling join. - * - * This essentially returns the number of tokens for each string. - */ -struct list_offsets_fn { - cudf::column_device_view const d_strings; - __device__ cudf::size_type operator()(cudf::size_type idx) - { - if (d_strings.is_null(idx)) return 0; - auto const d_str = d_strings.element(idx); - if (d_str.empty()) return 1; // empty is a single valid result - - auto const begin = thrust::make_counting_iterator(1); - auto const end = thrust::make_counting_iterator(d_str.size_bytes()); - - // this counts the number of non-adjacent delimiters - auto const result = - thrust::count_if(thrust::seq, begin, end, [data = d_str.data()](auto chidx) { - return !is_whitespace(data[chidx]) && is_whitespace(data[chidx - 1]); - }); - return static_cast(result) + 1; - } -}; - -} // namespace - -std::unique_ptr byte_pair_encoding(cudf::strings_column_view const& input, - bpe_merge_pairs const& merge_pairs, - cudf::string_scalar const& separator, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - if (input.is_empty() || input.chars_size() == 0) - return cudf::make_empty_column(cudf::type_id::STRING); - - auto const d_strings = cudf::column_device_view::create(input.parent(), stream); - auto const offsets = space_offsets(input, *d_strings, stream); - - // build a view using the new offsets and the current input chars column - auto const input_view = cudf::column_view(cudf::data_type{cudf::type_id::STRING}, - offsets->size() - 1, - nullptr, // no parent data - nullptr, // null-mask - 0, // null-count - 0, // offset - {offsets->view(), input.chars()}); - - // run BPE on this view - auto const bpe_column = - byte_pair_encoding(cudf::strings_column_view(input_view), *(merge_pairs.impl), stream); - - // recombine the result: - // compute the offsets needed to build a list view - auto const list_offsets = [d_strings = *d_strings, stream] { - auto offsets_itr = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), list_offsets_fn{d_strings}); - auto offsets_column = std::get<0>(cudf::detail::make_offsets_child_column( - offsets_itr, offsets_itr + d_strings.size(), stream, rmm::mr::get_current_device_resource())); - return offsets_column; 
- }(); - - // build a list column_view using the BPE output and the list_offsets - auto const list_join = cudf::column_view(cudf::data_type{cudf::type_id::LIST}, - input.size(), - nullptr, // no parent data in list column - input.null_mask(), - input.null_count(), - 0, - {list_offsets->view(), bpe_column->view()}); - - // build the output strings column - auto result = - cudf::strings::detail::join_list_elements(cudf::lists_column_view(list_join), - separator, - cudf::string_scalar(""), - cudf::strings::separator_on_nulls::NO, - cudf::strings::output_if_empty_list::EMPTY_STRING, - stream, - mr); - return result; -} - -} // namespace detail - -std::unique_ptr byte_pair_encoding(cudf::strings_column_view const& input, - bpe_merge_pairs const& merges_table, - cudf::string_scalar const& separator, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::byte_pair_encoding(input, merges_table, separator, cudf::get_default_stream(), mr); -} - -} // namespace nvtext diff --git a/cpp/src/text/subword/bpe_tokenizer.cuh b/cpp/src/text/subword/bpe_tokenizer.cuh deleted file mode 100644 index 2fa879ea734..00000000000 --- a/cpp/src/text/subword/bpe_tokenizer.cuh +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#include -#include -#include -#include -#include - -#include -#include -#include - -#include - -#include -#include - -namespace nvtext { -namespace detail { - -using hash_value_type = uint32_t; -using string_hasher_type = cudf::hashing::detail::MurmurHash3_x86_32; - -/** - * @brief Hasher function used for building and using the cuco static-map - * - * This takes advantage of heterogeneous lookup feature in cuco static-map which - * allows inserting with one type (index) and looking up with a different type (string). - */ -struct bpe_hasher { - cudf::column_device_view const d_strings; - string_hasher_type hasher{}; - // used by insert - __device__ hash_value_type operator()(cudf::size_type index) const - { - return hasher(d_strings.element(index)); - } - // used by find - __device__ hash_value_type operator()(cudf::string_view const& s) const { return hasher(s); } -}; - -/** - * @brief Equal function used for building and using the cuco static-map - * - * This takes advantage of heterogeneous lookup feature in cuco static-map which - * allows inserting with one type (index) and looking up with a different type (string). 
- */ -struct bpe_equal { - cudf::column_device_view const d_strings; - // used by insert - __device__ bool operator()(cudf::size_type lhs, cudf::size_type rhs) const noexcept - { - return d_strings.element(lhs) == d_strings.element(rhs); - } - // used by find - __device__ bool operator()(cudf::size_type lhs, cudf::string_view const& rhs) const noexcept - { - return d_strings.element(lhs) == rhs; - } -}; - -using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor>; - -using probe_scheme = cuco::experimental::linear_probing<1, bpe_hasher>; - -using merge_pairs_map_type = cuco::experimental::static_map, - cuda::thread_scope_device, - bpe_equal, - probe_scheme, - hash_table_allocator_type>; - -} // namespace detail - -// since column_device_view::create returns is a little more than -// std::unique_ptr this helper simplifies the return type in a more maintainable -// way -using col_device_view = std::invoke_result_t; - -struct bpe_merge_pairs::bpe_merge_pairs_impl { - std::unique_ptr const merge_pairs; - col_device_view const d_merge_pairs; - std::unique_ptr merge_pairs_map; - - bpe_merge_pairs_impl(std::unique_ptr&& merge_pairs, - col_device_view&& d_merge_pairs, - std::unique_ptr&& merge_pairs_map); - - auto const get_merge_pairs() const { return *d_merge_pairs; } - auto get_merge_pairs_ref() const { return merge_pairs_map->ref(cuco::experimental::op::find); } -}; - -} // namespace nvtext diff --git a/cpp/src/text/tokenize.cu b/cpp/src/text/tokenize.cu index 16b9f25b802..87f6a61a533 100644 --- a/cpp/src/text/tokenize.cu +++ b/cpp/src/text/tokenize.cu @@ -232,43 +232,48 @@ std::unique_ptr character_tokenize(cudf::strings_column_view const // external APIs -std::unique_ptr tokenize(cudf::strings_column_view const& strings, +std::unique_ptr tokenize(cudf::strings_column_view const& input, cudf::string_scalar const& delimiter, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::tokenize(strings, delimiter, cudf::get_default_stream(), mr); + return detail::tokenize(input, delimiter, stream, mr); } -std::unique_ptr tokenize(cudf::strings_column_view const& strings, +std::unique_ptr tokenize(cudf::strings_column_view const& input, cudf::strings_column_view const& delimiters, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::tokenize(strings, delimiters, cudf::get_default_stream(), mr); + return detail::tokenize(input, delimiters, stream, mr); } -std::unique_ptr count_tokens(cudf::strings_column_view const& strings, +std::unique_ptr count_tokens(cudf::strings_column_view const& input, cudf::string_scalar const& delimiter, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::count_tokens(strings, delimiter, cudf::get_default_stream(), mr); + return detail::count_tokens(input, delimiter, stream, mr); } -std::unique_ptr count_tokens(cudf::strings_column_view const& strings, +std::unique_ptr count_tokens(cudf::strings_column_view const& input, cudf::strings_column_view const& delimiters, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::count_tokens(strings, delimiters, cudf::get_default_stream(), mr); + return detail::count_tokens(input, delimiters, stream, mr); } -std::unique_ptr character_tokenize(cudf::strings_column_view const& strings, +std::unique_ptr character_tokenize(cudf::strings_column_view const& input, + rmm::cuda_stream_view stream, 
rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::character_tokenize(strings, cudf::get_default_stream(), mr); + return detail::character_tokenize(input, stream, mr); } } // namespace nvtext diff --git a/cpp/src/text/vocabulary_tokenize.cu b/cpp/src/text/vocabulary_tokenize.cu index f998c9ec239..511f1995374 100644 --- a/cpp/src/text/vocabulary_tokenize.cu +++ b/cpp/src/text/vocabulary_tokenize.cu @@ -21,10 +21,12 @@ #include #include #include +#include #include #include #include #include +#include #include #include #include @@ -37,6 +39,15 @@ #include +#include +#include +#include +#include +#include +#include + +#include + namespace nvtext { namespace detail { namespace { @@ -162,6 +173,123 @@ std::unique_ptr load_vocabulary(cudf::strings_column_view c namespace detail { namespace { +/** + * @brief Threshold to decide on using string or warp parallel functions. + * + * If the average byte length of a string in a column exceeds this value then + * the warp-parallel function is used to compute the output sizes. + * Otherwise, a regular string-parallel function is used. + * + * This value was found using the vocab_tokenize benchmark results. + */ +constexpr cudf::size_type AVG_CHAR_BYTES_THRESHOLD = 128; + +constexpr int block_size = 256; + +__device__ bool is_delimiter(cudf::string_view const& d_delimiters, cudf::char_utf8 chr) +{ + return d_delimiters.empty() ? (chr <= ' ') : // whitespace check + thrust::any_of(thrust::seq, + d_delimiters.begin(), + d_delimiters.end(), + [chr] __device__(cudf::char_utf8 c) { return c == chr; }); +} + +struct mark_delimiters_fn { + char const* d_chars; + cudf::string_view const d_delimiter; + int8_t* d_results; + + __device__ void operator()(cudf::size_type idx) const + { + auto const ptr = d_chars + idx; + if (cudf::strings::detail::is_utf8_continuation_char(*ptr)) { return; } + cudf::char_utf8 chr = 0; + auto ch_size = cudf::strings::detail::to_char_utf8(ptr, chr); + auto const output = is_delimiter(d_delimiter, chr); + while (ch_size > 0) { + d_results[idx++] = output; + --ch_size; + } + } +}; + +__global__ void token_counts_fn(cudf::column_device_view const d_strings, + cudf::string_view const d_delimiter, + cudf::size_type* d_counts, + int8_t* d_results) +{ + // string per warp + auto const idx = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + if (idx >= (static_cast(d_strings.size()) * + static_cast(cudf::detail::warp_size))) { + return; + } + auto const str_idx = static_cast(idx / cudf::detail::warp_size); + auto const lane_idx = static_cast(idx % cudf::detail::warp_size); + + if (d_strings.is_null(str_idx)) { + d_counts[str_idx] = 0; + return; + } + auto const d_str = d_strings.element(str_idx); + if (d_str.empty()) { + d_counts[str_idx] = 0; + return; + } + + auto const offsets = + d_strings.child(cudf::strings_column_view::offsets_column_index).data(); + auto const offset = offsets[str_idx + d_strings.offset()] - offsets[d_strings.offset()]; + auto const chars_begin = + d_strings.child(cudf::strings_column_view::chars_column_index).data() + + offsets[d_strings.offset()]; + + auto const begin = d_str.data(); + auto const end = begin + d_str.size_bytes(); + auto const d_output = d_results + offset; + auto const d_output_end = d_output + d_str.size_bytes(); + + using warp_reduce = cub::WarpReduce; + __shared__ typename warp_reduce::TempStorage warp_storage; + + cudf::size_type count = 0; + if (lane_idx == 0) { + cudf::char_utf8 chr = 0; + auto ch_size = cudf::strings::detail::to_char_utf8(begin, chr); + auto output = 
1; + if (begin > chars_begin) { + auto ptr = begin - 1; + while (ptr > chars_begin && cudf::strings::detail::is_utf8_continuation_char(*ptr)) { + --ptr; + } + cudf::strings::detail::to_char_utf8(ptr, chr); + output = !is_delimiter(d_delimiter, chr); + } + auto ptr = d_output; + while (ch_size > 0) { + *ptr++ = output; + --ch_size; + } + count = ((begin + ch_size) == end); + } + __syncwarp(); + + for (auto itr = d_output + lane_idx + 1; itr < d_output_end; itr += cudf::detail::warp_size) { + // add one if at the edge of a token or if at the string's end + if (*itr) { + count += !(*(itr - 1)); + } else { + count += (itr + 1 == d_output_end); + } + } + __syncwarp(); + + // add up the counts from the other threads to compute the total token count for this string + auto const total_count = warp_reduce(warp_storage).Reduce(count, cub::Sum()); + if (lane_idx == 0) { d_counts[str_idx] = total_count; } +} + /** * @brief Tokenizes each string and uses the map to assign token id values * @@ -197,6 +325,33 @@ struct vocabulary_tokenizer_fn { } }; +template +struct transform_tokenizer_fn { + cudf::string_view const d_delimiter; + MapRefType d_map; + cudf::size_type const default_id; + + __device__ cudf::size_type operator()(cudf::string_view d_str) const + { + auto const begin = d_str.data(); + auto const end = begin + d_str.size_bytes(); + + auto itr = begin; + while (itr < end) { + cudf::char_utf8 chr = 0; + auto const ch_size = cudf::strings::detail::to_char_utf8(itr, chr); + if (!is_delimiter(d_delimiter, chr)) break; + itr += ch_size; + } + + auto const size = static_cast(thrust::distance(itr, end)); + auto const token = cudf::string_view{itr, size}; + // lookup token in map + auto const fitr = d_map.find(token); + return (fitr != d_map.end()) ? fitr->second : default_id; + } +}; + } // namespace std::unique_ptr tokenize_with_vocabulary(cudf::strings_column_view const& input, @@ -209,28 +364,94 @@ std::unique_ptr tokenize_with_vocabulary(cudf::strings_column_view CUDF_EXPECTS(delimiter.is_valid(stream), "Parameter delimiter must be valid"); auto const output_type = cudf::data_type{cudf::type_to_id()}; - if (input.is_empty()) { return cudf::make_empty_column(output_type); } + if (input.size() == input.null_count()) { return cudf::make_empty_column(output_type); } // count the tokens per string and build the offsets from the counts auto const d_strings = cudf::column_device_view::create(input.parent(), stream); auto const d_delimiter = delimiter.value(stream); - auto const sizes_itr = - cudf::detail::make_counting_transform_iterator(0, strings_tokenizer{*d_strings, d_delimiter}); - auto [token_offsets, total_count] = - cudf::detail::make_offsets_child_column(sizes_itr, sizes_itr + input.size(), stream, mr); + auto map_ref = vocabulary._impl->get_map_ref(); + auto const zero_itr = thrust::make_counting_iterator(0); + + if ((input.chars_size() / (input.size() - input.null_count())) < AVG_CHAR_BYTES_THRESHOLD) { + auto const sizes_itr = + cudf::detail::make_counting_transform_iterator(0, strings_tokenizer{*d_strings, d_delimiter}); + auto [token_offsets, total_count] = + cudf::detail::make_offsets_child_column(sizes_itr, sizes_itr + input.size(), stream, mr); + + // build the output column to hold all the token ids + auto tokens = cudf::make_numeric_column( + output_type, total_count, cudf::mask_state::UNALLOCATED, stream, mr); + auto d_tokens = tokens->mutable_view().data(); + auto d_offsets = token_offsets->view().data(); + vocabulary_tokenizer_fn tokenizer{ + *d_strings, d_delimiter, map_ref, 
default_id, d_offsets, d_tokens}; + thrust::for_each_n(rmm::exec_policy(stream), zero_itr, input.size(), tokenizer); + return cudf::make_lists_column(input.size(), + std::move(token_offsets), + std::move(tokens), + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + stream, + mr); + } + + // longer strings perform better with warp-parallel approach + + auto const first_offset = (input.offset() == 0) ? 0 + : cudf::detail::get_value( + input.offsets(), input.offset(), stream); + auto const last_offset = (input.offset() == 0 && input.size() == input.offsets().size() - 1) + ? input.chars().size() + : cudf::detail::get_value( + input.offsets(), input.size() + input.offset(), stream); + auto const chars_size = last_offset - first_offset; + auto const d_input_chars = input.chars().data() + first_offset; + + rmm::device_uvector d_token_counts(input.size(), stream); + rmm::device_uvector d_marks(chars_size, stream); + + // mark position of all delimiters + thrust::for_each_n(rmm::exec_policy(stream), + zero_itr, + chars_size, + mark_delimiters_fn{d_input_chars, d_delimiter, d_marks.data()}); + + // launch warp per string to compute token counts + cudf::detail::grid_1d grid{input.size() * cudf::detail::warp_size, block_size}; + token_counts_fn<<>>( + *d_strings, d_delimiter, d_token_counts.data(), d_marks.data()); + auto [token_offsets, total_count] = cudf::detail::make_offsets_child_column( + d_token_counts.begin(), d_token_counts.end(), stream, mr); + + rmm::device_uvector d_tmp_offsets(total_count + 1, stream); + d_tmp_offsets.set_element(total_count, chars_size, stream); + thrust::copy_if(rmm::exec_policy(stream), + zero_itr, + thrust::counting_iterator(chars_size), + d_tmp_offsets.begin(), + [d_marks = d_marks.data()] __device__(auto idx) { + if (idx == 0) return true; + return d_marks[idx] && !d_marks[idx - 1]; + }); + + auto tmp_offsets = + std::make_unique(std::move(d_tmp_offsets), rmm::device_buffer{}, 0); + auto tmp_chars = cudf::column_view(input.chars().type(), chars_size, d_input_chars, nullptr, 0); + auto const tmp_input = cudf::column_view( + input.parent().type(), total_count, nullptr, nullptr, 0, 0, {tmp_offsets->view(), tmp_chars}); + + auto const d_tmp_strings = cudf::column_device_view::create(tmp_input, stream); - // build the output column to hold all the token ids auto tokens = cudf::make_numeric_column(output_type, total_count, cudf::mask_state::UNALLOCATED, stream, mr); - auto map_ref = vocabulary._impl->get_map_ref(); - auto d_offsets = token_offsets->view().data(); - auto d_tokens = tokens->mutable_view().data(); - vocabulary_tokenizer_fn tokenizer{ - *d_strings, d_delimiter, map_ref, default_id, d_offsets, d_tokens}; - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - input.size(), - tokenizer); + auto d_tokens = tokens->mutable_view().data(); + + transform_tokenizer_fn tokenizer{d_delimiter, map_ref, default_id}; + thrust::transform(rmm::exec_policy(stream), + d_tmp_strings->begin(), + d_tmp_strings->end(), + d_tokens, + tokenizer); return cudf::make_lists_column(input.size(), std::move(token_offsets), diff --git a/cpp/src/unary/cast_ops.cu b/cpp/src/unary/cast_ops.cu index 1c81f266200..8421f32056e 100644 --- a/cpp/src/unary/cast_ops.cu +++ b/cpp/src/unary/cast_ops.cu @@ -194,7 +194,7 @@ std::unique_ptr rescale(column_view input, auto const scalar = make_fixed_point_scalar(0, scale_type{scale}, stream); auto output_column = make_column_from_scalar(*scalar, input.size(), stream, mr); if (input.nullable()) { - 
auto const null_mask = copy_bitmask(input, stream, mr); + auto const null_mask = detail::copy_bitmask(input, stream, mr); output_column->set_null_mask(std::move(null_mask), input.null_count()); } return output_column; @@ -255,7 +255,7 @@ struct dispatch_unary_cast_to { std::make_unique(type, size, rmm::device_buffer{size * cudf::size_of(type), stream, mr}, - copy_bitmask(input, stream, mr), + detail::copy_bitmask(input, stream, mr), input.null_count()); mutable_column_view output_mutable = *output; @@ -285,7 +285,7 @@ struct dispatch_unary_cast_to { std::make_unique(type, size, rmm::device_buffer{size * cudf::size_of(type), stream, mr}, - copy_bitmask(input, stream, mr), + detail::copy_bitmask(input, stream, mr), input.null_count()); mutable_column_view output_mutable = *output; @@ -334,7 +334,7 @@ struct dispatch_unary_cast_to { auto output = std::make_unique(cudf::data_type{type.id(), input.type().scale()}, size, rmm::device_buffer{size * cudf::size_of(type), stream}, - copy_bitmask(input, stream, mr), + detail::copy_bitmask(input, stream, mr), input.null_count()); mutable_column_view output_mutable = *output; @@ -415,10 +415,11 @@ std::unique_ptr cast(column_view const& input, std::unique_ptr cast(column_view const& input, data_type type, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::cast(input, type, cudf::get_default_stream(), mr); + return detail::cast(input, type, stream, mr); } } // namespace cudf diff --git a/cpp/src/unary/math_ops.cu b/cpp/src/unary/math_ops.cu index d0cae81a9c8..88922362319 100644 --- a/cpp/src/unary/math_ops.cu +++ b/cpp/src/unary/math_ops.cu @@ -291,8 +291,12 @@ std::unique_ptr unary_op_with(column_view const& input, std::is_same_v>)) return std::make_unique(input, stream, mr); - auto result = cudf::make_fixed_width_column( - input.type(), input.size(), copy_bitmask(input, stream, mr), input.null_count(), stream, mr); + auto result = cudf::make_fixed_width_column(input.type(), + input.size(), + detail::copy_bitmask(input, stream, mr), + input.null_count(), + stream, + mr); auto out_view = result->mutable_view(); @@ -642,10 +646,11 @@ std::unique_ptr unary_operation(cudf::column_view const& input, std::unique_ptr unary_operation(cudf::column_view const& input, cudf::unary_operator op, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::unary_operation(input, op, cudf::get_default_stream(), mr); + return detail::unary_operation(input, op, stream, mr); } } // namespace cudf diff --git a/cpp/src/unary/nan_ops.cu b/cpp/src/unary/nan_ops.cu index 2cf83466b03..092ad3b6731 100644 --- a/cpp/src/unary/nan_ops.cu +++ b/cpp/src/unary/nan_ops.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
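The cast_ops.cu and math_ops.cu hunks above, and the nan_ops.cu/null_ops.cu hunks that follow, all apply the same two-part change: implementation code calls the stream-aware detail:: helpers (e.g. detail::copy_bitmask) instead of the public API, and each public entry point gains an explicit rmm::cuda_stream_view that it forwards verbatim rather than substituting cudf::get_default_stream(). A minimal sketch of that wrapper pattern, using a made-up frobnicate function rather than any real libcudf API:

// Sketch only: `frobnicate` is a hypothetical name standing in for cast/unary_operation/is_nan/etc.
#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>

#include <memory>

namespace cudf {
namespace detail {
// Stream-aware implementation: all kernels and allocations use `stream`.
std::unique_ptr<column> frobnicate(column_view const& input,
                                   rmm::cuda_stream_view stream,
                                   rmm::mr::device_memory_resource* mr)
{
  return std::make_unique<column>(input, stream, mr);  // placeholder body: deep copy of the input
}
}  // namespace detail

// Public API after the change: the stream is an explicit parameter forwarded as-is,
// rather than being replaced by cudf::get_default_stream() inside the wrapper.
std::unique_ptr<column> frobnicate(column_view const& input,
                                   rmm::cuda_stream_view stream,
                                   rmm::mr::device_memory_resource* mr)
{
  CUDF_FUNC_RANGE();
  return detail::frobnicate(input, stream, mr);
}
}  // namespace cudf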
@@ -91,17 +91,20 @@ std::unique_ptr<column> is_not_nan(cudf::column_view const& input,
 
 } // namespace detail
 
-std::unique_ptr<column> is_nan(cudf::column_view const& input, rmm::mr::device_memory_resource* mr)
+std::unique_ptr<column> is_nan(cudf::column_view const& input,
+                               rmm::cuda_stream_view stream,
+                               rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::is_nan(input, cudf::get_default_stream(), mr);
+  return detail::is_nan(input, stream, mr);
 }
 
 std::unique_ptr<column> is_not_nan(cudf::column_view const& input,
+                                   rmm::cuda_stream_view stream,
                                    rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::is_not_nan(input, cudf::get_default_stream(), mr);
+  return detail::is_not_nan(input, stream, mr);
 }
 
 } // namespace cudf
diff --git a/cpp/src/unary/null_ops.cu b/cpp/src/unary/null_ops.cu
index e64c68fdae6..6bdd65dd42d 100644
--- a/cpp/src/unary/null_ops.cu
+++ b/cpp/src/unary/null_ops.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -55,17 +55,20 @@ std::unique_ptr<column> is_valid(cudf::column_view const& input,
 
 } // namespace detail
 
-std::unique_ptr<column> is_null(cudf::column_view const& input, rmm::mr::device_memory_resource* mr)
+std::unique_ptr<column> is_null(cudf::column_view const& input,
+                                rmm::cuda_stream_view stream,
+                                rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::is_null(input, cudf::get_default_stream(), mr);
+  return detail::is_null(input, stream, mr);
 }
 
 std::unique_ptr<column> is_valid(cudf::column_view const& input,
+                                 rmm::cuda_stream_view stream,
                                  rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::is_valid(input, cudf::get_default_stream(), mr);
+  return detail::is_valid(input, stream, mr);
 }
 
 } // namespace cudf
diff --git a/cpp/src/utilities/traits.cpp b/cpp/src/utilities/traits.cpp
index bc10dd7845a..b0078ff85a2 100644
--- a/cpp/src/utilities/traits.cpp
+++ b/cpp/src/utilities/traits.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
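With the null and NaN predicates above now accepting a stream, callers can keep all work for a batch on a single caller-owned stream. An illustrative usage sketch, not part of this PR; the column contents and the use of the test wrapper are made up purely to keep the example short:

#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/unary.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <rmm/cuda_stream.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

int main()
{
  rmm::cuda_stream stream;  // caller-owned, non-default stream

  // Small column with two nulls.
  cudf::test::fixed_width_column_wrapper<int32_t> col({1, 2, 3, 4}, {1, 0, 1, 0});
  cudf::column_view const input = col;

  auto* mr = rmm::mr::get_current_device_resource();

  // Both predicates are enqueued on `stream`; nothing is implicitly issued on the default stream.
  auto nulls  = cudf::is_null(input, stream.view(), mr);
  auto valids = cudf::is_valid(input, stream.view(), mr);

  stream.view().synchronize();
  return 0;
}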
@@ -158,6 +158,19 @@ struct is_integral_impl { bool is_integral(data_type type) { return cudf::type_dispatcher(type, is_integral_impl{}); } +struct is_integral_not_bool_impl { + template + constexpr bool operator()() + { + return is_integral_not_bool(); + } +}; + +bool is_integral_not_bool(data_type type) +{ + return cudf::type_dispatcher(type, is_integral_not_bool_impl{}); +} + struct is_floating_point_impl { template constexpr bool operator()() diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 04939f3cd6d..1be8566fb0f 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -357,6 +357,7 @@ ConfigureTest( ConfigureTest( UTILITIES_TEST utilities_tests/type_list_tests.cpp + utilities_tests/column_debug_tests.cpp utilities_tests/column_utilities_tests.cpp utilities_tests/column_wrapper_tests.cpp utilities_tests/lists_column_wrapper_tests.cpp @@ -392,6 +393,7 @@ set_tests_properties( ConfigureTest( ITERATOR_TEST iterator/indexalator_test.cu + iterator/offsetalator_test.cu iterator/optional_iterator_test_chrono.cu iterator/optional_iterator_test_numeric.cu iterator/pair_iterator_test_chrono.cu @@ -522,7 +524,6 @@ ConfigureTest( strings/format_lists_tests.cpp strings/integers_tests.cpp strings/ipv4_tests.cpp - strings/json_tests.cpp strings/like_tests.cpp strings/pad_tests.cpp strings/repeat_strings_tests.cpp @@ -536,6 +537,10 @@ ConfigureTest( strings/urls_tests.cpp ) +# ################################################################################################## +# * json path test -------------------------------------------------------------------------------- +ConfigureTest(JSON_PATH_TEST json/json_tests.cpp) + # ################################################################################################## # * structs test ---------------------------------------------------------------------------------- ConfigureTest(STRUCTS_TEST structs/structs_column_tests.cpp structs/utilities_tests.cpp) @@ -616,27 +621,53 @@ ConfigureTest( # * bin tests ---------------------------------------------------------------------------------- ConfigureTest(LABEL_BINS_TEST labeling/label_bins_tests.cpp) +# ################################################################################################## +# * jit tests ---------------------------------------------------------------------------------- +ConfigureTest(JIT_PARSER_TEST jit/parse_ptx_function.cpp) +target_include_directories(JIT_PARSER_TEST PRIVATE "$") + # ################################################################################################## # * stream testing --------------------------------------------------------------------------------- ConfigureTest( STREAM_IDENTIFICATION_TEST identify_stream_usage/test_default_stream_identification.cu ) +ConfigureTest(STREAM_BINARYOP_TEST streams/binaryop_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_CONCATENATE_TEST streams/concatenate_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_COPYING_TEST streams/copying_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_CSVIO_TEST streams/io/csv_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_DICTIONARY_TEST streams/dictionary_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_FILLING_TEST streams/filling_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_GROUPBY_TEST streams/groupby_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_HASHING_TEST streams/hash_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_INTEROP_TEST streams/interop_test.cpp STREAM_MODE testing) 
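The ConfigureTest(... STREAM_MODE testing) registrations above and below build each listed test against the stream-verification harness. The contents of the new test sources are not shown in this diff; a plausible minimal shape for one of them (e.g. streams/unary_test.cpp, registered at the end of this hunk) passes cudf::test::get_default_stream() to every public call so that any hidden use of the real default stream is flagged. This is an assumed sketch, not the actual file:

#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <cudf_test/default_stream.hpp>

#include <cudf/unary.hpp>

#include <gtest/gtest.h>

// Hypothetical stream-mode test: exercise a public API on the test stream only.
class UnaryTest : public cudf::test::BaseFixture {};

TEST_F(UnaryTest, UnaryOperation)
{
  cudf::test::fixed_width_column_wrapper<int32_t> const column{10, 20, 30, 40};
  // When built with STREAM_MODE testing, the stream-identification machinery
  // fails the test if this call issues work on the real default stream.
  cudf::unary_operation(column, cudf::unary_operator::ABS, cudf::test::get_default_stream());
}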
+ConfigureTest(STREAM_JSONIO_TEST streams/io/json_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_LISTS_TEST streams/lists_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_NULL_MASK_TEST streams/null_mask_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing) -ConfigureTest(STREAM_DICTIONARY_TEST streams/dictionary_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_SORTING_TEST streams/sorting_test.cpp STREAM_MODE testing) ConfigureTest( - STREAM_STRINGS_TEST streams/strings/case_test.cpp streams/strings/find_test.cpp STREAM_MODE + STREAM_STRINGS_TEST + streams/strings/case_test.cpp + streams/strings/combine_test.cpp + streams/strings/contains_test.cpp + streams/strings/convert_test.cpp + streams/strings/extract_test.cpp + streams/strings/filter_test.cpp + streams/strings/find_test.cpp + streams/strings/replace_test.cpp + streams/strings/reverse_test.cpp + streams/strings/split_test.cpp + streams/strings/strings_tests.cpp + STREAM_MODE testing ) -ConfigureTest(STREAM_SORTING_TEST streams/sorting_test.cpp STREAM_MODE testing) -ConfigureTest(STREAM_TEXT_TEST streams/text/ngrams_test.cpp STREAM_MODE testing) +ConfigureTest( + STREAM_TEXT_TEST streams/text/ngrams_test.cpp streams/text/replace_test.cpp + streams/text/tokenize_test.cpp STREAM_MODE testing +) +ConfigureTest(STREAM_UNARY_TEST streams/unary_test.cpp STREAM_MODE testing) # ################################################################################################## # Install tests #################################################################################### diff --git a/cpp/tests/ast/transform_tests.cpp b/cpp/tests/ast/transform_tests.cpp index c0109a40cec..624a781c5b9 100644 --- a/cpp/tests/ast/transform_tests.cpp +++ b/cpp/tests/ast/transform_tests.cpp @@ -316,6 +316,33 @@ TEST_F(TransformTest, ImbalancedTreeArithmetic) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view(), verbosity); } +TEST_F(TransformTest, ImbalancedTreeArithmeticDeep) +{ + auto c_0 = column_wrapper{4, 5, 6}; + auto table = cudf::table_view{{c_0}}; + + auto col_ref_0 = cudf::ast::column_reference(0); + + // expression: (c0 < c0) == (c0 < (c0 + c0)) + // {false, false, false} == (c0 < {8, 10, 12}) + // {false, false, false} == {true, true, true} + // {false, false, false} + auto expression_left_subtree = + cudf::ast::operation(cudf::ast::ast_operator::LESS, col_ref_0, col_ref_0); + auto expression_right_inner_subtree = + cudf::ast::operation(cudf::ast::ast_operator::ADD, col_ref_0, col_ref_0); + auto expression_right_subtree = + cudf::ast::operation(cudf::ast::ast_operator::LESS, col_ref_0, expression_right_inner_subtree); + + auto expression_tree = cudf::ast::operation( + cudf::ast::ast_operator::EQUAL, expression_left_subtree, expression_right_subtree); + + auto result = cudf::compute_column(table, expression_tree); + auto expected = column_wrapper{false, false, false}; + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view(), verbosity); +} + TEST_F(TransformTest, MultiLevelTreeComparator) { auto c_0 = column_wrapper{3, 20, 1, 50}; diff --git a/cpp/tests/groupby/histogram_tests.cpp b/cpp/tests/groupby/histogram_tests.cpp index c5833f40cf2..612486d8e5c 100644 --- a/cpp/tests/groupby/histogram_tests.cpp +++ b/cpp/tests/groupby/histogram_tests.cpp @@ -67,6 +67,7 @@ auto groupby_histogram(cudf::column_view const& keys, auto sorted_histograms = cudf::lists::sort_lists(cudf::lists_column_view{*sorted_vals}, 
cudf::order::ASCENDING, cudf::null_order::BEFORE, + cudf::get_default_stream(), rmm::mr::get_current_device_resource()); return std::pair{std::move(sorted_keys), std::move(sorted_histograms)}; diff --git a/cpp/tests/groupby/structs_tests.cpp b/cpp/tests/groupby/structs_tests.cpp index f85fc6335f6..af6f613d344 100644 --- a/cpp/tests/groupby/structs_tests.cpp +++ b/cpp/tests/groupby/structs_tests.cpp @@ -18,6 +18,7 @@ #include #include +#include #include #include diff --git a/cpp/tests/interop/arrow_utils.hpp b/cpp/tests/interop/arrow_utils.hpp index fc8f5b37f7e..2c5f7458ce5 100644 --- a/cpp/tests/interop/arrow_utils.hpp +++ b/cpp/tests/interop/arrow_utils.hpp @@ -186,7 +186,7 @@ template auto constexpr BIT_WIDTH_RATIO = sizeof(__int128_t) / sizeof(T); std::shared_ptr arr; - arrow::Decimal128Builder decimal_builder(arrow::decimal(18, -scale), + arrow::Decimal128Builder decimal_builder(arrow::decimal(cudf::detail::max_precision(), -scale), arrow::default_memory_pool()); for (T i = 0; i < static_cast(data.size() / BIT_WIDTH_RATIO); ++i) { diff --git a/cpp/tests/interop/to_arrow_test.cpp b/cpp/tests/interop/to_arrow_test.cpp index 6bb4cdfd747..d6762e70d80 100644 --- a/cpp/tests/interop/to_arrow_test.cpp +++ b/cpp/tests/interop/to_arrow_test.cpp @@ -604,7 +604,9 @@ struct ToArrowDecimalScalarTest : public cudf::test::BaseFixture {}; TEST_F(ToArrowDecimalScalarTest, Basic) { auto const value{42}; - auto const precision{18}; // cudf will convert to the widest-precision Arrow scalar of the type + auto const precision = + cudf::detail::max_precision<__int128_t>(); // cudf will convert to the widest-precision Arrow + // scalar of the type int32_t const scale{4}; auto const cudf_scalar = diff --git a/cpp/tests/io/fst/logical_stack_test.cu b/cpp/tests/io/fst/logical_stack_test.cu index 3d6743702b8..20b8674a717 100644 --- a/cpp/tests/io/fst/logical_stack_test.cu +++ b/cpp/tests/io/fst/logical_stack_test.cu @@ -216,14 +216,15 @@ TEST_F(LogicalStackTest, GroundTruth) stream.value())); // Run algorithm - fst::sparse_stack_op_to_top_of_stack(d_stack_ops.data(), - d_stack_op_idx_span, - JSONToStackOp{}, - top_of_stack_gpu.device_ptr(), - empty_stack_symbol, - read_symbol, - string_size, - stream.value()); + fst::sparse_stack_op_to_top_of_stack( + d_stack_ops.data(), + d_stack_op_idx_span, + JSONToStackOp{}, + top_of_stack_gpu.device_ptr(), + empty_stack_symbol, + read_symbol, + string_size, + stream.value()); // Async copy results from device to host top_of_stack_gpu.device_to_host_async(stream_view); diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 7c911ac2e04..a2db2d69984 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -1422,7 +1423,9 @@ TEST_F(JsonReaderTest, JsonLongString) .lines(true) .na_rep("null"); - cudf::io::write_json(options_builder.build(), rmm::mr::get_current_device_resource()); + cudf::io::write_json(options_builder.build(), + cudf::test::get_default_stream(), + rmm::mr::get_current_device_resource()); cudf::table_view const expected = tbl_view; std::map types; @@ -1957,12 +1960,36 @@ TEST_F(JsonReaderTest, JSONLinesRecovering) // 2 -> (invalid) R"({"b":{"a":[321})" "\n" - // 3 -> c: [1] (valid) + // 3 -> c: 1.2 (valid) R"({"c":1.2})" "\n" "\n" - // 4 -> a: 123 (valid) - R"({"a":123})"; + // 4 -> a: 4 (valid) + R"({"a":4})" + "\n" + // 5 -> (invalid) + R"({"a":5)" + "\n" + // 6 -> (invalid) + R"({"a":6 )" + "\n" + // 7 -> (invalid) + R"({"b":[7 )" + "\n" + 
// 8 -> a: 8 (valid) + R"({"a":8})" + "\n" + // 9 -> (invalid) + R"({"d":{"unterminated_field_name)" + "\n" + // 10 -> (invalid) + R"({"d":{)" + "\n" + // 11 -> (invalid) + R"({"d":{"123",)" + "\n" + // 12 -> a: 12 (valid) + R"({"a":12})"; auto filepath = temp_env->get_temp_dir() + "RecoveringLines.json"; { @@ -1978,17 +2005,89 @@ TEST_F(JsonReaderTest, JSONLinesRecovering) cudf::io::table_with_metadata result = cudf::io::read_json(in_options); EXPECT_EQ(result.tbl->num_columns(), 2); - EXPECT_EQ(result.tbl->num_rows(), 5); + EXPECT_EQ(result.tbl->num_rows(), 13); EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT64); EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::FLOAT64); - std::vector a_validity{true, false, false, false, true}; - std::vector c_validity{false, false, false, true, false}; + std::vector a_validity{ + true, false, false, false, true, false, false, false, true, false, false, false, true}; + std::vector c_validity{ + false, false, false, true, false, false, false, false, false, false, false, false, false}; + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + result.tbl->get_column(0), + int64_wrapper{{-2, 0, 0, 0, 4, 0, 0, 0, 8, 0, 0, 0, 12}, a_validity.cbegin()}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + result.tbl->get_column(1), + float64_wrapper{{0.0, 0.0, 0.0, 1.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + c_validity.cbegin()}); +} + +TEST_F(JsonReaderTest, JSONLinesRecoveringIgnoreExcessChars) +{ + /** + * @brief Spark has the specific need to ignore extra characters that come after the first record + * on a JSON line + */ + std::string data = + // 0 -> a: -2 (valid) + R"({"a":-2}{})" + "\n" + // 1 -> (invalid) + R"({"b":{}should_be_invalid})" + "\n" + // 2 -> b (valid) + R"({"b":{"a":3} })" + "\n" + // 3 -> c: (valid) + R"({"c":1.2 } )" + "\n" + "\n" + // 4 -> (valid) + R"({"a":4} 123)" + "\n" + // 5 -> (valid) + R"({"a":5}//Comment after record)" + "\n" + // 6 -> (valid) + R"({"a":6} //Comment after whitespace)" + "\n" + // 7 -> (invalid) + R"({"a":5 //Invalid Comment within record})"; + + auto filepath = temp_env->get_temp_dir() + "RecoveringLinesExcessChars.json"; + { + std::ofstream outfile(filepath, std::ofstream::out); + outfile << data; + } + + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder(cudf::io::source_info{filepath}) + .lines(true) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL); + + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + + EXPECT_EQ(result.tbl->num_columns(), 3); + EXPECT_EQ(result.tbl->num_rows(), 8); + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT64); + EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::STRUCT); + EXPECT_EQ(result.tbl->get_column(2).type().id(), cudf::type_id::FLOAT64); + + std::vector a_validity{true, false, false, false, true, true, true, false}; + std::vector b_validity{false, false, true, false, false, false, false, false}; + std::vector c_validity{false, false, false, true, false, false, false, false}; + + // Child column b->a + auto b_a_col = int64_wrapper({0, 0, 3, 0, 0, 0, 0, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), - int64_wrapper{{-2, 0, 0, 0, 123}, a_validity.cbegin()}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), - float64_wrapper{{0.0, 0.0, 0.0, 1.2, 0.0}, c_validity.cbegin()}); + int64_wrapper{{-2, 0, 0, 0, 4, 5, 6, 0}, a_validity.cbegin()}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + result.tbl->get_column(1), 
cudf::test::structs_column_wrapper({b_a_col}, b_validity.cbegin())); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + result.tbl->get_column(2), + float64_wrapper{{0.0, 0.0, 0.0, 1.2, 0.0, 0.0, 0.0, 0.0}, c_validity.cbegin()}); } CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/json_writer.cpp b/cpp/tests/io/json_writer.cpp index 3a4074c02ad..a85a696565b 100644 --- a/cpp/tests/io/json_writer.cpp +++ b/cpp/tests/io/json_writer.cpp @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -49,14 +50,16 @@ TEST_F(JsonWriterTest, EmptyInput) .build(); // Empty columns in table - cudf::io::write_json(out_options, rmm::mr::get_current_device_resource()); + cudf::io::write_json( + out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); std::string const expected = R"([])"; EXPECT_EQ(expected, std::string(out_buffer.data(), out_buffer.size())); // Empty columns in table - JSON Lines out_buffer.clear(); out_options.enable_lines(true); - cudf::io::write_json(out_options, rmm::mr::get_current_device_resource()); + cudf::io::write_json( + out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); std::string const expected_lines = "\n"; EXPECT_EQ(expected_lines, std::string(out_buffer.data(), out_buffer.size())); @@ -64,7 +67,8 @@ TEST_F(JsonWriterTest, EmptyInput) cudf::table_view tbl_view2{}; out_options.set_table(tbl_view2); out_buffer.clear(); - cudf::io::write_json(out_options, rmm::mr::get_current_device_resource()); + cudf::io::write_json( + out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); EXPECT_EQ(expected_lines, std::string(out_buffer.data(), out_buffer.size())); } @@ -89,17 +93,22 @@ TEST_F(JsonWriterTest, ErrorCases) .build(); // not enough column names - EXPECT_THROW(cudf::io::write_json(out_options, rmm::mr::get_current_device_resource()), - cudf::logic_error); + EXPECT_THROW( + cudf::io::write_json( + out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()), + cudf::logic_error); mt.schema_info.emplace_back("int16"); out_options.set_metadata(mt); - EXPECT_NO_THROW(cudf::io::write_json(out_options, rmm::mr::get_current_device_resource())); + EXPECT_NO_THROW(cudf::io::write_json( + out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource())); // chunk_rows must be at least 8 out_options.set_rows_per_chunk(0); - EXPECT_THROW(cudf::io::write_json(out_options, rmm::mr::get_current_device_resource()), - cudf::logic_error); + EXPECT_THROW( + cudf::io::write_json( + out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()), + cudf::logic_error); } TEST_F(JsonWriterTest, PlainTable) @@ -121,7 +130,9 @@ TEST_F(JsonWriterTest, PlainTable) .lines(false) .na_rep("null"); - cudf::io::write_json(options_builder.build(), rmm::mr::get_current_device_resource()); + cudf::io::write_json(options_builder.build(), + cudf::test::get_default_stream(), + rmm::mr::get_current_device_resource()); std::string const expected = R"([{"col1":"a","col2":"d","int":1,"float":1.5,"int16":null},{"col1":"b","col2":"e","int":2,"float":2.5,"int16":2},{"col1":"c","col2":"f","int":3,"float":3.5,"int16":null}])"; @@ -151,7 +162,9 @@ TEST_F(JsonWriterTest, SimpleNested) .lines(true) .na_rep("null"); - cudf::io::write_json(options_builder.build(), rmm::mr::get_current_device_resource()); + cudf::io::write_json(options_builder.build(), + cudf::test::get_default_stream(), + rmm::mr::get_current_device_resource()); std::string const expected = 
R"({"a":1,"b":2,"c":{"d":3},"f":5.5,"g":[1]} {"a":6,"b":7,"c":{"d":8},"f":10.5} {"a":1,"b":2,"c":{"e":4},"f":5.5,"g":[2,null]} @@ -183,7 +196,9 @@ TEST_F(JsonWriterTest, MixedNested) .lines(false) .na_rep("null"); - cudf::io::write_json(options_builder.build(), rmm::mr::get_current_device_resource()); + cudf::io::write_json(options_builder.build(), + cudf::test::get_default_stream(), + rmm::mr::get_current_device_resource()); std::string const expected = R"([{"a":1,"b":2,"c":{"d":[3]},"f":5.5,"g":[{"h":1}]},)" R"({"a":6,"b":7,"c":{"d":[8]},"f":10.5},)" @@ -216,7 +231,8 @@ TEST_F(JsonWriterTest, WriteReadNested) .na_rep("null") .build(); - cudf::io::write_json(out_options, rmm::mr::get_current_device_resource()); + cudf::io::write_json( + out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); std::string const expected = R"({"a":1,"b":2,"c":{"d":3},"f":5.5,"g":[1]} {"a":6,"b":7,"c":{"d":8},"f":10.5} {"a":1,"b":2,"c":{"e":4},"f":5.5,"g":[2,null]} @@ -291,7 +307,8 @@ TEST_F(JsonWriterTest, WriteReadNested) mt.schema_info[2].children.clear(); out_options.set_metadata(mt); out_buffer.clear(); - cudf::io::write_json(out_options, rmm::mr::get_current_device_resource()); + cudf::io::write_json( + out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); in_options = cudf::io::json_reader_options::builder( cudf::io::source_info{out_buffer.data(), out_buffer.size()}) @@ -314,7 +331,8 @@ TEST_F(JsonWriterTest, WriteReadNested) // without column names out_options.set_metadata(cudf::io::table_metadata{}); out_buffer.clear(); - cudf::io::write_json(out_options, rmm::mr::get_current_device_resource()); + cudf::io::write_json( + out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); in_options = cudf::io::json_reader_options::builder( cudf::io::source_info{out_buffer.data(), out_buffer.size()}) .lines(true) @@ -352,7 +370,8 @@ TEST_F(JsonWriterTest, SpecialChars) .na_rep("null") .build(); - cudf::io::write_json(out_options, rmm::mr::get_current_device_resource()); + cudf::io::write_json( + out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); std::string const expected = R"({"\"a\"":1,"'b'":"abcd"} {"\"a\"":6,"'b'":"b\b\f\n\r\t"} {"\"a\"":1,"'b'":"\"c\""} @@ -385,7 +404,9 @@ TEST_F(JsonWriterTest, NullList) .lines(true) .na_rep("null"); - cudf::io::write_json(options_builder.build(), rmm::mr::get_current_device_resource()); + cudf::io::write_json(options_builder.build(), + cudf::test::get_default_stream(), + rmm::mr::get_current_device_resource()); std::string const expected = R"({"a":[null],"b":[[1,2,3],[null],[null,null,null],[4,null,5]]} {"a":[2,null,null,3],"b":null} {"a":[null,null,4],"b":[[2,null],null]} @@ -424,7 +445,9 @@ TEST_F(JsonWriterTest, ChunkedNested) .na_rep("null") .rows_per_chunk(8); - cudf::io::write_json(options_builder.build(), rmm::mr::get_current_device_resource()); + cudf::io::write_json(options_builder.build(), + cudf::test::get_default_stream(), + rmm::mr::get_current_device_resource()); std::string const expected = R"({"a":1,"b":-2,"c":{},"e":[{"f":1}]} {"a":2,"b":-2,"c":{}} @@ -480,7 +503,9 @@ TEST_F(JsonWriterTest, StructAllNullCombinations) .lines(true) .na_rep("null"); - cudf::io::write_json(options_builder.build(), rmm::mr::get_current_device_resource()); + cudf::io::write_json(options_builder.build(), + cudf::test::get_default_stream(), + rmm::mr::get_current_device_resource()); std::string const expected = R"({} {"e":1} {"d":1} @@ -542,7 
+567,9 @@ TEST_F(JsonWriterTest, Unicode) .lines(true) .na_rep("null"); - cudf::io::write_json(options_builder.build(), rmm::mr::get_current_device_resource()); + cudf::io::write_json(options_builder.build(), + cudf::test::get_default_stream(), + rmm::mr::get_current_device_resource()); std::string const expected = R"({"col1":"\"\\\/\b\f\n\r\t","col2":"C\u10ae\u226a\u31f3\u434f\u51f9\u6ca6\u738b\u8fbf\u9fb8\ua057\ubbdc\uc2a4\ud3f6\ue4fe\ufd20","int16":null} diff --git a/cpp/tests/io/nested_json_test.cpp b/cpp/tests/io/nested_json_test.cpp index 00d657108b8..b0ffbe3d154 100644 --- a/cpp/tests/io/nested_json_test.cpp +++ b/cpp/tests/io/nested_json_test.cpp @@ -285,6 +285,123 @@ TEST_F(JsonTest, StackContextRecovering) CUDF_TEST_EXPECT_VECTOR_EQUAL(golden_stack_context, stack_context, stack_context.size()); } +TEST_F(JsonTest, StackContextRecoveringFuzz) +{ + // Type used to represent the atomic symbol type used within the finite-state machine + using SymbolT = char; + using StackSymbolT = char; + + std::random_device rd; + std::mt19937 gen(42); + std::uniform_int_distribution distribution(0, 4); + constexpr std::size_t input_length = 1024 * 1024; + std::string input{}; + input.reserve(input_length); + + bool inside_quotes = false; + std::stack host_stack{}; + for (std::size_t i = 0; i < input_length; ++i) { + bool is_ok = true; + char current{}; + do { + int rand_char = distribution(gen); + is_ok = true; + switch (rand_char) { + case 0: current = '{'; break; + case 1: current = '['; break; + case 2: current = '}'; break; + case 3: current = '"'; break; + case 4: current = '\n'; break; + } + switch (current) { + case '"': inside_quotes = !inside_quotes; break; + case '{': + if (!inside_quotes) { host_stack.push('{'); } + break; + case '[': + if (!inside_quotes) { host_stack.push('['); } + break; + case '}': + if (!inside_quotes) { + if (host_stack.size() > 0) { + // Get the proper 'pop' stack symbol + current = (host_stack.top() == '{' ? 
'}' : ']'); + host_stack.pop(); + } else + is_ok = false; + } + break; + case '\n': + // Increase chance to have longer lines + if (distribution(gen) == 0) { + is_ok = false; + break; + } else { + host_stack = {}; + inside_quotes = false; + break; + } + } + } while (!is_ok); + input += current; + } + + std::string expected_stack_context{}; + expected_stack_context.reserve(input_length); + inside_quotes = false; + host_stack = std::stack{}; + for (auto const current : input) { + // Write the stack context for the current input symbol + if (host_stack.empty()) { + expected_stack_context += '_'; + } else { + expected_stack_context += host_stack.top(); + } + + switch (current) { + case '"': inside_quotes = !inside_quotes; break; + case '{': + if (!inside_quotes) { host_stack.push('{'); } + break; + case '[': + if (!inside_quotes) { host_stack.push('['); } + break; + case '}': + if (!inside_quotes && host_stack.size() > 0) { host_stack.pop(); } + break; + case ']': + if (!inside_quotes && host_stack.size() > 0) { host_stack.pop(); } + break; + case '\n': + host_stack = {}; + inside_quotes = false; + break; + } + } + + // Prepare cuda stream for data transfers & kernels + auto const stream = cudf::get_default_stream(); + + // Prepare input & output buffers + cudf::string_scalar const d_scalar(input, true, stream); + auto const d_input = + cudf::device_span{d_scalar.data(), static_cast(d_scalar.size())}; + cudf::detail::hostdevice_vector stack_context(input.size(), stream); + + // Run algorithm + constexpr auto stack_behavior = cuio_json::stack_behavior_t::ResetOnDelimiter; + cuio_json::detail::get_stack_context(d_input, stack_context.device_ptr(), stack_behavior, stream); + + // Copy back the results + stack_context.device_to_host_async(stream); + + // Make sure we copied back the stack context + stream.synchronize(); + + ASSERT_EQ(expected_stack_context.size(), stack_context.size()); + CUDF_TEST_EXPECT_VECTOR_EQUAL(expected_stack_context, stack_context, stack_context.size()); +} + TEST_F(JsonTest, TokenStream) { using cuio_json::PdaTokenT; @@ -543,7 +660,7 @@ TEST_F(JsonTest, RecoveringTokenStream) { // Test input. 
Inline comments used to indicate character indexes // 012345678 <= line 0 - std::string const input = R"({"a":-2},)" + std::string const input = R"({"a":2 {})" // 9 "\n" // 01234 <= line 1 @@ -569,23 +686,12 @@ TEST_F(JsonTest, RecoveringTokenStream) // Line 0 (invalid) {0, token_t::StructBegin}, {0, token_t::StructEnd}, - // Line 1 (valid) - {10, token_t::StructBegin}, - {11, token_t::StructMemberBegin}, - {11, token_t::FieldNameBegin}, - {13, token_t::FieldNameEnd}, - // Line 2 (valid) - {16, token_t::StructBegin}, - {17, token_t::StructMemberBegin}, - {17, token_t::FieldNameBegin}, - {19, token_t::FieldNameEnd}, - {21, token_t::StructBegin}, - {22, token_t::StructMemberBegin}, - {22, token_t::FieldNameBegin}, - {24, token_t::FieldNameEnd}, - {26, token_t::ListBegin}, - {27, token_t::ValueBegin}, - {30, token_t::ValueEnd}, + // Line 1 (invalid) + {0, token_t::StructBegin}, + {0, token_t::StructEnd}, + // Line 2 (invalid) + {0, token_t::StructBegin}, + {0, token_t::StructEnd}, // Line 3 (valid) {31, token_t::StructBegin}, {32, token_t::StructMemberBegin}, diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index 890ef914713..dca3886db14 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -1054,8 +1054,12 @@ TEST_F(OrcStatisticsTest, Basic) EXPECT_EQ(*ts4.maximum, 3); EXPECT_EQ(*ts4.minimum_utc, -4); EXPECT_EQ(*ts4.maximum_utc, 3); - EXPECT_EQ(*ts4.minimum_nanos, 999994); - EXPECT_EQ(*ts4.maximum_nanos, 6); + // nanosecond precision can't be included until we write a writer version that includes ORC-135 + // see https://github.com/rapidsai/cudf/issues/14325 + // EXPECT_EQ(*ts4.minimum_nanos, 999994); + EXPECT_FALSE(ts4.minimum_nanos.has_value()); + // EXPECT_EQ(*ts4.maximum_nanos, 6); + EXPECT_FALSE(ts4.maximum_nanos.has_value()); auto& s5 = stats[5]; EXPECT_EQ(*s5.number_of_values, 4ul); @@ -1065,8 +1069,12 @@ TEST_F(OrcStatisticsTest, Basic) EXPECT_EQ(*ts5.maximum, 3000); EXPECT_EQ(*ts5.minimum_utc, -3001); EXPECT_EQ(*ts5.maximum_utc, 3000); - EXPECT_EQ(*ts5.minimum_nanos, 994000); - EXPECT_EQ(*ts5.maximum_nanos, 6000); + // nanosecond precision can't be included until we write a writer version that includes ORC-135 + // see https://github.com/rapidsai/cudf/issues/14325 + // EXPECT_EQ(*ts5.minimum_nanos, 994000); + EXPECT_FALSE(ts5.minimum_nanos.has_value()); + // EXPECT_EQ(*ts5.maximum_nanos, 6000); + EXPECT_FALSE(ts5.maximum_nanos.has_value()); auto& s6 = stats[6]; EXPECT_EQ(*s6.number_of_values, 4ul); @@ -1299,20 +1307,16 @@ TEST_F(OrcStatisticsTest, Overflow) TEST_F(OrcStatisticsTest, HasNull) { - // This test can now be implemented with libcudf; keeping the pyorc version to keep the test + // This test can now be implemented with libcudf; keeping the pandas version to keep the test // inputs diversified // Method to create file: - // >>> import pyorc - // >>> output = open("./temp.orc", "wb") - // >>> writer = pyorc.Writer(output, pyorc.Struct(a=pyorc.BigInt(), b=pyorc.BigInt())) - // >>> writer.write((1, 3)) - // >>> writer.write((2, 4)) - // >>> writer.write((None, 5)) - // >>> writer.close() + // >>> import pandas as pd + // >>> df = pd.DataFrame({'a':pd.Series([1, 2, None], dtype="Int64"), 'b':[3, 4, 5]}) + // >>> df.to_orc("temp.orc") // // Contents of file: // >>> import pyarrow.orc as po - // >>> po.ORCFile('new.orc').read() + // >>> po.ORCFile('temp.orc').read() // pyarrow.Table // a: int64 // b: int64 @@ -1934,4 +1938,34 @@ TEST_F(OrcStatisticsTest, AllNulls) check_all_null_stats(stats.file_stats[3]); } 
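The OrcStatisticsTest changes above reflect that the writer does not yet record nanosecond bounds (pending ORC-135 support), so minimum_nanos/maximum_nanos now read back as empty optionals. A reader-side sketch of consuming those fields defensively; the header path and struct name are assumptions based on how the statistics types are used in this test file:

#include <cudf/io/orc_metadata.hpp>

#include <iostream>

// Print timestamp bounds from parsed ORC column statistics, tolerating writers
// that do not record nanosecond precision (as cudf currently does not, until
// ORC-135 support lands). `ts` would typically come from parsed file statistics.
void print_timestamp_bounds(cudf::io::timestamp_statistics const& ts)
{
  if (ts.minimum_utc) { std::cout << "min (UTC): " << *ts.minimum_utc << "\n"; }
  if (ts.maximum_utc) { std::cout << "max (UTC): " << *ts.maximum_utc << "\n"; }

  // Nanosecond adjustments are optional; guard before dereferencing.
  if (ts.minimum_nanos) {
    std::cout << "min nanos: " << *ts.minimum_nanos << "\n";
  } else {
    std::cout << "min nanos not recorded by this writer\n";
  }
  if (ts.maximum_nanos) { std::cout << "max nanos: " << *ts.maximum_nanos << "\n"; }
}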
+TEST_F(OrcWriterTest, UnorderedDictionary) +{ + std::vector strings{ + "BBBB", "BBBB", "CCCC", "BBBB", "CCCC", "EEEE", "CCCC", "AAAA", "DDDD", "EEEE"}; + str_col col(strings.begin(), strings.end()); + + table_view expected({col}); + + std::vector out_buffer_sorted; + cudf::io::orc_writer_options out_opts_sorted = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{&out_buffer_sorted}, expected); + cudf::io::write_orc(out_opts_sorted); + + cudf::io::orc_reader_options in_opts_sorted = cudf::io::orc_reader_options::builder( + cudf::io::source_info{out_buffer_sorted.data(), out_buffer_sorted.size()}); + auto const from_sorted = cudf::io::read_orc(in_opts_sorted).tbl; + + std::vector out_buffer_unsorted; + cudf::io::orc_writer_options out_opts_unsorted = + cudf::io::orc_writer_options::builder(cudf::io::sink_info{&out_buffer_unsorted}, expected) + .enable_dictionary_sort(false); + cudf::io::write_orc(out_opts_unsorted); + + cudf::io::orc_reader_options in_opts_unsorted = cudf::io::orc_reader_options::builder( + cudf::io::source_info{out_buffer_unsorted.data(), out_buffer_unsorted.size()}); + auto const from_unsorted = cudf::io::read_orc(in_opts_unsorted).tbl; + + CUDF_TEST_EXPECT_TABLES_EQUAL(*from_sorted, *from_unsorted); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 81e0e12eeb9..fece83f891b 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -200,29 +200,30 @@ std::unique_ptr make_parquet_list_list_col( // of the file to populate the FileMetaData pointed to by file_meta_data. // throws cudf::logic_error if the file or metadata is invalid. void read_footer(std::unique_ptr const& source, - cudf::io::parquet::FileMetaData* file_meta_data) + cudf::io::parquet::detail::FileMetaData* file_meta_data) { - constexpr auto header_len = sizeof(cudf::io::parquet::file_header_s); - constexpr auto ender_len = sizeof(cudf::io::parquet::file_ender_s); + constexpr auto header_len = sizeof(cudf::io::parquet::detail::file_header_s); + constexpr auto ender_len = sizeof(cudf::io::parquet::detail::file_ender_s); auto const len = source->size(); auto const header_buffer = source->host_read(0, header_len); auto const header = - reinterpret_cast(header_buffer->data()); + reinterpret_cast(header_buffer->data()); auto const ender_buffer = source->host_read(len - ender_len, ender_len); - auto const ender = reinterpret_cast(ender_buffer->data()); + auto const ender = + reinterpret_cast(ender_buffer->data()); // checks for valid header, footer, and file length ASSERT_GT(len, header_len + ender_len); - ASSERT_TRUE(header->magic == cudf::io::parquet::parquet_magic && - ender->magic == cudf::io::parquet::parquet_magic); + ASSERT_TRUE(header->magic == cudf::io::parquet::detail::parquet_magic && + ender->magic == cudf::io::parquet::detail::parquet_magic); ASSERT_TRUE(ender->footer_len != 0 && ender->footer_len <= (len - header_len - ender_len)); // parquet files end with 4-byte footer_length and 4-byte magic == "PAR1" // seek backwards from the end of the file (footer_length + 8 bytes of ender) auto const footer_buffer = source->host_read(len - ender->footer_len - ender_len, ender->footer_len); - cudf::io::parquet::CompactProtocolReader cp(footer_buffer->data(), ender->footer_len); + cudf::io::parquet::detail::CompactProtocolReader cp(footer_buffer->data(), ender->footer_len); // returns true on success bool res = cp.read(file_meta_data); @@ -233,14 +234,14 @@ void read_footer(std::unique_ptr const& source, // this assumes 
the data is uncompressed. // throws cudf::logic_error if the page_loc data is invalid. int read_dict_bits(std::unique_ptr const& source, - cudf::io::parquet::PageLocation const& page_loc) + cudf::io::parquet::detail::PageLocation const& page_loc) { CUDF_EXPECTS(page_loc.offset > 0, "Cannot find page header"); CUDF_EXPECTS(page_loc.compressed_page_size > 0, "Invalid page header length"); - cudf::io::parquet::PageHeader page_hdr; + cudf::io::parquet::detail::PageHeader page_hdr; auto const page_buf = source->host_read(page_loc.offset, page_loc.compressed_page_size); - cudf::io::parquet::CompactProtocolReader cp(page_buf->data(), page_buf->size()); + cudf::io::parquet::detail::CompactProtocolReader cp(page_buf->data(), page_buf->size()); bool res = cp.read(&page_hdr); CUDF_EXPECTS(res, "Cannot parse page header"); @@ -252,15 +253,16 @@ int read_dict_bits(std::unique_ptr const& source, // read column index from datasource at location indicated by chunk, // parse and return as a ColumnIndex struct. // throws cudf::logic_error if the chunk data is invalid. -cudf::io::parquet::ColumnIndex read_column_index( - std::unique_ptr const& source, cudf::io::parquet::ColumnChunk const& chunk) +cudf::io::parquet::detail::ColumnIndex read_column_index( + std::unique_ptr const& source, + cudf::io::parquet::detail::ColumnChunk const& chunk) { CUDF_EXPECTS(chunk.column_index_offset > 0, "Cannot find column index"); CUDF_EXPECTS(chunk.column_index_length > 0, "Invalid column index length"); - cudf::io::parquet::ColumnIndex colidx; + cudf::io::parquet::detail::ColumnIndex colidx; auto const ci_buf = source->host_read(chunk.column_index_offset, chunk.column_index_length); - cudf::io::parquet::CompactProtocolReader cp(ci_buf->data(), ci_buf->size()); + cudf::io::parquet::detail::CompactProtocolReader cp(ci_buf->data(), ci_buf->size()); bool res = cp.read(&colidx); CUDF_EXPECTS(res, "Cannot parse column index"); return colidx; @@ -269,22 +271,24 @@ cudf::io::parquet::ColumnIndex read_column_index( // read offset index from datasource at location indicated by chunk, // parse and return as an OffsetIndex struct. // throws cudf::logic_error if the chunk data is invalid. 
-cudf::io::parquet::OffsetIndex read_offset_index( - std::unique_ptr const& source, cudf::io::parquet::ColumnChunk const& chunk) +cudf::io::parquet::detail::OffsetIndex read_offset_index( + std::unique_ptr const& source, + cudf::io::parquet::detail::ColumnChunk const& chunk) { CUDF_EXPECTS(chunk.offset_index_offset > 0, "Cannot find offset index"); CUDF_EXPECTS(chunk.offset_index_length > 0, "Invalid offset index length"); - cudf::io::parquet::OffsetIndex offidx; + cudf::io::parquet::detail::OffsetIndex offidx; auto const oi_buf = source->host_read(chunk.offset_index_offset, chunk.offset_index_length); - cudf::io::parquet::CompactProtocolReader cp(oi_buf->data(), oi_buf->size()); + cudf::io::parquet::detail::CompactProtocolReader cp(oi_buf->data(), oi_buf->size()); bool res = cp.read(&offidx); CUDF_EXPECTS(res, "Cannot parse offset index"); return offidx; } // Return as a Statistics from the column chunk -cudf::io::parquet::Statistics const& get_statistics(cudf::io::parquet::ColumnChunk const& chunk) +cudf::io::parquet::detail::Statistics const& get_statistics( + cudf::io::parquet::detail::ColumnChunk const& chunk) { return chunk.meta_data.statistics; } @@ -292,15 +296,16 @@ cudf::io::parquet::Statistics const& get_statistics(cudf::io::parquet::ColumnChu // read page header from datasource at location indicated by page_loc, // parse and return as a PageHeader struct. // throws cudf::logic_error if the page_loc data is invalid. -cudf::io::parquet::PageHeader read_page_header(std::unique_ptr const& source, - cudf::io::parquet::PageLocation const& page_loc) +cudf::io::parquet::detail::PageHeader read_page_header( + std::unique_ptr const& source, + cudf::io::parquet::detail::PageLocation const& page_loc) { CUDF_EXPECTS(page_loc.offset > 0, "Cannot find page header"); CUDF_EXPECTS(page_loc.compressed_page_size > 0, "Invalid page header length"); - cudf::io::parquet::PageHeader page_hdr; + cudf::io::parquet::detail::PageHeader page_hdr; auto const page_buf = source->host_read(page_loc.offset, page_loc.compressed_page_size); - cudf::io::parquet::CompactProtocolReader cp(page_buf->data(), page_buf->size()); + cudf::io::parquet::detail::CompactProtocolReader cp(page_buf->data(), page_buf->size()); bool res = cp.read(&page_hdr); CUDF_EXPECTS(res, "Cannot parse page header"); return page_hdr; @@ -348,6 +353,9 @@ struct ParquetWriterSchemaTest : public ParquetWriterTest { template struct ParquetReaderSourceTest : public ParquetReaderTest {}; +template +struct ParquetWriterDeltaTest : public ParquetWriterTest {}; + // Declare typed test cases // TODO: Replace with `NumericTypes` when unsigned support is added. Issue #5352 using SupportedTypes = cudf::test::Types; @@ -379,7 +387,6 @@ TYPED_TEST_SUITE(ParquetChunkedWriterNumericTypeTest, SupportedTypes); class ParquetSizedTest : public ::cudf::test::BaseFixtureWithParam {}; // test the allowed bit widths for dictionary encoding -// values chosen to trigger 1, 2, 3, 4, 5, 6, 8, 10, 12, 16, 20, and 24 bit dictionaries INSTANTIATE_TEST_SUITE_P(ParquetDictionaryTest, ParquetSizedTest, testing::Range(1, 25), @@ -3686,7 +3693,7 @@ TEST_F(ParquetWriterTest, CheckPageRows) // check first page header and make sure it has only page_rows values auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); ASSERT_GT(fmd.row_groups.size(), 0); @@ -3697,7 +3704,7 @@ TEST_F(ParquetWriterTest, CheckPageRows) // read first data page header. 
sizeof(PageHeader) is not exact, but the thrift encoded // version should be smaller than size of the struct. auto const ph = read_page_header( - source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::PageHeader), 0}); + source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::detail::PageHeader), 0}); EXPECT_EQ(ph.data_page_header.num_values, page_rows); } @@ -3722,7 +3729,7 @@ TEST_F(ParquetWriterTest, CheckPageRowsAdjusted) // check first page header and make sure it has only page_rows values auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); ASSERT_GT(fmd.row_groups.size(), 0); @@ -3733,7 +3740,7 @@ TEST_F(ParquetWriterTest, CheckPageRowsAdjusted) // read first data page header. sizeof(PageHeader) is not exact, but the thrift encoded // version should be smaller than size of the struct. auto const ph = read_page_header( - source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::PageHeader), 0}); + source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::detail::PageHeader), 0}); EXPECT_LE(ph.data_page_header.num_values, rows_per_page); } @@ -3759,7 +3766,7 @@ TEST_F(ParquetWriterTest, CheckPageRowsTooSmall) // check that file is written correctly when rows/page < fragment size auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); ASSERT_TRUE(fmd.row_groups.size() > 0); @@ -3770,7 +3777,7 @@ TEST_F(ParquetWriterTest, CheckPageRowsTooSmall) // read first data page header. sizeof(PageHeader) is not exact, but the thrift encoded // version should be smaller than size of the struct. auto const ph = read_page_header( - source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::PageHeader), 0}); + source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::detail::PageHeader), 0}); // there should be only one page since the fragment size is larger than rows_per_page EXPECT_EQ(ph.data_page_header.num_values, num_rows); @@ -3798,7 +3805,7 @@ TEST_F(ParquetWriterTest, Decimal128Stats) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4031,7 +4038,7 @@ TYPED_TEST(ParquetWriterComparableTypeTest, ThreeColumnSorted) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); ASSERT_GT(fmd.row_groups.size(), 0); @@ -4041,10 +4048,10 @@ TYPED_TEST(ParquetWriterComparableTypeTest, ThreeColumnSorted) // now check that the boundary order for chunk 1 is ascending, // chunk 2 is descending, and chunk 3 is unordered - cudf::io::parquet::BoundaryOrder expected_orders[] = { - cudf::io::parquet::BoundaryOrder::ASCENDING, - cudf::io::parquet::BoundaryOrder::DESCENDING, - cudf::io::parquet::BoundaryOrder::UNORDERED}; + cudf::io::parquet::detail::BoundaryOrder expected_orders[] = { + cudf::io::parquet::detail::BoundaryOrder::ASCENDING, + cudf::io::parquet::detail::BoundaryOrder::DESCENDING, + cudf::io::parquet::detail::BoundaryOrder::UNORDERED}; for (std::size_t i = 0; i < columns.size(); i++) { auto const ci = read_column_index(source, columns[i]); @@ -4067,15 +4074,16 @@ int32_t compare(T& v1, T& v2) // 1 if v1 > v2. 
int32_t compare_binary(std::vector const& v1, std::vector const& v2, - cudf::io::parquet::Type ptype, - cudf::io::parquet::ConvertedType ctype) + cudf::io::parquet::detail::Type ptype, + thrust::optional const& ctype) { + auto ctype_val = ctype.value_or(cudf::io::parquet::detail::UNKNOWN); switch (ptype) { - case cudf::io::parquet::INT32: - switch (ctype) { - case cudf::io::parquet::UINT_8: - case cudf::io::parquet::UINT_16: - case cudf::io::parquet::UINT_32: + case cudf::io::parquet::detail::INT32: + switch (ctype_val) { + case cudf::io::parquet::detail::UINT_8: + case cudf::io::parquet::detail::UINT_16: + case cudf::io::parquet::detail::UINT_32: return compare(*(reinterpret_cast(v1.data())), *(reinterpret_cast(v2.data()))); default: @@ -4083,23 +4091,23 @@ int32_t compare_binary(std::vector const& v1, *(reinterpret_cast(v2.data()))); } - case cudf::io::parquet::INT64: - if (ctype == cudf::io::parquet::UINT_64) { + case cudf::io::parquet::detail::INT64: + if (ctype_val == cudf::io::parquet::detail::UINT_64) { return compare(*(reinterpret_cast(v1.data())), *(reinterpret_cast(v2.data()))); } return compare(*(reinterpret_cast(v1.data())), *(reinterpret_cast(v2.data()))); - case cudf::io::parquet::FLOAT: + case cudf::io::parquet::detail::FLOAT: return compare(*(reinterpret_cast(v1.data())), *(reinterpret_cast(v2.data()))); - case cudf::io::parquet::DOUBLE: + case cudf::io::parquet::detail::DOUBLE: return compare(*(reinterpret_cast(v1.data())), *(reinterpret_cast(v2.data()))); - case cudf::io::parquet::BYTE_ARRAY: { + case cudf::io::parquet::detail::BYTE_ARRAY: { int32_t v1sz = v1.size(); int32_t v2sz = v2.size(); int32_t ret = memcmp(v1.data(), v2.data(), std::min(v1sz, v2sz)); @@ -4142,7 +4150,7 @@ TEST_P(ParquetV2Test, LargeColumnIndex) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4156,18 +4164,20 @@ TEST_P(ParquetV2Test, LargeColumnIndex) // check trunc(page.min) <= stats.min && trun(page.max) >= stats.max auto const ptype = fmd.schema[c + 1].type; auto const ctype = fmd.schema[c + 1].converted_type; - EXPECT_TRUE(compare_binary(ci.min_values[0], stats.min_value, ptype, ctype) <= 0); - EXPECT_TRUE(compare_binary(ci.max_values[0], stats.max_value, ptype, ctype) >= 0); + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + EXPECT_TRUE(compare_binary(ci.min_values[0], stats.min_value.value(), ptype, ctype) <= 0); + EXPECT_TRUE(compare_binary(ci.max_values[0], stats.max_value.value(), ptype, ctype) >= 0); } } } TEST_P(ParquetV2Test, CheckColumnOffsetIndex) { - constexpr auto num_rows = 100000; - auto const is_v2 = GetParam(); - auto const expected_hdr_type = - is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + constexpr auto num_rows = 100000; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? 
cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; // fixed length strings auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { @@ -4210,7 +4220,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndex) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4237,6 +4247,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndex) auto const ci = read_column_index(source, chunk); auto const stats = get_statistics(chunk); + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + // schema indexing starts at 1 auto const ptype = fmd.schema[c + 1].type; auto const ctype = fmd.schema[c + 1].converted_type; @@ -4245,20 +4258,20 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndex) EXPECT_FALSE(ci.null_pages[p]); // null_counts should always be 0 EXPECT_EQ(ci.null_counts[p], 0); - EXPECT_TRUE(compare_binary(stats.min_value, ci.min_values[p], ptype, ctype) <= 0); + EXPECT_TRUE(compare_binary(stats.min_value.value(), ci.min_values[p], ptype, ctype) <= 0); } for (size_t p = 0; p < ci.max_values.size(); p++) - EXPECT_TRUE(compare_binary(stats.max_value, ci.max_values[p], ptype, ctype) >= 0); + EXPECT_TRUE(compare_binary(stats.max_value.value(), ci.max_values[p], ptype, ctype) >= 0); } } } TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls) { - constexpr auto num_rows = 100000; - auto const is_v2 = GetParam(); - auto const expected_hdr_type = - is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + constexpr auto num_rows = 100000; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; // fixed length strings auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { @@ -4311,7 +4324,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4339,7 +4352,10 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls) auto const stats = get_statistics(chunk); // should be half nulls, except no nulls in column 0 - EXPECT_EQ(stats.null_count, c == 0 ? 0 : num_rows / 2); + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + ASSERT_TRUE(stats.null_count.has_value()); + EXPECT_EQ(stats.null_count.value(), c == 0 ? 
0 : num_rows / 2); // schema indexing starts at 1 auto const ptype = fmd.schema[c + 1].type; @@ -4351,10 +4367,10 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls) } else { EXPECT_EQ(ci.null_counts[p], 0); } - EXPECT_TRUE(compare_binary(stats.min_value, ci.min_values[p], ptype, ctype) <= 0); + EXPECT_TRUE(compare_binary(stats.min_value.value(), ci.min_values[p], ptype, ctype) <= 0); } for (size_t p = 0; p < ci.max_values.size(); p++) { - EXPECT_TRUE(compare_binary(stats.max_value, ci.max_values[p], ptype, ctype) >= 0); + EXPECT_TRUE(compare_binary(stats.max_value.value(), ci.max_values[p], ptype, ctype) >= 0); } } } @@ -4362,10 +4378,10 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls) TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn) { - constexpr auto num_rows = 100000; - auto const is_v2 = GetParam(); - auto const expected_hdr_type = - is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + constexpr auto num_rows = 100000; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; // fixed length strings auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { @@ -4403,7 +4419,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4431,7 +4447,12 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn) auto const stats = get_statistics(chunk); // there should be no nulls except column 1 which is all nulls - EXPECT_EQ(stats.null_count, c == 1 ? num_rows : 0); + if (c != 1) { + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + } + ASSERT_TRUE(stats.null_count.has_value()); + EXPECT_EQ(stats.null_count.value(), c == 1 ? num_rows : 0); // schema indexing starts at 1 auto const ptype = fmd.schema[c + 1].type; @@ -4444,12 +4465,12 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn) } if (not ci.null_pages[p]) { EXPECT_EQ(ci.null_counts[p], 0); - EXPECT_TRUE(compare_binary(stats.min_value, ci.min_values[p], ptype, ctype) <= 0); + EXPECT_TRUE(compare_binary(stats.min_value.value(), ci.min_values[p], ptype, ctype) <= 0); } } for (size_t p = 0; p < ci.max_values.size(); p++) { if (not ci.null_pages[p]) { - EXPECT_TRUE(compare_binary(stats.max_value, ci.max_values[p], ptype, ctype) >= 0); + EXPECT_TRUE(compare_binary(stats.max_value.value(), ci.max_values[p], ptype, ctype) >= 0); } } } @@ -4458,9 +4479,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn) TEST_P(ParquetV2Test, CheckColumnOffsetIndexStruct) { - auto const is_v2 = GetParam(); - auto const expected_hdr_type = - is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? 
cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; auto c0 = testdata::ascending(); @@ -4495,7 +4516,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStruct) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4528,13 +4549,16 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStruct) auto const ci = read_column_index(source, chunk); auto const stats = get_statistics(chunk); + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + auto const ptype = fmd.schema[colidx].type; auto const ctype = fmd.schema[colidx].converted_type; for (size_t p = 0; p < ci.min_values.size(); p++) { - EXPECT_TRUE(compare_binary(stats.min_value, ci.min_values[p], ptype, ctype) <= 0); + EXPECT_TRUE(compare_binary(stats.min_value.value(), ci.min_values[p], ptype, ctype) <= 0); } for (size_t p = 0; p < ci.max_values.size(); p++) { - EXPECT_TRUE(compare_binary(stats.max_value, ci.max_values[p], ptype, ctype) >= 0); + EXPECT_TRUE(compare_binary(stats.max_value.value(), ci.max_values[p], ptype, ctype) >= 0); } } } @@ -4542,9 +4566,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStruct) TEST_P(ParquetV2Test, CheckColumnOffsetIndexStructNulls) { - auto const is_v2 = GetParam(); - auto const expected_hdr_type = - is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; auto validity2 = cudf::detail::make_counting_transform_iterator(0, [](cudf::size_type i) { return i % 2; }); @@ -4586,7 +4610,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStructNulls) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4616,9 +4640,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStructNulls) TEST_P(ParquetV2Test, CheckColumnIndexListWithNulls) { - auto const is_v2 = GetParam(); - auto const expected_hdr_type = - is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? 
cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; using cudf::test::iterators::null_at; using cudf::test::iterators::nulls_at; @@ -4711,7 +4735,7 @@ TEST_P(ParquetV2Test, CheckColumnIndexListWithNulls) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4812,7 +4836,7 @@ TEST_F(ParquetWriterTest, CheckColumnIndexTruncation) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4824,11 +4848,14 @@ TEST_F(ParquetWriterTest, CheckColumnIndexTruncation) auto const ci = read_column_index(source, chunk); auto const stats = get_statistics(chunk); + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + // check trunc(page.min) <= stats.min && trun(page.max) >= stats.max auto const ptype = fmd.schema[c + 1].type; auto const ctype = fmd.schema[c + 1].converted_type; - EXPECT_TRUE(compare_binary(ci.min_values[0], stats.min_value, ptype, ctype) <= 0); - EXPECT_TRUE(compare_binary(ci.max_values[0], stats.max_value, ptype, ctype) >= 0); + EXPECT_TRUE(compare_binary(ci.min_values[0], stats.min_value.value(), ptype, ctype) <= 0); + EXPECT_TRUE(compare_binary(ci.max_values[0], stats.max_value.value(), ptype, ctype) >= 0); // check that truncated values == expected EXPECT_EQ(memcmp(ci.min_values[0].data(), truncated_min[c], ci.min_values[0].size()), 0); @@ -4870,7 +4897,7 @@ TEST_F(ParquetWriterTest, BinaryColumnIndexTruncation) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4885,8 +4912,10 @@ TEST_F(ParquetWriterTest, BinaryColumnIndexTruncation) // check trunc(page.min) <= stats.min && trun(page.max) >= stats.max auto const ptype = fmd.schema[c + 1].type; auto const ctype = fmd.schema[c + 1].converted_type; - EXPECT_TRUE(compare_binary(ci.min_values[0], stats.min_value, ptype, ctype) <= 0); - EXPECT_TRUE(compare_binary(ci.max_values[0], stats.max_value, ptype, ctype) >= 0); + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + EXPECT_TRUE(compare_binary(ci.min_values[0], stats.min_value.value(), ptype, ctype) <= 0); + EXPECT_TRUE(compare_binary(ci.max_values[0], stats.max_value.value(), ptype, ctype) >= 0); // check that truncated values == expected EXPECT_EQ(ci.min_values[0], truncated_min[c]); @@ -5030,10 +5059,10 @@ TEST_F(ParquetReaderTest, NestedByteArray) cudf::io::write_parquet(out_opts); auto source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); - EXPECT_EQ(fmd.schema[5].type, cudf::io::parquet::Type::BYTE_ARRAY); + EXPECT_EQ(fmd.schema[5].type, cudf::io::parquet::detail::Type::BYTE_ARRAY); std::vector md{ {}, @@ -5081,12 +5110,12 @@ TEST_F(ParquetWriterTest, ByteArrayStats) auto result = cudf::io::read_parquet(in_opts); auto source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); - EXPECT_EQ(fmd.schema[1].type, cudf::io::parquet::Type::BYTE_ARRAY); - EXPECT_EQ(fmd.schema[2].type, 
cudf::io::parquet::Type::BYTE_ARRAY); + EXPECT_EQ(fmd.schema[1].type, cudf::io::parquet::detail::Type::BYTE_ARRAY); + EXPECT_EQ(fmd.schema[2].type, cudf::io::parquet::detail::Type::BYTE_ARRAY); auto const stats0 = get_statistics(fmd.row_groups[0].columns[0]); auto const stats1 = get_statistics(fmd.row_groups[0].columns[1]); @@ -5137,9 +5166,9 @@ TEST_F(ParquetReaderTest, StructByteArray) TEST_F(ParquetReaderTest, NestingOptimizationTest) { - // test nesting levels > cudf::io::parquet::gpu::max_cacheable_nesting_decode_info deep. + // test nesting levels > cudf::io::parquet::detail::max_cacheable_nesting_decode_info deep. constexpr cudf::size_type num_nesting_levels = 16; - static_assert(num_nesting_levels > cudf::io::parquet::gpu::max_cacheable_nesting_decode_info); + static_assert(num_nesting_levels > cudf::io::parquet::detail::max_cacheable_nesting_decode_info); constexpr cudf::size_type rows_per_level = 2; constexpr cudf::size_type num_values = (1 << num_nesting_levels) * rows_per_level; @@ -5206,13 +5235,13 @@ TEST_F(ParquetWriterTest, SingleValueDictionaryTest) // make sure dictionary was used auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); auto used_dict = [&fmd]() { for (auto enc : fmd.row_groups[0].columns[0].meta_data.encodings) { - if (enc == cudf::io::parquet::Encoding::PLAIN_DICTIONARY or - enc == cudf::io::parquet::Encoding::RLE_DICTIONARY) { + if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or + enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) { return true; } } @@ -5252,13 +5281,13 @@ TEST_F(ParquetWriterTest, DictionaryNeverTest) // make sure dictionary was not used auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); auto used_dict = [&fmd]() { for (auto enc : fmd.row_groups[0].columns[0].meta_data.encodings) { - if (enc == cudf::io::parquet::Encoding::PLAIN_DICTIONARY or - enc == cudf::io::parquet::Encoding::RLE_DICTIONARY) { + if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or + enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) { return true; } } @@ -5303,13 +5332,13 @@ TEST_F(ParquetWriterTest, DictionaryAdaptiveTest) // make sure dictionary was used as expected. col0 should use one, // col1 should not. 
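The dictionary tests in this stretch (SingleValueDictionaryTest, DictionaryNeverTest, DictionaryAdaptiveTest, DictionaryAlwaysTest) differ mainly in the dictionary policy they hand to the writer before the encoding verification that follows. The exact builder chain sits outside these hunks; a minimal, hypothetical configuration using the public cudf::io options (`tbl` standing in for the test table) might look roughly like:

  cudf::io::parquet_writer_options out_opts =
    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, tbl)
      // NEVER / ADAPTIVE / ALWAYS control whether column chunks may use dictionary
      // encoding; the used_dict checks below verify the encodings actually written.
      .dictionary_policy(cudf::io::dictionary_policy::ADAPTIVE);
  cudf::io::write_parquet(out_opts);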
auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); auto used_dict = [&fmd](int col) { for (auto enc : fmd.row_groups[0].columns[col].meta_data.encodings) { - if (enc == cudf::io::parquet::Encoding::PLAIN_DICTIONARY or - enc == cudf::io::parquet::Encoding::RLE_DICTIONARY) { + if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or + enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) { return true; } } @@ -5354,13 +5383,13 @@ TEST_F(ParquetWriterTest, DictionaryAlwaysTest) // make sure dictionary was used for both columns auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); auto used_dict = [&fmd](int col) { for (auto enc : fmd.row_groups[0].columns[col].meta_data.encodings) { - if (enc == cudf::io::parquet::Encoding::PLAIN_DICTIONARY or - enc == cudf::io::parquet::Encoding::RLE_DICTIONARY) { + if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or + enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) { return true; } } @@ -5438,13 +5467,13 @@ TEST_P(ParquetSizedTest, DictionaryTest) // make sure dictionary was used auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); auto used_dict = [&fmd]() { for (auto enc : fmd.row_groups[0].columns[0].meta_data.encodings) { - if (enc == cudf::io::parquet::Encoding::PLAIN_DICTIONARY or - enc == cudf::io::parquet::Encoding::RLE_DICTIONARY) { + if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or + enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) { return true; } } @@ -6664,7 +6693,7 @@ TEST_F(ParquetWriterTest, PreserveNullability) TEST_P(ParquetV2Test, CheckEncodings) { - using cudf::io::parquet::Encoding; + using cudf::io::parquet::detail::Encoding; constexpr auto num_rows = 100'000; auto const is_v2 = GetParam(); @@ -6672,7 +6701,7 @@ TEST_P(ParquetV2Test, CheckEncodings) // data should be PLAIN for v1, RLE for V2 auto col0_data = cudf::detail::make_counting_transform_iterator(0, [](auto i) -> bool { return i % 2 == 0; }); - // data should be PLAIN for both + // data should be PLAIN for v1, DELTA_BINARY_PACKED for v2 auto col1_data = random_values(num_rows); // data should be PLAIN_DICTIONARY for v1, PLAIN and RLE_DICTIONARY for v2 auto col2_data = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return 1; }); @@ -6697,7 +6726,7 @@ TEST_P(ParquetV2Test, CheckEncodings) }; auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); auto const& chunk0_enc = fmd.row_groups[0].columns[0].meta_data.encodings; @@ -6707,10 +6736,10 @@ TEST_P(ParquetV2Test, CheckEncodings) // col0 should have RLE for rep/def and data EXPECT_TRUE(chunk0_enc.size() == 1); EXPECT_TRUE(contains(chunk0_enc, Encoding::RLE)); - // col1 should have RLE for rep/def and PLAIN for data + // col1 should have RLE for rep/def and DELTA_BINARY_PACKED for data EXPECT_TRUE(chunk1_enc.size() == 2); EXPECT_TRUE(contains(chunk1_enc, Encoding::RLE)); - EXPECT_TRUE(contains(chunk1_enc, Encoding::PLAIN)); + EXPECT_TRUE(contains(chunk1_enc, Encoding::DELTA_BINARY_PACKED)); // col2 should have RLE for rep/def, PLAIN for dict, and RLE_DICTIONARY for data 
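The contains() assertions here and just below scan a column chunk's encodings list; the helper itself is defined a few lines above this hunk. A minimal sketch of what it presumably looks like, assuming the chunk's `encodings` member is a std::vector of Encoding values and <algorithm> is included:

  auto const contains = [](std::vector<cudf::io::parquet::detail::Encoding> const& encodings,
                           cudf::io::parquet::detail::Encoding enc) {
    // True when the column chunk reports `enc` among its encodings.
    return std::find(encodings.cbegin(), encodings.cend(), enc) != encodings.cend();
  };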
EXPECT_TRUE(chunk2_enc.size() == 3); EXPECT_TRUE(contains(chunk2_enc, Encoding::RLE)); @@ -6732,4 +6761,212 @@ TEST_P(ParquetV2Test, CheckEncodings) } } +// removing duration_D, duration_s, and timestamp_s as they don't appear to be supported properly. +// see definition of UnsupportedChronoTypes above. +using DeltaDecimalTypes = cudf::test::Types; +using DeltaBinaryTypes = + cudf::test::Concat; +using SupportedDeltaTestTypes = + cudf::test::RemoveIf, DeltaBinaryTypes>; +TYPED_TEST_SUITE(ParquetWriterDeltaTest, SupportedDeltaTestTypes); + +TYPED_TEST(ParquetWriterDeltaTest, SupportedDeltaTestTypes) +{ + using T = TypeParam; + auto col0 = testdata::ascending(); + auto col1 = testdata::unordered(); + + auto const expected = table_view{{col0, col1}}; + + auto const filepath = temp_env->get_temp_filepath("DeltaBinaryPacked.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) + .write_v2_headers(true) + .dictionary_policy(cudf::io::dictionary_policy::NEVER); + cudf::io::write_parquet(out_opts); + + cudf::io::parquet_reader_options in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(in_opts); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); +} + +TYPED_TEST(ParquetWriterDeltaTest, SupportedDeltaTestTypesSliced) +{ + using T = TypeParam; + constexpr int num_rows = 4'000; + auto col0 = testdata::ascending(); + auto col1 = testdata::unordered(); + + auto const expected = table_view{{col0, col1}}; + auto expected_slice = cudf::slice(expected, {num_rows, 2 * num_rows}); + ASSERT_EQ(expected_slice[0].num_rows(), num_rows); + + auto const filepath = temp_env->get_temp_filepath("DeltaBinaryPackedSliced.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected_slice) + .write_v2_headers(true) + .dictionary_policy(cudf::io::dictionary_policy::NEVER); + cudf::io::write_parquet(out_opts); + + cudf::io::parquet_reader_options in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(in_opts); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_slice, result.tbl->view()); +} + +TYPED_TEST(ParquetWriterDeltaTest, SupportedDeltaListSliced) +{ + using T = TypeParam; + + constexpr int num_slice = 4'000; + constexpr int num_rows = 32 * 1024; + + std::mt19937 gen(6542); + std::bernoulli_distribution bn(0.7f); + auto valids = + cudf::detail::make_counting_transform_iterator(0, [&](int index) { return bn(gen); }); + auto values = thrust::make_counting_iterator(0); + + // list + constexpr int vals_per_row = 4; + auto c1_offset_iter = cudf::detail::make_counting_transform_iterator( + 0, [vals_per_row](cudf::size_type idx) { return idx * vals_per_row; }); + cudf::test::fixed_width_column_wrapper c1_offsets(c1_offset_iter, + c1_offset_iter + num_rows + 1); + cudf::test::fixed_width_column_wrapper c1_vals( + values, values + (num_rows * vals_per_row), valids); + auto [null_mask, null_count] = cudf::test::detail::make_null_mask(valids, valids + num_rows); + + auto _c1 = cudf::make_lists_column( + num_rows, c1_offsets.release(), c1_vals.release(), null_count, std::move(null_mask)); + auto c1 = cudf::purge_nonempty_nulls(*_c1); + + auto const expected = table_view{{*c1}}; + auto expected_slice = cudf::slice(expected, {num_slice, 2 * num_slice}); + ASSERT_EQ(expected_slice[0].num_rows(), num_slice); + + auto 
const filepath = temp_env->get_temp_filepath("DeltaBinaryPackedListSliced.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected_slice) + .write_v2_headers(true) + .dictionary_policy(cudf::io::dictionary_policy::NEVER); + cudf::io::write_parquet(out_opts); + + cudf::io::parquet_reader_options in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto result = cudf::io::read_parquet(in_opts); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_slice, result.tbl->view()); +} + +TEST_F(ParquetWriterTest, EmptyMinStringStatistics) +{ + char const* const min_val = ""; + char const* const max_val = "zzz"; + std::vector strings{min_val, max_val, "pining", "for", "the", "fjords"}; + + column_wrapper string_col{strings.begin(), strings.end()}; + auto const output = table_view{{string_col}}; + auto const filepath = temp_env->get_temp_filepath("EmptyMinStringStatistics.parquet"); + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, output); + cudf::io::write_parquet(out_opts); + + auto const source = cudf::io::datasource::create(filepath); + cudf::io::parquet::detail::FileMetaData fmd; + read_footer(source, &fmd); + + ASSERT_TRUE(fmd.row_groups.size() > 0); + ASSERT_TRUE(fmd.row_groups[0].columns.size() > 0); + auto const& chunk = fmd.row_groups[0].columns[0]; + auto const stats = get_statistics(chunk); + + ASSERT_TRUE(stats.min_value.has_value()); + ASSERT_TRUE(stats.max_value.has_value()); + auto const min_value = std::string{reinterpret_cast(stats.min_value.value().data()), + stats.min_value.value().size()}; + auto const max_value = std::string{reinterpret_cast(stats.max_value.value().data()), + stats.max_value.value().size()}; + EXPECT_EQ(min_value, std::string(min_val)); + EXPECT_EQ(max_value, std::string(max_val)); +} + +TEST_F(ParquetReaderTest, RepeatedNoAnnotations) +{ + constexpr unsigned char repeated_bytes[] = { + 0x50, 0x41, 0x52, 0x31, 0x15, 0x04, 0x15, 0x30, 0x15, 0x30, 0x4c, 0x15, 0x0c, 0x15, 0x00, 0x12, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x15, 0x00, 0x15, 0x0a, 0x15, 0x0a, + 0x2c, 0x15, 0x0c, 0x15, 0x10, 0x15, 0x06, 0x15, 0x06, 0x00, 0x00, 0x03, 0x03, 0x88, 0xc6, 0x02, + 0x26, 0x80, 0x01, 0x1c, 0x15, 0x02, 0x19, 0x25, 0x00, 0x10, 0x19, 0x18, 0x02, 0x69, 0x64, 0x15, + 0x00, 0x16, 0x0c, 0x16, 0x78, 0x16, 0x78, 0x26, 0x54, 0x26, 0x08, 0x00, 0x00, 0x15, 0x04, 0x15, + 0x40, 0x15, 0x40, 0x4c, 0x15, 0x08, 0x15, 0x00, 0x12, 0x00, 0x00, 0xe3, 0x0c, 0x23, 0x4b, 0x01, + 0x00, 0x00, 0x00, 0xc7, 0x35, 0x3a, 0x42, 0x00, 0x00, 0x00, 0x00, 0x8e, 0x6b, 0x74, 0x84, 0x00, + 0x00, 0x00, 0x00, 0x55, 0xa1, 0xae, 0xc6, 0x00, 0x00, 0x00, 0x00, 0x15, 0x00, 0x15, 0x22, 0x15, + 0x22, 0x2c, 0x15, 0x10, 0x15, 0x10, 0x15, 0x06, 0x15, 0x06, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x03, 0xc0, 0x03, 0x00, 0x00, 0x00, 0x03, 0x90, 0xaa, 0x02, 0x03, 0x94, 0x03, 0x26, 0xda, 0x02, + 0x1c, 0x15, 0x04, 0x19, 0x25, 0x00, 0x10, 0x19, 0x38, 0x0c, 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x4e, + 0x75, 0x6d, 0x62, 0x65, 0x72, 0x73, 0x05, 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x06, 0x6e, 0x75, 0x6d, + 0x62, 0x65, 0x72, 0x15, 0x00, 0x16, 0x10, 0x16, 0xa0, 0x01, 0x16, 0xa0, 0x01, 0x26, 0x96, 0x02, + 0x26, 0xba, 0x01, 0x00, 0x00, 0x15, 0x04, 0x15, 0x24, 0x15, 0x24, 0x4c, 0x15, 0x04, 0x15, 0x00, + 0x12, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x68, 0x6f, 
0x6d, 0x65, 0x06, 0x00, 0x00, 0x00, 0x6d, + 0x6f, 0x62, 0x69, 0x6c, 0x65, 0x15, 0x00, 0x15, 0x20, 0x15, 0x20, 0x2c, 0x15, 0x10, 0x15, 0x10, + 0x15, 0x06, 0x15, 0x06, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0xc0, 0x03, 0x00, 0x00, 0x00, + 0x03, 0x90, 0xef, 0x01, 0x03, 0x04, 0x26, 0xcc, 0x04, 0x1c, 0x15, 0x0c, 0x19, 0x25, 0x00, 0x10, + 0x19, 0x38, 0x0c, 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x4e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x73, 0x05, + 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x04, 0x6b, 0x69, 0x6e, 0x64, 0x15, 0x00, 0x16, 0x10, 0x16, 0x82, + 0x01, 0x16, 0x82, 0x01, 0x26, 0x8a, 0x04, 0x26, 0xca, 0x03, 0x00, 0x00, 0x15, 0x02, 0x19, 0x6c, + 0x48, 0x04, 0x75, 0x73, 0x65, 0x72, 0x15, 0x04, 0x00, 0x15, 0x02, 0x25, 0x00, 0x18, 0x02, 0x69, + 0x64, 0x00, 0x35, 0x02, 0x18, 0x0c, 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x4e, 0x75, 0x6d, 0x62, 0x65, + 0x72, 0x73, 0x15, 0x02, 0x00, 0x35, 0x04, 0x18, 0x05, 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x15, 0x04, + 0x00, 0x15, 0x04, 0x25, 0x00, 0x18, 0x06, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x00, 0x15, 0x0c, + 0x25, 0x02, 0x18, 0x04, 0x6b, 0x69, 0x6e, 0x64, 0x25, 0x00, 0x00, 0x16, 0x00, 0x19, 0x1c, 0x19, + 0x3c, 0x26, 0x80, 0x01, 0x1c, 0x15, 0x02, 0x19, 0x25, 0x00, 0x10, 0x19, 0x18, 0x02, 0x69, 0x64, + 0x15, 0x00, 0x16, 0x0c, 0x16, 0x78, 0x16, 0x78, 0x26, 0x54, 0x26, 0x08, 0x00, 0x00, 0x26, 0xda, + 0x02, 0x1c, 0x15, 0x04, 0x19, 0x25, 0x00, 0x10, 0x19, 0x38, 0x0c, 0x70, 0x68, 0x6f, 0x6e, 0x65, + 0x4e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x73, 0x05, 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x06, 0x6e, 0x75, + 0x6d, 0x62, 0x65, 0x72, 0x15, 0x00, 0x16, 0x10, 0x16, 0xa0, 0x01, 0x16, 0xa0, 0x01, 0x26, 0x96, + 0x02, 0x26, 0xba, 0x01, 0x00, 0x00, 0x26, 0xcc, 0x04, 0x1c, 0x15, 0x0c, 0x19, 0x25, 0x00, 0x10, + 0x19, 0x38, 0x0c, 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x4e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x73, 0x05, + 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x04, 0x6b, 0x69, 0x6e, 0x64, 0x15, 0x00, 0x16, 0x10, 0x16, 0x82, + 0x01, 0x16, 0x82, 0x01, 0x26, 0x8a, 0x04, 0x26, 0xca, 0x03, 0x00, 0x00, 0x16, 0x9a, 0x03, 0x16, + 0x0c, 0x00, 0x28, 0x49, 0x70, 0x61, 0x72, 0x71, 0x75, 0x65, 0x74, 0x2d, 0x72, 0x73, 0x20, 0x76, + 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x20, 0x30, 0x2e, 0x33, 0x2e, 0x30, 0x20, 0x28, 0x62, 0x75, + 0x69, 0x6c, 0x64, 0x20, 0x62, 0x34, 0x35, 0x63, 0x65, 0x37, 0x63, 0x62, 0x61, 0x32, 0x31, 0x39, + 0x39, 0x66, 0x32, 0x32, 0x64, 0x39, 0x33, 0x32, 0x36, 0x39, 0x63, 0x31, 0x35, 0x30, 0x64, 0x38, + 0x61, 0x38, 0x33, 0x39, 0x31, 0x36, 0x63, 0x36, 0x39, 0x62, 0x35, 0x65, 0x29, 0x00, 0x32, 0x01, + 0x00, 0x00, 0x50, 0x41, 0x52, 0x31}; + + auto read_opts = cudf::io::parquet_reader_options::builder( + cudf::io::source_info{reinterpret_cast(repeated_bytes), sizeof(repeated_bytes)}); + auto result = cudf::io::read_parquet(read_opts); + + EXPECT_EQ(result.tbl->view().column(0).size(), 6); + EXPECT_EQ(result.tbl->view().num_columns(), 2); + + column_wrapper col0{1, 2, 3, 4, 5, 6}; + column_wrapper child0{{5555555555l, 1111111111l, 1111111111l, 2222222222l, 3333333333l}}; + cudf::test::strings_column_wrapper child1{{"-", "home", "home", "-", "mobile"}, {0, 1, 1, 0, 1}}; + auto struct_col = cudf::test::structs_column_wrapper{{child0, child1}}; + + auto list_offsets_column = + cudf::test::fixed_width_column_wrapper{0, 0, 0, 0, 1, 2, 5}.release(); + auto num_list_rows = list_offsets_column->size() - 1; + + auto mask = cudf::create_null_mask(6, cudf::mask_state::ALL_VALID); + cudf::set_null_mask(static_cast(mask.data()), 0, 2, false); + + auto list_col = cudf::make_lists_column( + num_list_rows, std::move(list_offsets_column), struct_col.release(), 
2, std::move(mask)); + + std::vector> struct_children; + struct_children.push_back(std::move(list_col)); + + auto outer_struct = + cudf::test::structs_column_wrapper{{std::move(struct_children)}, {0, 0, 1, 1, 1, 1}}; + table_view expected{{col0, outer_struct}}; + + CUDF_TEST_EXPECT_TABLES_EQUAL(result.tbl->view(), expected); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/iterator/indexalator_test.cu b/cpp/tests/iterator/indexalator_test.cu index 1ff7f4c42a5..0c10853ec02 100644 --- a/cpp/tests/iterator/indexalator_test.cu +++ b/cpp/tests/iterator/indexalator_test.cu @@ -20,9 +20,13 @@ #include +#include +#include #include #include #include +#include +#include using TestingTypes = cudf::test::IntegralTypesNotBool; @@ -94,3 +98,62 @@ TYPED_TEST(IndexalatorTest, optional_iterator) auto it_dev = cudf::detail::indexalator_factory::make_input_optional_iterator(d_col); this->iterator_test_thrust(expected_values, it_dev, host_values.size()); } + +template +struct transform_fn { + __device__ cudf::size_type operator()(Integer v) + { + return static_cast(v) + static_cast(v); + } +}; + +TYPED_TEST(IndexalatorTest, output_iterator) +{ + using T = TypeParam; + + auto d_col1 = + cudf::test::fixed_width_column_wrapper({0, 6, 7, 14, 23, 33, 43, 45, 63}); + auto d_col2 = + cudf::test::fixed_width_column_wrapper({0, 0, 0, 0, 0, 0, 0, 0, 0}); + auto itr = cudf::detail::indexalator_factory::make_output_iterator(d_col2); + auto input = cudf::column_view(d_col1); + auto stream = cudf::get_default_stream(); + + auto map = cudf::test::fixed_width_column_wrapper({0, 2, 4, 6, 8, 1, 3, 5, 7}); + auto d_map = cudf::column_view(map); + thrust::gather( + rmm::exec_policy_nosync(stream), d_map.begin(), d_map.end(), input.begin(), itr); + auto expected = + cudf::test::fixed_width_column_wrapper({0, 7, 23, 43, 63, 6, 14, 33, 45}); + thrust::scatter( + rmm::exec_policy_nosync(stream), input.begin(), input.end(), d_map.begin(), itr); + expected = + cudf::test::fixed_width_column_wrapper({0, 33, 6, 43, 7, 45, 14, 63, 23}); + + thrust::transform( + rmm::exec_policy(stream), input.begin(), input.end(), itr, transform_fn{}); + expected = + cudf::test::fixed_width_column_wrapper({0, 12, 14, 28, 46, 66, 86, 90, 126}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected); + + thrust::fill(rmm::exec_policy(stream), itr, itr + input.size(), 77); + expected = + cudf::test::fixed_width_column_wrapper({77, 77, 77, 77, 77, 77, 77, 77, 77}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected); + + thrust::sequence(rmm::exec_policy(stream), itr, itr + input.size()); + expected = cudf::test::fixed_width_column_wrapper({0, 1, 2, 3, 4, 5, 6, 7, 8}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected); + + auto indices = + cudf::test::fixed_width_column_wrapper({0, 10, 20, 30, 40, 50, 60, 70, 80}); + auto d_indices = cudf::column_view(indices); + thrust::lower_bound(rmm::exec_policy(stream), + d_indices.begin(), + d_indices.end(), + input.begin(), + input.end(), + itr); + expected = cudf::test::fixed_width_column_wrapper({0, 1, 1, 2, 3, 4, 5, 5, 7}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected); +} diff --git a/cpp/tests/iterator/offsetalator_test.cu b/cpp/tests/iterator/offsetalator_test.cu new file mode 100644 index 00000000000..e569e58f42a --- /dev/null +++ b/cpp/tests/iterator/offsetalator_test.cu @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. + */ + +#include + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +using TestingTypes = cudf::test::Types; + +template +struct OffsetalatorTest : public IteratorTest {}; + +TYPED_TEST_SUITE(OffsetalatorTest, TestingTypes); + +TYPED_TEST(OffsetalatorTest, input_iterator) +{ + using T = TypeParam; + + auto host_values = cudf::test::make_type_param_vector({0, 6, 0, -14, 13, 64, -13, -20, 45}); + + auto d_col = cudf::test::fixed_width_column_wrapper(host_values.begin(), host_values.end()); + + auto expected_values = thrust::host_vector(host_values.size()); + std::transform(host_values.begin(), host_values.end(), expected_values.begin(), [](auto v) { + return static_cast(v); + }); + + auto it_dev = cudf::detail::offsetalator_factory::make_input_iterator(d_col); + this->iterator_test_thrust(expected_values, it_dev, host_values.size()); +} + +TYPED_TEST(OffsetalatorTest, output_iterator) +{ + using T = TypeParam; + + auto d_col1 = cudf::test::fixed_width_column_wrapper({0, 6, 7, 14, 23, 33, 43, 45, 63}); + auto d_col2 = cudf::test::fixed_width_column_wrapper({0, 0, 0, 0, 0, 0, 0, 0, 0}); + auto itr = cudf::detail::offsetalator_factory::make_output_iterator(d_col2); + auto input = cudf::column_view(d_col1); + auto stream = cudf::get_default_stream(); + + auto map = cudf::test::fixed_width_column_wrapper({0, 2, 4, 6, 8, 1, 3, 5, 7}); + auto d_map = cudf::column_view(map); + thrust::gather(rmm::exec_policy_nosync(stream), + d_map.begin(), + d_map.end(), + input.begin(), + itr); + auto expected = cudf::test::fixed_width_column_wrapper({0, 7, 23, 43, 63, 6, 14, 33, 45}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected); + + thrust::scatter(rmm::exec_policy_nosync(stream), + input.begin(), + input.end(), + d_map.begin(), + itr); + expected = cudf::test::fixed_width_column_wrapper({0, 33, 6, 43, 7, 45, 14, 63, 23}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected); + + thrust::fill(rmm::exec_policy(stream), itr, itr + input.size(), 77); + expected = cudf::test::fixed_width_column_wrapper({77, 77, 77, 77, 77, 77, 77, 77, 77}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected); + + thrust::sequence(rmm::exec_policy(stream), itr, itr + input.size()); + expected = cudf::test::fixed_width_column_wrapper({0, 1, 2, 3, 4, 5, 6, 7, 8}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected); + + auto offsets = + cudf::test::fixed_width_column_wrapper({0, 10, 20, 30, 40, 50, 60, 70, 80}); + auto d_offsets = cudf::column_view(offsets); + thrust::lower_bound(rmm::exec_policy(stream), + d_offsets.begin(), + d_offsets.end(), + input.begin(), + input.end(), + itr); + expected = cudf::test::fixed_width_column_wrapper({0, 1, 1, 2, 3, 4, 5, 5, 7}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected); +} + +namespace { +/** + * For testing creating and using the offsetalator in device code. 
+ */ +struct device_functor_fn { + cudf::column_device_view const d_col; + __device__ int32_t operator()(int idx) + { + auto const itr = cudf::detail::input_offsetalator(d_col.head(), d_col.type()); + return static_cast(itr[idx] * 3); + } +}; +} // namespace + +TYPED_TEST(OffsetalatorTest, device_offsetalator) +{ + using T = TypeParam; + + auto d_col1 = cudf::test::fixed_width_column_wrapper({0, 6, 7, 14, 23, 33, 43, 45, 63}); + auto d_col2 = cudf::test::fixed_width_column_wrapper({0, 0, 0, 0, 0, 0, 0, 0, 0}); + auto input = cudf::column_view(d_col1); + auto output = cudf::mutable_column_view(d_col2); + auto stream = cudf::get_default_stream(); + + auto d_input = cudf::column_device_view::create(input, stream); + + thrust::transform(rmm::exec_policy(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(input.size()), + output.begin(), + device_functor_fn{*d_input}); + + auto expected = + cudf::test::fixed_width_column_wrapper({0, 18, 21, 42, 69, 99, 129, 135, 189}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected); +} diff --git a/cpp/tests/jit/parse_ptx_function.cpp b/cpp/tests/jit/parse_ptx_function.cpp new file mode 100644 index 00000000000..5f00c5f561a --- /dev/null +++ b/cpp/tests/jit/parse_ptx_function.cpp @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include + +struct JitParseTest : public ::testing::Test {}; + +TEST_F(JitParseTest, PTXNoFunction) +{ + std::string raw_ptx = R"( +.visible .entry _ZN3cub17CUB_101702_750_NS11EmptyKernelIvEEvv() +{ + ret; +})"; + + EXPECT_THROW(cudf::jit::parse_single_function_ptx(raw_ptx, "GENERIC_OP", "float", {0}), + cudf::logic_error); +} + +inline bool ptx_equal(std::string input, std::string expected) +{ + // Remove all whitespace and newline characters and compare + // This allows us to handle things like excess newline characters + // and trailing whitespace in the 'input' + + auto whitespace_or_newline = [](unsigned char c) { return std::isspace(c) || c == '\n'; }; + input.erase(std::remove_if(input.begin(), input.end(), whitespace_or_newline), input.end()); + expected.erase(std::remove_if(expected.begin(), expected.end(), whitespace_or_newline), + expected.end()); + return input == expected; +} + +TEST_F(JitParseTest, SimplePTX) +{ + std::string raw_ptx = R"( +.visible .func (.param .b32 func_retval0) _ZN8__main__7add$241Eff( + .param .b64 _ZN8__main__7add$241Eff_param_0, + .param .b32 _ZN8__main__7add$241Eff_param_1, + .param .b32 _ZN8__main__7add$241Eff_param_2 +) +{ + ret; +} +)"; + + std::string expected = R"( +__device__ __inline__ void GENERIC_OP( + float* _ZN8__main__7add_241Eff_param_0, + int _ZN8__main__7add_241Eff_param_1, + int _ZN8__main__7add_241Eff_param_2 +){ + asm volatile ("{"); + asm volatile ("bra RETTGT;"); + asm volatile ("RETTGT:}");} +)"; + + std::string cuda_source = + cudf::jit::parse_single_function_ptx(raw_ptx, "GENERIC_OP", "float", {0}); + + EXPECT_TRUE(ptx_equal(cuda_source, expected)); +} + +TEST_F(JitParseTest, PTXWithPragma) +{ + std::string raw_ptx = R"( +.visible .func _ZN3cub17CUB_101702_750_NS11EmptyKernelIvEEvv() +{ +$L__BB0_151: + .pragma "nounroll"; + mov.u32 % r1517, % r1516; + mov.u32 % r1516, % r1515; + mov.u32 % r1515, % r1505; + mov.u32 % r1457, 0; +$L__BB0_152: + .pragma "nounroll"; +})"; + + std::string expected = R"( +__device__ __inline__ void EmptyKern(){ + asm volatile ("{"); asm volatile (" $L__BB0_151: .pragma \"nounroll\";"); + /** $L__BB0_151: + .pragma "nounroll" */ + + asm volatile (" mov.u32 _ r1517, _ r1516;"); + /** mov.u32 % r1517, % r1516 */ + + asm volatile (" mov.u32 _ r1516, _ r1515;"); + /** mov.u32 % r1516, % r1515 */ + + asm volatile (" mov.u32 _ r1515, _ r1505;"); + /** mov.u32 % r1515, % r1505 */ + + asm volatile (" mov.u32 _ r1457, 0;"); + /** mov.u32 % r1457, 0 */ + + asm volatile (" $L__BB0_152: .pragma \"nounroll\";"); + /** $L__BB0_152: + .pragma "nounroll" */ + + asm volatile ("RETTGT:}");} +)"; + + std::string cuda_source = cudf::jit::parse_single_function_ptx(raw_ptx, "EmptyKern", "void", {0}); + EXPECT_TRUE(ptx_equal(cuda_source, expected)); +} + +TEST_F(JitParseTest, PTXWithPragmaWithSpaces) +{ + std::string raw_ptx = R"( +.visible .func _ZN3cub17CUB_101702_750_NS11EmptyKernelIvEEvv() +{ + $L__BB0_58: + ld.param.u32 % r1419, [% rd419 + 80]; + setp.ne.s32 % p394, % r1419, 22; + mov.u32 % r2050, 0; + mov.u32 % r2048, % r2050; + @ % p394 bra $L__BB0_380; + + ld.param.u8 % rs1369, [% rd419 + 208]; + setp.eq.s16 % p395, % rs1369, 0; + selp.b32 % r1422, % r1925, 0, % p395; + ld.param.u32 % r1423, [% rd419 + 112]; + add.s32 % r427, % r1422, % r1423; + ld.param.u64 % rd1249, [% rd419 + 120]; + cvta.to.global.u64 % rd1250, % rd1249; + .pragma "used_bytes_mask 4095"; + ld.global.v4.u32{ % r1424, % r1425, % r1426, % r1427}, [% rd1250]; + ld.global.v2.u64{ % rd1251, % rd1252}, [% rd1250 + 
16]; + ld.global.s32 % rd230, [% rd1250 + 32]; + setp.gt.s32 % p396, % r1424, 6; + @ % p396 bra $L__BB0_376; +} +} +)"; + + std::string expected = R"( +__device__ __inline__ void LongKernel(){ + asm volatile ("{"); asm volatile (" $L__BB0_58: cvt.u32.u32 _ %0, [_ rd419 + 80];": : "r"(r1419)); + /** $L__BB0_58: + ld.param.u32 % r1419, [% rd419 + 80] */ + + asm volatile (" setp.ne.s32 _ p394, _ r1419, 22;"); + /** setp.ne.s32 % p394, % r1419, 22 */ + + asm volatile (" mov.u32 _ r2050, 0;"); + /** mov.u32 % r2050, 0 */ + + asm volatile (" mov.u32 _ r2048, _ r2050;"); + /** mov.u32 % r2048, % r2050 */ + + asm volatile (" @ _ p394 bra $L__BB0_380;"); + /** @ % p394 bra $L__BB0_380 */ + + asm volatile (" cvt.u8.u8 _ %0, [_ rd419 + 208];": : "h"( static_cast(rs1369))); + /** ld.param.u8 % rs1369, [% rd419 + 208] */ + + asm volatile (" setp.eq.s16 _ p395, _ rs1369, 0;"); + /** setp.eq.s16 % p395, % rs1369, 0 */ + + asm volatile (" selp.b32 _ r1422, _ r1925, 0, _ p395;"); + /** selp.b32 % r1422, % r1925, 0, % p395 */ + + asm volatile (" cvt.u32.u32 _ %0, [_ rd419 + 112];": : "r"(r1423)); + /** ld.param.u32 % r1423, [% rd419 + 112] */ + + asm volatile (" add.s32 _ r427, _ r1422, _ r1423;"); + /** add.s32 % r427, % r1422, % r1423 */ + + asm volatile (" mov.u64 _ %0, [_ rd419 + 120];": : "l"(rd1249)); + /** ld.param.u64 % rd1249, [% rd419 + 120] */ + + asm volatile (" cvta.to.global.u64 _ rd1250, _ rd1249;"); + /** cvta.to.global.u64 % rd1250, % rd1249 */ + + asm volatile (" .pragma \"used_bytes_mask 4095\";"); + /** .pragma "used_bytes_mask 4095" */ + + asm volatile (" ld.global.v4.u32{ _ r1424, _ r1425, _ r1426, _ r1427}, [_ rd1250];"); + /** ld.global.v4.u32{ % r1424, % r1425, % r1426, % r1427}, [% rd1250] */ + + asm volatile (" ld.global.v2.u64{ _ rd1251, _ rd1252}, [_ rd1250 + 16];"); + /** ld.global.v2.u64{ % rd1251, % rd1252}, [% rd1250 + 16] */ + + asm volatile (" ld.global.s32 _ rd230, [_ rd1250 + 32];"); + /** ld.global.s32 % rd230, [% rd1250 + 32] */ + + asm volatile (" setp.gt.s32 _ p396, _ r1424, 6;"); + /** setp.gt.s32 % p396, % r1424, 6 */ + + asm volatile (" @ _ p396 bra $L__BB0_376;"); + /** @ % p396 bra $L__BB0_376 */ + + asm volatile ("RETTGT:}");} + )"; + + std::string cuda_source = + cudf::jit::parse_single_function_ptx(raw_ptx, "LongKernel", "void", {0}); + EXPECT_TRUE(ptx_equal(cuda_source, expected)); +} + +CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp index 089db315748..a416df0c7c3 100644 --- a/cpp/tests/join/join_tests.cpp +++ b/cpp/tests/join/join_tests.cpp @@ -1941,62 +1941,6 @@ TEST_F(JoinTest, FullJoinWithStructsAndNulls) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); } -TEST_F(JoinTest, Repro_StructsWithoutNullsPushedDown) -{ - // When joining on a STRUCT column, if the parent nulls are not reflected in - // the children, the join might produce incorrect results. - // - // In this test, a fact table of structs is joined against a dimension table. - // Both tables must match (only) on the NULL row. This will fail if the fact table's - // nulls are not pushed down into its children. 
- using ints = column_wrapper; - using structs = cudf::test::structs_column_wrapper; - using namespace cudf::test::iterators; - - auto make_table = [](auto&& col) { - auto columns = CVector{}; - columns.push_back(std::move(col)); - return cudf::table{std::move(columns)}; - }; - - auto const fact_table = [make_table] { - auto fact_ints = ints{0, 1, 2, 3, 4}; - auto fact_structs = structs{{fact_ints}, no_nulls()}.release(); - // Now set struct validity to invalidate index#3. - cudf::detail::set_null_mask( - fact_structs->mutable_view().null_mask(), 3, 4, false, cudf::get_default_stream()); - // Struct row#3 is null, but Struct.child has a non-null value. - return make_table(std::move(fact_structs)); - }(); - - auto const dimension_table = [make_table] { - auto dim_ints = ints{999}; - auto dim_structs = structs{{dim_ints}, null_at(0)}; - return make_table(dim_structs.release()); - }(); - - auto const result = inner_join(fact_table.view(), dimension_table.view(), {0}, {0}); - EXPECT_EQ(result->num_rows(), 1); // The null STRUCT rows should match. - - // Note: Join result might not have nulls pushed down, since it's an output of gather(). - // Must superimpose parent nulls before comparisons. - auto [superimposed_results, _] = cudf::structs::detail::push_down_nulls( - *result, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); - - auto const expected = [] { - auto fact_ints = ints{0}; - auto fact_structs = structs{{fact_ints}, null_at(0)}; - auto dim_ints = ints{0}; - auto dim_structs = structs{{dim_ints}, null_at(0)}; - auto columns = CVector{}; - columns.push_back(fact_structs.release()); - columns.push_back(dim_structs.release()); - return cudf::table{std::move(columns)}; - }(); - - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(superimposed_results, expected); -} - using lcw = cudf::test::lists_column_wrapper; using cudf::test::iterators::null_at; diff --git a/cpp/tests/strings/json_tests.cpp b/cpp/tests/json/json_tests.cpp similarity index 84% rename from cpp/tests/strings/json_tests.cpp rename to cpp/tests/json/json_tests.cpp index d74bb9258fa..a03880eef5d 100644 --- a/cpp/tests/strings/json_tests.cpp +++ b/cpp/tests/json/json_tests.cpp @@ -14,8 +14,8 @@ * limitations under the License. 
*/ +#include #include -#include #include #include @@ -85,7 +85,7 @@ TEST_F(JsonPathTests, GetJsonObjectRootOp) // root cudf::test::strings_column_wrapper input{json_string}; std::string json_path("$"); - auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result_raw = cudf::get_json_object(cudf::strings_column_view(input), json_path); auto result = drop_whitespace(*result_raw); auto expected = drop_whitespace(input); @@ -98,7 +98,7 @@ TEST_F(JsonPathTests, GetJsonObjectChildOp) { cudf::test::strings_column_wrapper input{json_string}; std::string json_path("$.store"); - auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result_raw = cudf::get_json_object(cudf::strings_column_view(input), json_path); auto result = drop_whitespace(*result_raw); // clang-format off @@ -147,7 +147,7 @@ TEST_F(JsonPathTests, GetJsonObjectChildOp) { cudf::test::strings_column_wrapper input{json_string}; std::string json_path("$.store.book"); - auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result_raw = cudf::get_json_object(cudf::strings_column_view(input), json_path); auto result = drop_whitespace(*result_raw); // clang-format off @@ -193,7 +193,7 @@ TEST_F(JsonPathTests, GetJsonObjectWildcardOp) { cudf::test::strings_column_wrapper input{json_string}; std::string json_path("$.store.*"); - auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result_raw = cudf::get_json_object(cudf::strings_column_view(input), json_path); auto result = drop_whitespace(*result_raw); // clang-format off @@ -242,7 +242,7 @@ TEST_F(JsonPathTests, GetJsonObjectWildcardOp) { cudf::test::strings_column_wrapper input{json_string}; std::string json_path("*"); - auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result_raw = cudf::get_json_object(cudf::strings_column_view(input), json_path); auto result = drop_whitespace(*result_raw); // clang-format off @@ -297,7 +297,7 @@ TEST_F(JsonPathTests, GetJsonObjectSubscriptOp) { cudf::test::strings_column_wrapper input{json_string}; std::string json_path("$.store.book[2]"); - auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result_raw = cudf::get_json_object(cudf::strings_column_view(input), json_path); auto result = drop_whitespace(*result_raw); // clang-format off @@ -319,7 +319,7 @@ TEST_F(JsonPathTests, GetJsonObjectSubscriptOp) { cudf::test::strings_column_wrapper input{json_string}; std::string json_path("$.store['bicycle']"); - auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result_raw = cudf::get_json_object(cudf::strings_column_view(input), json_path); auto result = drop_whitespace(*result_raw); // clang-format off @@ -338,7 +338,7 @@ TEST_F(JsonPathTests, GetJsonObjectSubscriptOp) { cudf::test::strings_column_wrapper input{json_string}; std::string json_path("$.store.book[*]"); - auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result_raw = cudf::get_json_object(cudf::strings_column_view(input), json_path); auto result = drop_whitespace(*result_raw); // clang-format off @@ -387,7 +387,7 @@ TEST_F(JsonPathTests, GetJsonObjectFilter) { cudf::test::strings_column_wrapper input{json_string}; std::string json_path("$.store.book[*]['isbn']"); - auto result_raw = 
cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result_raw = cudf::get_json_object(cudf::strings_column_view(input), json_path); auto result = drop_whitespace(*result_raw); cudf::test::strings_column_wrapper expected_raw{R"(["0-553-21311-3","0-395-19395-8"])"}; @@ -399,7 +399,7 @@ TEST_F(JsonPathTests, GetJsonObjectFilter) { cudf::test::strings_column_wrapper input{json_string}; std::string json_path("$.store.book[*].category"); - auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result_raw = cudf::get_json_object(cudf::strings_column_view(input), json_path); auto result = drop_whitespace(*result_raw); cudf::test::strings_column_wrapper expected_raw{ @@ -412,7 +412,7 @@ TEST_F(JsonPathTests, GetJsonObjectFilter) { cudf::test::strings_column_wrapper input{json_string}; std::string json_path("$.store.book[*].title"); - auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result_raw = cudf::get_json_object(cudf::strings_column_view(input), json_path); auto result = drop_whitespace(*result_raw); cudf::test::strings_column_wrapper expected_raw{ @@ -425,7 +425,7 @@ TEST_F(JsonPathTests, GetJsonObjectFilter) { cudf::test::strings_column_wrapper input{json_string}; std::string json_path("$.store.book.*.price"); - auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result_raw = cudf::get_json_object(cudf::strings_column_view(input), json_path); auto result = drop_whitespace(*result_raw); cudf::test::strings_column_wrapper expected_raw{"[8.95,12.99,8.99,22.99]"}; @@ -440,7 +440,7 @@ TEST_F(JsonPathTests, GetJsonObjectFilter) // spark: fiction cudf::test::strings_column_wrapper input{json_string}; std::string json_path("$.store.book[2].category"); - auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result_raw = cudf::get_json_object(cudf::strings_column_view(input), json_path); auto result = drop_whitespace(*result_raw); cudf::test::strings_column_wrapper expected_raw{"fiction"}; @@ -457,7 +457,7 @@ TEST_F(JsonPathTests, GetJsonObjectNullInputs) cudf::test::strings_column_wrapper input({str, str, str, str}, {1, 0, 1, 0}); std::string json_path("$.a"); - auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result_raw = cudf::get_json_object(cudf::strings_column_view(input), json_path); auto result = drop_whitespace(*result_raw); cudf::test::strings_column_wrapper expected_raw({"b", "", "b", ""}, {1, 0, 1, 0}); @@ -473,7 +473,7 @@ TEST_F(JsonPathTests, GetJsonObjectEmptyQuery) { cudf::test::strings_column_wrapper input{R"({"a" : "b"})"}; std::string json_path(""); - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); cudf::test::strings_column_wrapper expected({""}, {0}); @@ -487,7 +487,7 @@ TEST_F(JsonPathTests, GetJsonObjectEmptyInputsAndOutputs) { cudf::test::strings_column_wrapper input{""}; std::string json_path("$"); - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); cudf::test::strings_column_wrapper expected({""}, {0}); @@ -500,7 +500,7 @@ TEST_F(JsonPathTests, GetJsonObjectEmptyInputsAndOutputs) { cudf::test::strings_column_wrapper input{R"({"store": { "bicycle" : "" 
} })"}; std::string json_path("$.store.bicycle"); - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); cudf::test::strings_column_wrapper expected({""}, {1}); @@ -512,7 +512,7 @@ TEST_F(JsonPathTests, GetJsonObjectEmptyInput) { cudf::test::strings_column_wrapper input{}; std::string json_path("$"); - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, input); } @@ -525,7 +525,7 @@ TEST_F(JsonPathTests, GetJsonObjectIllegalQuery) cudf::test::strings_column_wrapper input{R"({"a": "b"})"}; std::string json_path("$$"); auto query = [&]() { - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); }; EXPECT_THROW(query(), cudf::logic_error); } @@ -535,7 +535,7 @@ TEST_F(JsonPathTests, GetJsonObjectIllegalQuery) cudf::test::strings_column_wrapper input{R"({"a": "b"})"}; std::string json_path("$[auh46h-]"); auto query = [&]() { - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); }; EXPECT_THROW(query(), cudf::logic_error); } @@ -545,7 +545,7 @@ TEST_F(JsonPathTests, GetJsonObjectIllegalQuery) cudf::test::strings_column_wrapper input{R"({"a": "b"})"}; std::string json_path("$[[]]"); auto query = [&]() { - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); }; EXPECT_THROW(query(), cudf::logic_error); } @@ -555,7 +555,7 @@ TEST_F(JsonPathTests, GetJsonObjectIllegalQuery) cudf::test::strings_column_wrapper input{R"({"a": "b"})"}; std::string json_path("$[-1]"); auto query = [&]() { - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); }; EXPECT_THROW(query(), cudf::logic_error); } @@ -565,7 +565,7 @@ TEST_F(JsonPathTests, GetJsonObjectIllegalQuery) cudf::test::strings_column_wrapper input{R"({"a": "b"})"}; std::string json_path("."); auto query = [&]() { - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); }; EXPECT_THROW(query(), std::invalid_argument); } @@ -574,7 +574,7 @@ TEST_F(JsonPathTests, GetJsonObjectIllegalQuery) cudf::test::strings_column_wrapper input{R"({"a": "b"})"}; std::string json_path("]["); auto query = [&]() { - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); }; EXPECT_THROW(query(), std::invalid_argument); } @@ -583,7 +583,7 @@ TEST_F(JsonPathTests, GetJsonObjectIllegalQuery) cudf::test::strings_column_wrapper input{R"({"a": "b"})"}; std::string json_path("6hw6,56i3"); auto query = [&]() { - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); }; EXPECT_THROW(query(), std::invalid_argument); } @@ -596,7 +596,7 @@ TEST_F(JsonPathTests, 
GetJsonObjectInvalidQuery) { cudf::test::strings_column_wrapper input{R"({"a": "b"})"}; std::string json_path("$[*].c"); - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); cudf::test::strings_column_wrapper expected({""}, {0}); @@ -607,7 +607,7 @@ TEST_F(JsonPathTests, GetJsonObjectInvalidQuery) { cudf::test::strings_column_wrapper input{R"({"a": "b"})"}; std::string json_path("$[*].c[2]"); - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); cudf::test::strings_column_wrapper expected({""}, {0}); @@ -618,7 +618,7 @@ TEST_F(JsonPathTests, GetJsonObjectInvalidQuery) { cudf::test::strings_column_wrapper input{json_string}; std::string json_path("$.store.book.price"); - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); cudf::test::strings_column_wrapper expected({""}, {0}); @@ -629,7 +629,7 @@ TEST_F(JsonPathTests, GetJsonObjectInvalidQuery) { cudf::test::strings_column_wrapper input{json_string}; std::string json_path("$.store.book[4]"); - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); cudf::test::strings_column_wrapper expected({""}, {0}); @@ -672,7 +672,7 @@ TEST_F(JsonPathTests, MixedOutput) cudf::test::strings_column_wrapper input(input_strings.begin(), input_strings.end()); { std::string json_path("$.a"); - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); // clang-format off cudf::test::strings_column_wrapper expected({ @@ -694,7 +694,7 @@ TEST_F(JsonPathTests, MixedOutput) { std::string json_path("$.a[1]"); - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); // clang-format off cudf::test::strings_column_wrapper expected({ @@ -713,7 +713,7 @@ TEST_F(JsonPathTests, MixedOutput) { std::string json_path("$.a.b"); - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); // clang-format off cudf::test::strings_column_wrapper expected({ @@ -731,7 +731,7 @@ TEST_F(JsonPathTests, MixedOutput) { std::string json_path("$.a[*]"); - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); // clang-format off cudf::test::strings_column_wrapper expected({ @@ -752,7 +752,7 @@ TEST_F(JsonPathTests, MixedOutput) { std::string json_path("$.a.b[*]"); - auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path); // clang-format off cudf::test::strings_column_wrapper expected({ @@ -779,13 +779,12 @@ TEST_F(JsonPathTests, StripQuotes) std::string str("{\"a\" : \"b\"}"); cudf::test::strings_column_wrapper input({str, str}); - cudf::strings::get_json_object_options options; + cudf::get_json_object_options options; 
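Besides the options rename, every call site in this file now goes through the top-level namespace: cudf::get_json_object and cudf::get_json_object_options replace their cudf::strings:: counterparts (the test file itself also moved from cpp/tests/strings/ to cpp/tests/json/). A small sketch (not part of the patch) of the relocated call shape, with `input` standing in for any strings column from these tests:

  // Same JSONPath query as before, issued through the relocated API.
  cudf::get_json_object_options opts;
  opts.set_allow_single_quotes(true);

  std::string const json_path("$.store.bicycle");
  auto const result =
    cudf::get_json_object(cudf::strings_column_view(input), json_path, opts);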
options.set_strip_quotes_from_single_strings(false); std::string json_path("$.a"); - auto result_raw = - cudf::strings::get_json_object(cudf::strings_column_view(input), json_path, options); - auto result = drop_whitespace(*result_raw); + auto result_raw = cudf::get_json_object(cudf::strings_column_view(input), json_path, options); + auto result = drop_whitespace(*result_raw); cudf::test::strings_column_wrapper expected_raw({"\"b\"", "\"b\""}); auto expected = drop_whitespace(expected_raw); @@ -798,11 +797,10 @@ TEST_F(JsonPathTests, StripQuotes) cudf::test::strings_column_wrapper input{R"({"store": { "bicycle" : "" } })"}; std::string json_path("$.store.bicycle"); - cudf::strings::get_json_object_options options; + cudf::get_json_object_options options; options.set_strip_quotes_from_single_strings(true); - auto result = - cudf::strings::get_json_object(cudf::strings_column_view(input), json_path, options); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path, options); cudf::test::strings_column_wrapper expected({""}); @@ -859,11 +857,10 @@ TEST_F(JsonPathTests, AllowSingleQuotes) { std::string json_path("$.a"); - cudf::strings::get_json_object_options options; + cudf::get_json_object_options options; options.set_allow_single_quotes(true); - auto result = - cudf::strings::get_json_object(cudf::strings_column_view(input), json_path, options); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path, options); // clang-format off cudf::test::strings_column_wrapper expected({ @@ -903,11 +900,10 @@ TEST_F(JsonPathTests, StringsWithSpecialChars) { std::string json_path("$.item"); - cudf::strings::get_json_object_options options; + cudf::get_json_object_options options; options.set_allow_single_quotes(true); - auto result = - cudf::strings::get_json_object(cudf::strings_column_view(input), json_path, options); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path, options); // clang-format off cudf::test::strings_column_wrapper expected({ @@ -929,11 +925,10 @@ TEST_F(JsonPathTests, StringsWithSpecialChars) { std::string json_path("$.a"); - cudf::strings::get_json_object_options options; + cudf::get_json_object_options options; options.set_allow_single_quotes(true); - auto result = - cudf::strings::get_json_object(cudf::strings_column_view(input), json_path, options); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path, options); // clang-format off cudf::test::strings_column_wrapper expected({ @@ -962,11 +957,10 @@ TEST_F(JsonPathTests, EscapeSequences) { std::string json_path("$.a"); - cudf::strings::get_json_object_options options; + cudf::get_json_object_options options; options.set_allow_single_quotes(true); - auto result = - cudf::strings::get_json_object(cudf::strings_column_view(input), json_path, options); + auto result = cudf::get_json_object(cudf::strings_column_view(input), json_path, options); // clang-format off cudf::test::strings_column_wrapper expected({ @@ -998,12 +992,12 @@ TEST_F(JsonPathTests, MissingFieldsAsNulls) auto const& missing_fields_output, bool default_valid = true) { cudf::test::strings_column_wrapper input{input_string}; - cudf::strings::get_json_object_options options; + cudf::get_json_object_options options; // Test default behavior options.set_missing_fields_as_nulls(false); auto const default_result = - cudf::strings::get_json_object(cudf::strings_column_view(input), {json_path_string}, options); + 
cudf::get_json_object(cudf::strings_column_view(input), {json_path_string}, options); cudf::test::strings_column_wrapper default_expected({default_output}, {default_valid}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(default_expected, *default_result); @@ -1011,7 +1005,7 @@ TEST_F(JsonPathTests, MissingFieldsAsNulls) // Test with missing fields as null options.set_missing_fields_as_nulls(true); auto const missing_fields_result = - cudf::strings::get_json_object(cudf::strings_column_view(input), {json_path_string}, options); + cudf::get_json_object(cudf::strings_column_view(input), {json_path_string}, options); cudf::test::strings_column_wrapper missing_fields_expected({missing_fields_output}, {1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(missing_fields_expected, *missing_fields_result); diff --git a/cpp/tests/merge/merge_test.cpp b/cpp/tests/merge/merge_test.cpp index 3a61c0768a6..3558e5676dd 100644 --- a/cpp/tests/merge/merge_test.cpp +++ b/cpp/tests/merge/merge_test.cpp @@ -27,7 +27,9 @@ #include #include #include +#include #include +#include #include #include @@ -874,6 +876,117 @@ TEST_F(MergeTest, StructsNestedWithNulls) // clang-format on } +using lcw = cudf::test::lists_column_wrapper; +using cudf::test::iterators::null_at; +using cudf::test::iterators::nulls_at; + +TEST_F(MergeTest, Lists) +{ + auto col1 = lcw{lcw{1}, lcw{3}, lcw{5}, lcw{7}}; + auto col2 = lcw{lcw{2}, lcw{4}, lcw{6}, lcw{8}}; + + auto tbl1 = cudf::table_view{{col1}}; + auto tbl2 = cudf::table_view{{col2}}; + + auto result = cudf::merge({tbl1, tbl2}, {0}, {cudf::order::ASCENDING}); + + auto expected_col = lcw{lcw{1}, lcw{2}, lcw{3}, lcw{4}, lcw{5}, lcw{6}, lcw{7}, lcw{8}}; + auto expected_tbl = cudf::table_view{{expected_col}}; + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected_tbl, *result); +} + +TEST_F(MergeTest, NestedListsWithNulls) +{ + auto col1 = lcw{{lcw{lcw{1}}, lcw{lcw{3}}, lcw{lcw{5}}, lcw{lcw{7}}}, null_at(3)}; + auto col2 = lcw{{lcw{lcw{2}}, lcw{lcw{4}}, lcw{lcw{6}}, lcw{lcw{8}}}, null_at(3)}; + + auto tbl1 = cudf::table_view{{col1}}; + auto tbl2 = cudf::table_view{{col2}}; + + auto result = cudf::merge({tbl1, tbl2}, {0}, {cudf::order::ASCENDING}, {cudf::null_order::AFTER}); + + auto expected_col = lcw{{lcw{lcw{1}}, + lcw{lcw{2}}, + lcw{lcw{3}}, + lcw{lcw{4}}, + lcw{lcw{5}}, + lcw{lcw{6}}, + lcw{lcw{7}}, + lcw{lcw{8}}}, + nulls_at({6, 7})}; + auto expected_tbl = cudf::table_view{{expected_col}}; + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected_tbl, *result); +} + +TEST_F(MergeTest, NestedListsofStructs) +{ + // [ {1}, {2}, {3} ] + // [ {5} ] + // [ {7}, {8} ] + // [ {10} ] + auto const col1 = [] { + auto const get_structs = [] { + auto child0 = cudf::test::fixed_width_column_wrapper{1, 2, 3, 5, 7, 8, 10}; + return cudf::test::structs_column_wrapper{{child0}}; + }; + return cudf::make_lists_column( + 4, + cudf::test::fixed_width_column_wrapper{0, 3, 4, 6, 7}.release(), + get_structs().release(), + 0, + {}); + }(); + + // [ {4} ] + // [ {6} ] + // [ {9} ] + // [ {11} ] + auto const col2 = [] { + auto const get_structs = [] { + auto child0 = cudf::test::fixed_width_column_wrapper{4, 6, 9, 11}; + return cudf::test::structs_column_wrapper{{child0}}; + }; + return cudf::make_lists_column( + 4, + cudf::test::fixed_width_column_wrapper{0, 1, 2, 3, 4}.release(), + get_structs().release(), + 0, + {}); + }(); + + auto tbl1 = cudf::table_view{{*col1}}; + auto tbl2 = cudf::table_view{{*col2}}; + + auto result = cudf::merge({tbl1, tbl2}, {0}, {cudf::order::ASCENDING}, {cudf::null_order::AFTER}); + + // [ {1}, {2}, {3} ] + // [ {4} ] + // [ 
{5} ] + // [ {6} ] + // [ {7}, {8} ] + // [ {9} ] + // [ {10} ] + // [ {11} ] + auto const expected_col = [] { + auto const get_structs = [] { + auto child0 = + cudf::test::fixed_width_column_wrapper{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; + return cudf::test::structs_column_wrapper{{child0}}; + }; + return cudf::make_lists_column( + 8, + cudf::test::fixed_width_column_wrapper{0, 3, 4, 5, 6, 8, 9, 10, 11}.release(), + get_structs().release(), + 0, + {}); + }(); + auto expected_tbl = cudf::table_view{{*expected_col}}; + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected_tbl, *result); +} + template struct FixedPointTestAllReps : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/streams/binaryop_test.cpp b/cpp/tests/streams/binaryop_test.cpp new file mode 100644 index 00000000000..2520aed0458 --- /dev/null +++ b/cpp/tests/streams/binaryop_test.cpp @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include + +#include +#include +#include + +class BinaryopTest : public cudf::test::BaseFixture {}; + +TEST_F(BinaryopTest, ColumnColumn) +{ + cudf::test::fixed_width_column_wrapper lhs{10, 20, 30, 40, 50}; + cudf::test::fixed_width_column_wrapper rhs{15, 25, 35, 45, 55}; + + cudf::binary_operation(lhs, + rhs, + cudf::binary_operator::ADD, + cudf::data_type(cudf::type_to_id()), + cudf::test::get_default_stream()); +} + +TEST_F(BinaryopTest, ColumnScalar) +{ + cudf::test::fixed_width_column_wrapper lhs{10, 20, 30, 40, 50}; + cudf::numeric_scalar rhs{23, true, cudf::test::get_default_stream()}; + + cudf::binary_operation(lhs, + rhs, + cudf::binary_operator::ADD, + cudf::data_type(cudf::type_to_id()), + cudf::test::get_default_stream()); +} + +TEST_F(BinaryopTest, ScalarColumn) +{ + cudf::numeric_scalar lhs{42, true, cudf::test::get_default_stream()}; + cudf::test::fixed_width_column_wrapper rhs{15, 25, 35, 45, 55}; + + cudf::binary_operation(lhs, + rhs, + cudf::binary_operator::ADD, + cudf::data_type(cudf::type_to_id()), + cudf::test::get_default_stream()); +} + +class BinaryopPTXTest : public BinaryopTest { + protected: + void SetUp() override + { + if (!can_do_runtime_jit()) { GTEST_SKIP() << "Skipping tests that require 11.5 runtime"; } + } +}; + +TEST_F(BinaryopPTXTest, ColumnColumnPTX) +{ + cudf::test::fixed_width_column_wrapper lhs{10, 20, 30, 40, 50}; + cudf::test::fixed_width_column_wrapper rhs{15, 25, 35, 45, 55}; + + // c = a*a*a + b*b + char const* ptx = + R"***( +// +// Generated by NVIDIA NVVM Compiler +// +// Compiler Build ID: CL-24817639 +// Cuda compilation tools, release 10.0, V10.0.130 +// Based on LLVM 3.4svn +// + +.version 6.3 +.target sm_70 +.address_size 64 + + // .globl _ZN8__main__7add$241Eix +.common .global .align 8 .u64 _ZN08NumbaEnv8__main__7add$241Eix; +.common .global .align 8 .u64 _ZN08NumbaEnv5numba7targets7numbers14int_power_impl12$3clocals$3e13int_power$242Exx; + +.visible .func (.param .b32 func_retval0) _ZN8__main__7add$241Eix( + .param .b64 
_ZN8__main__7add$241Eix_param_0, + .param .b32 _ZN8__main__7add$241Eix_param_1, + .param .b64 _ZN8__main__7add$241Eix_param_2 +) +{ + .reg .b32 %r<3>; + .reg .b64 %rd<8>; + + + ld.param.u64 %rd1, [_ZN8__main__7add$241Eix_param_0]; + ld.param.u32 %r1, [_ZN8__main__7add$241Eix_param_1]; + ld.param.u64 %rd2, [_ZN8__main__7add$241Eix_param_2]; + cvt.s64.s32 %rd3, %r1; + mul.wide.s32 %rd4, %r1, %r1; + mul.lo.s64 %rd5, %rd4, %rd3; + mul.lo.s64 %rd6, %rd2, %rd2; + add.s64 %rd7, %rd6, %rd5; + st.u64 [%rd1], %rd7; + mov.u32 %r2, 0; + st.param.b32 [func_retval0+0], %r2; + ret; +} + +)***"; + + cudf::binary_operation( + lhs, rhs, ptx, cudf::data_type(cudf::type_to_id()), cudf::test::get_default_stream()); + cudf::binary_operation(lhs, rhs, ptx, cudf::data_type(cudf::type_to_id())); +} diff --git a/cpp/tests/streams/io/csv_test.cpp b/cpp/tests/streams/io/csv_test.cpp new file mode 100644 index 00000000000..88514fa412c --- /dev/null +++ b/cpp/tests/streams/io/csv_test.cpp @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +auto const temp_env = static_cast( + ::testing::AddGlobalTestEnvironment(new cudf::test::TempDirTestEnvironment)); + +class CSVTest : public cudf::test::BaseFixture {}; + +TEST_F(CSVTest, CSVWriter) +{ + constexpr auto num_rows = 10; + + std::vector zeros(num_rows, 0); + std::vector ones(num_rows, 1); + auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { + return numeric::decimal128{ones[i], numeric::scale_type{12}}; + }); + auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { + return numeric::decimal128{ones[i], numeric::scale_type{-12}}; + }); + + cudf::test::fixed_width_column_wrapper col0(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col1(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col2(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col3(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col4(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col5(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col6(col6_data, col6_data + num_rows); + cudf::test::fixed_width_column_wrapper col7(col7_data, col7_data + num_rows); + + std::vector col8_data(num_rows, "rapids"); + cudf::test::strings_column_wrapper col8(col8_data.begin(), col8_data.end()); + + cudf::table_view tab({col0, col1, col2, col3, col4, col5, col6, col7, col8}); + + auto const filepath = temp_env->get_temp_dir() + "multicolumn.csv"; + auto w_options = cudf::io::csv_writer_options::builder(cudf::io::sink_info{filepath}, tab) + .include_header(false) + .inter_column_delimiter(','); + cudf::io::write_csv(w_options.build(), cudf::test::get_default_stream()); +} + +TEST_F(CSVTest, CSVReader) +{ + constexpr auto num_rows = 10; + + std::vector 
zeros(num_rows, 0); + std::vector ones(num_rows, 1); + auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { + return numeric::decimal128{ones[i], numeric::scale_type{12}}; + }); + auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { + return numeric::decimal128{ones[i], numeric::scale_type{-12}}; + }); + + cudf::test::fixed_width_column_wrapper col0(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col1(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col2(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col3(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col4(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col5(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col6(col6_data, col6_data + num_rows); + cudf::test::fixed_width_column_wrapper col7(col7_data, col7_data + num_rows); + + std::vector col8_data(num_rows, "rapids"); + cudf::test::strings_column_wrapper col8(col8_data.begin(), col8_data.end()); + + cudf::table_view tab({col0, col1, col2, col3, col4, col5, col6, col7, col8}); + + auto const filepath = temp_env->get_temp_dir() + "multicolumn.csv"; + auto w_options = cudf::io::csv_writer_options::builder(cudf::io::sink_info{filepath}, tab) + .include_header(false) + .inter_column_delimiter(','); + cudf::io::write_csv(w_options.build(), cudf::test::get_default_stream()); + + // read the file back on the same stream; minimal reader options with default parsing assumed + auto r_options = cudf::io::csv_reader_options::builder(cudf::io::source_info{filepath}); + cudf::io::read_csv(r_options.build(), cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/io/json_test.cpp b/cpp/tests/streams/io/json_test.cpp new file mode 100644 index 00000000000..80619d4d58c --- /dev/null +++ b/cpp/tests/streams/io/json_test.cpp @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +class JSONTest : public cudf::test::BaseFixture {}; + +TEST_F(JSONTest, JSONreader) +{ + std::string data = "[1, 1.1]\n[2, 2.2]\n[3, 3.3]\n"; + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()}) + .dtypes(std::vector{cudf::data_type{cudf::type_id::INT32}, + cudf::data_type{cudf::type_id::FLOAT64}}) + .lines(true) + .legacy(true); + cudf::io::table_with_metadata result = + cudf::io::read_json(in_options, cudf::test::get_default_stream()); +} + +TEST_F(JSONTest, JSONwriter) +{ + cudf::test::strings_column_wrapper col1{"a", "b", "c"}; + cudf::test::strings_column_wrapper col2{"d", "e", "f"}; + cudf::test::fixed_width_column_wrapper col3{1, 2, 3}; + cudf::test::fixed_width_column_wrapper col4{1.5, 2.5, 3.5}; + cudf::test::fixed_width_column_wrapper col5{{1, 2, 3}, + cudf::test::iterators::nulls_at({0, 2})}; + cudf::table_view tbl_view{{col1, col2, col3, col4, col5}}; + cudf::io::table_metadata mt{{{"col1"}, {"col2"}, {"int"}, {"float"}, {"int16"}}}; + + std::vector out_buffer; + auto destination = cudf::io::sink_info(&out_buffer); + auto options_builder = cudf::io::json_writer_options_builder(destination, tbl_view) + .include_nulls(true) + .metadata(mt) + .lines(false) + .na_rep("null"); + + cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/lists_test.cpp b/cpp/tests/streams/lists_test.cpp new file mode 100644 index 00000000000..74e0e8837f7 --- /dev/null +++ b/cpp/tests/streams/lists_test.cpp @@ -0,0 +1,213 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +class ListTest : public cudf::test::BaseFixture {}; + +TEST_F(ListTest, ConcatenateRows) +{ + cudf::test::lists_column_wrapper list_col_1{{0, 1}, {2, 3}, {4, 5}}; + cudf::test::lists_column_wrapper list_col_2{{0, 1}, {2, 3}, {4, 5}}; + cudf::table_view lists_table({list_col_1, list_col_2}); + cudf::lists::concatenate_rows( + lists_table, cudf::lists::concatenate_null_policy::IGNORE, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, ConcatenateListElements) +{ + cudf::test::lists_column_wrapper ll_column{{{0, 1}, {2, 3}}, {{4, 5}, {6, 7}}}; + cudf::lists::concatenate_list_elements( + ll_column, cudf::lists::concatenate_null_policy::IGNORE, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, ContainsNulls) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3}, {4, 5}}; + cudf::lists::contains_nulls(list_col, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, ContainsSearchKey) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3}, {4, 5}}; + cudf::numeric_scalar search_key(2, true, cudf::test::get_default_stream()); + cudf::lists::contains(list_col, search_key, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, ContainsSearchKeys) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3}, {4, 5}}; + cudf::test::fixed_width_column_wrapper search_keys({1, 2, 3}); + cudf::lists::contains(list_col, search_keys, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, IndexOfSearchKey) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3}, {4, 5}}; + cudf::numeric_scalar search_key(2, true, cudf::test::get_default_stream()); + cudf::lists::index_of(list_col, + search_key, + cudf::lists::duplicate_find_option::FIND_FIRST, + cudf::test::get_default_stream()); +} + +TEST_F(ListTest, IndexOfSearchKeys) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3}, {4, 5}}; + cudf::test::fixed_width_column_wrapper search_keys({1, 2, 3}); + cudf::lists::index_of(list_col, + search_keys, + cudf::lists::duplicate_find_option::FIND_FIRST, + cudf::test::get_default_stream()); +} + +TEST_F(ListTest, CountElements) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3, 7}, {4, 5}}; + cudf::lists::count_elements(list_col, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, ExtractListElementFromIndex) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3, 7}, {4, 5}}; + cudf::lists::extract_list_element(list_col, -1, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, ExtractListElementFromIndices) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3, 7}, {4, 5}}; + cudf::test::fixed_width_column_wrapper indices({-1, -2, -1}); + cudf::lists::extract_list_element(list_col, indices, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, SegmentedGather) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3, 7, 8}, {4, 5}}; + cudf::test::lists_column_wrapper gather_map_list{{0}, {1, 2}, {1}}; + cudf::lists::segmented_gather(list_col, + gather_map_list, + cudf::out_of_bounds_policy::DONT_CHECK, + cudf::test::get_default_stream()); +} + +TEST_F(ListTest, Sequences) +{ + cudf::test::fixed_width_column_wrapper starts({0, 1, 2, 3, 4}); + cudf::test::fixed_width_column_wrapper sizes({0, 1, 2, 2, 1}); + cudf::lists::sequences(starts, sizes, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, SequencesWithSteps) +{ + cudf::test::fixed_width_column_wrapper starts({0, 1, 2, 3, 
4}); + cudf::test::fixed_width_column_wrapper steps({2, 1, 1, 1, -3}); + cudf::test::fixed_width_column_wrapper sizes({0, 1, 2, 2, 1}); + cudf::lists::sequences(starts, steps, sizes, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, Reverse) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3, 7, 8}, {4, 5}}; + cudf::lists::reverse(list_col, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, SortLists) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3, 7, 8}, {4, 5}}; + cudf::lists::sort_lists( + list_col, cudf::order::DESCENDING, cudf::null_order::AFTER, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, StableSortLists) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3, 7, 8}, {4, 5}}; + cudf::lists::stable_sort_lists( + list_col, cudf::order::DESCENDING, cudf::null_order::AFTER, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, ApplyBooleanMask) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3, 7, 8}, {4, 5}}; + cudf::test::lists_column_wrapper boolean_mask{{0, 1}, {1, 1, 1, 0}, {0, 1}}; + cudf::lists::apply_boolean_mask(list_col, boolean_mask, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, Distinct) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3, 7, 8}, {4, 5}}; + cudf::test::lists_column_wrapper boolean_mask{{0, 1}, {1, 1, 1, 0}, {0, 1}}; + cudf::lists::distinct(list_col, + cudf::null_equality::EQUAL, + cudf::nan_equality::ALL_EQUAL, + cudf::test::get_default_stream()); +} + +TEST_F(ListTest, DifferenceDistinct) +{ + cudf::test::lists_column_wrapper list_col_a{{0, 1}, {2, 3, 7, 8}, {4, 5}}; + cudf::test::lists_column_wrapper list_col_b{{0, 1}, {1, 3, 6, 8}, {5}}; + cudf::lists::difference_distinct(list_col_a, + list_col_b, + cudf::null_equality::EQUAL, + cudf::nan_equality::ALL_EQUAL, + cudf::test::get_default_stream()); +} + +TEST_F(ListTest, IntersectDistinct) +{ + cudf::test::lists_column_wrapper list_col_a{{0, 1}, {2, 3, 7, 8}, {4, 5}}; + cudf::test::lists_column_wrapper list_col_b{{0, 1}, {1, 3, 6, 8}, {5}}; + cudf::lists::intersect_distinct(list_col_a, + list_col_b, + cudf::null_equality::EQUAL, + cudf::nan_equality::ALL_EQUAL, + cudf::test::get_default_stream()); +} + +TEST_F(ListTest, UnionDistinct) +{ + cudf::test::lists_column_wrapper list_col_a{{0, 1}, {2, 3, 7, 8}, {4, 5}}; + cudf::test::lists_column_wrapper list_col_b{{0, 1}, {1, 3, 6, 8}, {5}}; + cudf::lists::union_distinct(list_col_a, + list_col_b, + cudf::null_equality::EQUAL, + cudf::nan_equality::ALL_EQUAL, + cudf::test::get_default_stream()); +} + +TEST_F(ListTest, HaveOverlap) +{ + cudf::test::lists_column_wrapper list_col_a{{0, 1}, {2, 3, 7, 8}, {4, 5}}; + cudf::test::lists_column_wrapper list_col_b{{0, 1}, {1, 3, 6, 8}, {5}}; + cudf::lists::have_overlap(list_col_a, + list_col_b, + cudf::null_equality::EQUAL, + cudf::nan_equality::ALL_EQUAL, + cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/null_mask_test.cpp b/cpp/tests/streams/null_mask_test.cpp new file mode 100644 index 00000000000..7e59201c8cf --- /dev/null +++ b/cpp/tests/streams/null_mask_test.cpp @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include + +#include +#include +#include + +class NullMaskTest : public cudf::test::BaseFixture {}; + +TEST_F(NullMaskTest, CreateNullMask) +{ + cudf::create_null_mask(10, cudf::mask_state::ALL_VALID, cudf::test::get_default_stream()); +} + +TEST_F(NullMaskTest, SetNullMask) +{ + cudf::test::fixed_width_column_wrapper col({0, 1, 0, 1, 1}, + {true, false, true, false, false}); + + cudf::set_null_mask(static_cast(col).null_mask(), + 0, + 3, + false, + cudf::test::get_default_stream()); +} + +TEST_F(NullMaskTest, CopyBitmask) +{ + cudf::test::fixed_width_column_wrapper const col({0, 1, 0, 1, 1}, + {true, false, true, false, false}); + + cudf::copy_bitmask( + static_cast(col).null_mask(), 0, 3, cudf::test::get_default_stream()); +} + +TEST_F(NullMaskTest, CopyBitmaskFromColumn) +{ + cudf::test::fixed_width_column_wrapper const col({0, 1, 0, 1, 1}, + {true, false, true, false, false}); + + cudf::copy_bitmask(col, cudf::test::get_default_stream()); +} + +TEST_F(NullMaskTest, BitMaskAnd) +{ + cudf::test::fixed_width_column_wrapper const col1({0, 1, 0, 1, 1}, + {true, false, true, false, false}); + cudf::test::fixed_width_column_wrapper const col2({0, 1, 0, 1, 1}, + {true, true, false, false, true}); + + auto tbl = cudf::table_view{{col1, col2}}; + cudf::bitmask_and(tbl, cudf::test::get_default_stream()); +} + +TEST_F(NullMaskTest, BitMaskOr) +{ + cudf::test::fixed_width_column_wrapper const col1({0, 1, 0, 1, 1}, + {true, false, true, false, false}); + cudf::test::fixed_width_column_wrapper const col2({0, 1, 0, 1, 1}, + {true, true, false, false, true}); + + auto tbl = cudf::table_view{{col1, col2}}; + cudf::bitmask_or(tbl, cudf::test::get_default_stream()); +} + +TEST_F(NullMaskTest, NullCount) +{ + cudf::test::fixed_width_column_wrapper const col({0, 1, 0, 1, 1}, + {true, true, false, false, true}); + + cudf::null_count( + static_cast(col).null_mask(), 0, 4, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/strings/combine_test.cpp b/cpp/tests/streams/strings/combine_test.cpp new file mode 100644 index 00000000000..9562634957a --- /dev/null +++ b/cpp/tests/streams/strings/combine_test.cpp @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include + +#include + +class StringsCombineTest : public cudf::test::BaseFixture {}; + +TEST_F(StringsCombineTest, Concatenate) +{ + auto input = cudf::test::strings_column_wrapper({"Héllo", "thesé", "tést"}); + auto view = cudf::table_view({input, input}); + + auto separators = cudf::test::strings_column_wrapper({"_", ".", " "}); + auto separators_view = cudf::strings_column_view(separators); + auto sep_on_null = cudf::strings::separator_on_nulls::YES; + + auto const separator = cudf::string_scalar(" ", true, cudf::test::get_default_stream()); + auto const narep = cudf::string_scalar("n/a", true, cudf::test::get_default_stream()); + cudf::strings::concatenate(view, separator, narep, sep_on_null, cudf::test::get_default_stream()); + cudf::strings::concatenate( + view, separators_view, narep, narep, sep_on_null, cudf::test::get_default_stream()); +} + +TEST_F(StringsCombineTest, Join) +{ + auto input = cudf::test::strings_column_wrapper({"Héllo", "thesé", "tést"}); + auto view = cudf::strings_column_view(input); + + auto const separator = cudf::string_scalar(" ", true, cudf::test::get_default_stream()); + auto const narep = cudf::string_scalar("n/a", true, cudf::test::get_default_stream()); + cudf::strings::join_strings(view, separator, narep, cudf::test::get_default_stream()); +} + +TEST_F(StringsCombineTest, JoinLists) +{ + using STR_LISTS = cudf::test::lists_column_wrapper; + auto const input = STR_LISTS{ + STR_LISTS{"a", "bb", "ccc"}, STR_LISTS{"ddd", "efgh", "ijk"}, STR_LISTS{"zzz", "xxxxx"}}; + auto view = cudf::lists_column_view(input); + + auto separators = cudf::test::strings_column_wrapper({"_", ".", " "}); + auto separators_view = cudf::strings_column_view(separators); + auto sep_on_null = cudf::strings::separator_on_nulls::YES; + auto if_empty = cudf::strings::output_if_empty_list::EMPTY_STRING; + + auto const separator = cudf::string_scalar(" ", true, cudf::test::get_default_stream()); + auto const narep = cudf::string_scalar("n/a", true, cudf::test::get_default_stream()); + cudf::strings::join_list_elements( + view, separator, narep, sep_on_null, if_empty, cudf::test::get_default_stream()); + cudf::strings::join_list_elements( + view, separators_view, narep, narep, sep_on_null, if_empty, cudf::test::get_default_stream()); +} + +TEST_F(StringsCombineTest, Repeat) +{ + auto input = cudf::test::strings_column_wrapper({"Héllo", "thesé", "tést"}); + auto view = cudf::strings_column_view(input); + cudf::strings::repeat_strings(view, 0, cudf::test::get_default_stream()); + cudf::strings::repeat_strings(view, 1, cudf::test::get_default_stream()); + cudf::strings::repeat_strings(view, 10, cudf::test::get_default_stream()); + + auto counts = cudf::test::fixed_width_column_wrapper({9, 8, 7}); + cudf::strings::repeat_strings(view, counts, cudf::test::get_default_stream()); + cudf::strings::repeat_strings(view, counts, cudf::test::get_default_stream()); + + auto const str = cudf::string_scalar("X", true, cudf::test::get_default_stream()); + cudf::strings::repeat_string(str, 0, cudf::test::get_default_stream()); + cudf::strings::repeat_string(str, 1, cudf::test::get_default_stream()); + cudf::strings::repeat_string(str, 10, cudf::test::get_default_stream()); + + auto const invalid = cudf::string_scalar("", false, cudf::test::get_default_stream()); + cudf::strings::repeat_string(invalid, 10, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/strings/contains_test.cpp b/cpp/tests/streams/strings/contains_test.cpp new file 
mode 100644 index 00000000000..383d48abe1e --- /dev/null +++ b/cpp/tests/streams/strings/contains_test.cpp @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include + +#include + +class StringsContainsTest : public cudf::test::BaseFixture {}; + +TEST_F(StringsContainsTest, Contains) +{ + auto input = cudf::test::strings_column_wrapper({"Héllo", "thesé", "tést strings", ""}); + auto view = cudf::strings_column_view(input); + + auto const pattern = std::string("[a-z]"); + auto const prog = cudf::strings::regex_program::create(pattern); + cudf::strings::contains_re(view, *prog, cudf::test::get_default_stream()); + cudf::strings::matches_re(view, *prog, cudf::test::get_default_stream()); + cudf::strings::count_re(view, *prog, cudf::test::get_default_stream()); +} + +TEST_F(StringsContainsTest, Like) +{ + auto input = cudf::test::strings_column_wrapper({"Héllo", "thesés", "tést", ""}); + auto view = cudf::strings_column_view(input); + + auto const pattern = cudf::string_scalar("%és", true, cudf::test::get_default_stream()); + auto const escape = cudf::string_scalar("%", true, cudf::test::get_default_stream()); + cudf::strings::like(view, pattern, escape, cudf::test::get_default_stream()); + + auto const patterns = cudf::test::strings_column_wrapper({"H%", "t%s", "t", ""}); + cudf::strings::like( + view, cudf::strings_column_view(patterns), escape, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/strings/convert_test.cpp b/cpp/tests/streams/strings/convert_test.cpp new file mode 100644 index 00000000000..8dc3f625746 --- /dev/null +++ b/cpp/tests/streams/strings/convert_test.cpp @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +class StringsConvertTest : public cudf::test::BaseFixture {}; + +TEST_F(StringsConvertTest, Booleans) +{ + auto input = cudf::test::strings_column_wrapper({"true", "false", "True", ""}); + auto view = cudf::strings_column_view(input); + + auto true_scalar = cudf::string_scalar("true", true, cudf::test::get_default_stream()); + auto false_scalar = cudf::string_scalar("false", true, cudf::test::get_default_stream()); + + auto bools = cudf::strings::to_booleans(view, true_scalar, cudf::test::get_default_stream()); + cudf::strings::from_booleans( + bools->view(), true_scalar, false_scalar, cudf::test::get_default_stream()); +} + +TEST_F(StringsConvertTest, Timestamps) +{ + auto input = cudf::test::strings_column_wrapper({"2019-03-20T12:34:56Z", "2020-02-29T00:00:00Z"}); + auto view = cudf::strings_column_view(input); + + std::string format = "%Y-%m-%dT%H:%M:%SZ"; + auto dtype = cudf::data_type{cudf::type_id::TIMESTAMP_SECONDS}; + + cudf::strings::is_timestamp(view, format, cudf::test::get_default_stream()); + auto timestamps = + cudf::strings::to_timestamps(view, dtype, format, cudf::test::get_default_stream()); + + auto empty = cudf::test::strings_column_wrapper(); + cudf::strings::from_timestamps( + timestamps->view(), format, cudf::strings_column_view(empty), cudf::test::get_default_stream()); +} + +TEST_F(StringsConvertTest, Durations) +{ + auto input = cudf::test::strings_column_wrapper({"17975 days 12:34:56", "18321 days 00:00:00"}); + auto view = cudf::strings_column_view(input); + + std::string format = "%D days %H:%M:%S"; + auto dtype = cudf::data_type{cudf::type_id::DURATION_SECONDS}; + + auto durations = + cudf::strings::to_durations(view, dtype, format, cudf::test::get_default_stream()); + cudf::strings::from_durations(durations->view(), format, cudf::test::get_default_stream()); +} + +TEST_F(StringsConvertTest, FixedPoint) +{ + auto input = cudf::test::strings_column_wrapper({"1.234E3", "-876", "543.2"}); + auto view = cudf::strings_column_view(input); + + auto dtype = cudf::data_type{cudf::type_id::DECIMAL64, numeric::scale_type{-3}}; + + auto values = cudf::strings::to_fixed_point(view, dtype, cudf::test::get_default_stream()); + cudf::strings::from_fixed_point(values->view(), cudf::test::get_default_stream()); +} + +TEST_F(StringsConvertTest, Floats) +{ + auto input = cudf::test::strings_column_wrapper({"1.234E3", "-876", "543.2"}); + auto view = cudf::strings_column_view(input); + + auto dtype = cudf::data_type{cudf::type_id::FLOAT32}; + + auto values = cudf::strings::to_floats(view, dtype, cudf::test::get_default_stream()); + cudf::strings::from_floats(values->view(), cudf::test::get_default_stream()); + cudf::strings::is_float(view, cudf::test::get_default_stream()); +} + +TEST_F(StringsConvertTest, Integers) +{ + auto input = cudf::test::strings_column_wrapper({"1234", "-876", "5432"}); + auto view = cudf::strings_column_view(input); + + auto dtype = cudf::data_type{cudf::type_id::INT32}; + + auto values = cudf::strings::to_integers(view, dtype, cudf::test::get_default_stream()); + cudf::strings::from_integers(values->view(), cudf::test::get_default_stream()); + cudf::strings::is_integer(view, cudf::test::get_default_stream()); + cudf::strings::is_hex(view, cudf::test::get_default_stream()); + cudf::strings::hex_to_integers(view, dtype, cudf::test::get_default_stream()); + cudf::strings::integers_to_hex(values->view(), 
cudf::test::get_default_stream()); +} + +TEST_F(StringsConvertTest, IPv4) +{ + auto input = cudf::test::strings_column_wrapper({"192.168.0.1", "10.0.0.1"}); + auto view = cudf::strings_column_view(input); + + auto values = cudf::strings::ipv4_to_integers(view, cudf::test::get_default_stream()); + cudf::strings::integers_to_ipv4(values->view(), cudf::test::get_default_stream()); + cudf::strings::is_ipv4(view, cudf::test::get_default_stream()); +} + +TEST_F(StringsConvertTest, URLs) +{ + auto input = cudf::test::strings_column_wrapper({"www.nvidia.com/rapids?p=é", "/_file-7.txt"}); + auto view = cudf::strings_column_view(input); + + auto values = cudf::strings::url_encode(view, cudf::test::get_default_stream()); + cudf::strings::url_decode(values->view(), cudf::test::get_default_stream()); +} + +TEST_F(StringsConvertTest, ListsFormat) +{ + using STR_LISTS = cudf::test::lists_column_wrapper; + auto const input = + STR_LISTS{{STR_LISTS{"a", "bb", "ccc"}, STR_LISTS{}, STR_LISTS{"ddd", "ee", "f"}}, + {STR_LISTS{"gg", "hhh"}, STR_LISTS{"i", "", "", "jj"}}}; + auto view = cudf::lists_column_view(input); + auto null_scalar = cudf::string_scalar("NULL", true, cudf::test::get_default_stream()); + auto separators = cudf::strings_column_view(cudf::test::strings_column_wrapper()); + cudf::strings::format_list_column( + view, null_scalar, separators, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/strings/extract_test.cpp b/cpp/tests/streams/strings/extract_test.cpp new file mode 100644 index 00000000000..06570fc5b38 --- /dev/null +++ b/cpp/tests/streams/strings/extract_test.cpp @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include + +#include + +class StringsExtractTest : public cudf::test::BaseFixture {}; + +TEST_F(StringsExtractTest, Extract) +{ + auto input = cudf::test::strings_column_wrapper({"Joe Schmoe", "John Smith", "Jane Smith"}); + auto view = cudf::strings_column_view(input); + + auto const pattern = std::string("([A-Z][a-z]+)"); + auto const prog = cudf::strings::regex_program::create(pattern); + cudf::strings::extract(view, *prog, cudf::test::get_default_stream()); + cudf::strings::extract_all_record(view, *prog, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/strings/filter_test.cpp b/cpp/tests/streams/strings/filter_test.cpp new file mode 100644 index 00000000000..3c44eb81380 --- /dev/null +++ b/cpp/tests/streams/strings/filter_test.cpp @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include + +#include +#include + +class StringsFilterTest : public cudf::test::BaseFixture {}; + +static std::pair make_entry(char const* from, char const* to) +{ + cudf::char_utf8 in = 0; + cudf::char_utf8 out = 0; + cudf::strings::detail::to_char_utf8(from, in); + if (to) cudf::strings::detail::to_char_utf8(to, out); + return std::pair(in, out); +} + +TEST_F(StringsFilterTest, Translate) +{ + auto input = cudf::test::strings_column_wrapper({" aBc ", " ", "aaaa ", "\tb"}); + auto view = cudf::strings_column_view(input); + + std::vector> translate_table{ + make_entry("b", 0), make_entry("a", "A"), make_entry(" ", "_")}; + cudf::strings::translate(view, translate_table, cudf::test::get_default_stream()); +} + +TEST_F(StringsFilterTest, Filter) +{ + auto input = cudf::test::strings_column_wrapper({" aBc ", " ", "aaaa ", "\tb"}); + auto view = cudf::strings_column_view(input); + + std::vector> filter_table{ + make_entry("b", 0), make_entry("a", "A"), make_entry(" ", "_")}; + + auto const repl = cudf::string_scalar("X", true, cudf::test::get_default_stream()); + auto const keep = cudf::strings::filter_type::KEEP; + cudf::strings::filter_characters( + view, filter_table, keep, repl, cudf::test::get_default_stream()); +} + +TEST_F(StringsFilterTest, FilterTypes) +{ + auto input = cudf::test::strings_column_wrapper({" aBc ", " ", "aaaa ", "\tb"}); + auto view = cudf::strings_column_view(input); + + auto const verify_types = + cudf::strings::string_character_types::LOWER | cudf::strings::string_character_types::UPPER; + auto const all_types = cudf::strings::string_character_types::ALL_TYPES; + cudf::strings::all_characters_of_type( + view, verify_types, all_types, cudf::test::get_default_stream()); + + auto const repl = cudf::string_scalar("X", true, cudf::test::get_default_stream()); + auto const space_types = cudf::strings::string_character_types::SPACE; + cudf::strings::filter_characters_of_type( + view, all_types, repl, space_types, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/strings/replace_test.cpp b/cpp/tests/streams/strings/replace_test.cpp new file mode 100644 index 00000000000..fc87460b706 --- /dev/null +++ b/cpp/tests/streams/strings/replace_test.cpp @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include +#include + +#include + +class StringsReplaceTest : public cudf::test::BaseFixture {}; + +TEST_F(StringsReplaceTest, Replace) +{ + auto input = cudf::test::strings_column_wrapper({"Héllo", "thesé", "tést strings", ""}); + auto view = cudf::strings_column_view(input); + + auto const target = cudf::string_scalar("é", true, cudf::test::get_default_stream()); + auto const repl = cudf::string_scalar(" ", true, cudf::test::get_default_stream()); + cudf::strings::replace(view, target, repl, -1, cudf::test::get_default_stream()); + cudf::strings::replace(view, view, view, cudf::test::get_default_stream()); + cudf::strings::replace_slice(view, repl, 1, 2, cudf::test::get_default_stream()); + + auto const pattern = std::string("[a-z]"); + auto const prog = cudf::strings::regex_program::create(pattern); + cudf::strings::replace_re(view, *prog, repl, 1, cudf::test::get_default_stream()); + + cudf::test::strings_column_wrapper repls({"1", "a", " "}); + cudf::strings::replace_re(view, + {pattern, pattern, pattern}, + cudf::strings_column_view(repls), + cudf::strings::regex_flags::DEFAULT, + cudf::test::get_default_stream()); +} + +TEST_F(StringsReplaceTest, ReplaceRegex) +{ + auto input = cudf::test::strings_column_wrapper({"Héllo", "thesé", "tést strings", ""}); + auto view = cudf::strings_column_view(input); + + auto const repl = cudf::string_scalar(" ", true, cudf::test::get_default_stream()); + auto const pattern = std::string("[a-z]"); + auto const prog = cudf::strings::regex_program::create(pattern); + cudf::strings::replace_re(view, *prog, repl, 1, cudf::test::get_default_stream()); + + cudf::test::strings_column_wrapper repls({"1", "a", " "}); + cudf::strings::replace_re(view, + {pattern, pattern, pattern}, + cudf::strings_column_view(repls), + cudf::strings::regex_flags::DEFAULT, + cudf::test::get_default_stream()); +} + +TEST_F(StringsReplaceTest, ReplaceRegexBackref) +{ + auto input = cudf::test::strings_column_wrapper({"Héllo thesé", "tést strings"}); + auto view = cudf::strings_column_view(input); + + auto const repl_template = std::string("\\2-\\1"); + auto const pattern = std::string("(\\w) (\\w)"); + auto const prog = cudf::strings::regex_program::create(pattern); + cudf::strings::replace_with_backrefs( + view, *prog, repl_template, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/strings/reverse_test.cpp b/cpp/tests/streams/strings/reverse_test.cpp new file mode 100644 index 00000000000..83dcf24594e --- /dev/null +++ b/cpp/tests/streams/strings/reverse_test.cpp @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include + +#include +#include + +class StringsReverseTest : public cudf::test::BaseFixture {}; + +TEST_F(StringsReverseTest, Reverse) +{ + auto input = cudf::test::strings_column_wrapper({"aBcdef", " ", "12345"}); + auto view = cudf::strings_column_view(input); + + cudf::strings::reverse(view, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/strings/split_test.cpp b/cpp/tests/streams/strings/split_test.cpp new file mode 100644 index 00000000000..24247f6f79c --- /dev/null +++ b/cpp/tests/streams/strings/split_test.cpp @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include +#include + +#include + +class StringsSplitTest : public cudf::test::BaseFixture {}; + +TEST_F(StringsSplitTest, SplitPartition) +{ + auto input = cudf::test::strings_column_wrapper({"Héllo thesé", "tést strings", ""}); + auto view = cudf::strings_column_view(input); + + auto const delimiter = cudf::string_scalar("é", true, cudf::test::get_default_stream()); + cudf::strings::split(view, delimiter, -1, cudf::test::get_default_stream()); + cudf::strings::rsplit(view, delimiter, -1, cudf::test::get_default_stream()); + cudf::strings::split_record(view, delimiter, -1, cudf::test::get_default_stream()); + cudf::strings::rsplit_record(view, delimiter, -1, cudf::test::get_default_stream()); + cudf::strings::partition(view, delimiter, cudf::test::get_default_stream()); + cudf::strings::rpartition(view, delimiter, cudf::test::get_default_stream()); + + auto const pattern = std::string("\\s"); + auto const prog = cudf::strings::regex_program::create(pattern); + cudf::strings::split_re(view, *prog, -1, cudf::test::get_default_stream()); + cudf::strings::split_record_re(view, *prog, -1, cudf::test::get_default_stream()); + cudf::strings::rsplit_re(view, *prog, -1, cudf::test::get_default_stream()); + cudf::strings::rsplit_record_re(view, *prog, -1, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/strings/strings_tests.cpp b/cpp/tests/streams/strings/strings_tests.cpp new file mode 100644 index 00000000000..0db467a6895 --- /dev/null +++ b/cpp/tests/streams/strings/strings_tests.cpp @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +#include +#include +#include + +#include + +class StringsTest : public cudf::test::BaseFixture {}; + +TEST_F(StringsTest, Strip) +{ + auto input = cudf::test::strings_column_wrapper({" aBc ", " ", "aaaa ", "\tb"}); + auto view = cudf::strings_column_view(input); + + auto const strip = cudf::string_scalar(" ", true, cudf::test::get_default_stream()); + auto const side = cudf::strings::side_type::BOTH; + cudf::strings::strip(view, side, strip, cudf::test::get_default_stream()); +} + +TEST_F(StringsTest, Pad) +{ + auto input = cudf::test::strings_column_wrapper({"333", "", "4444", "1"}); + auto view = cudf::strings_column_view(input); + + auto const side = cudf::strings::side_type::BOTH; + cudf::strings::pad(view, 6, side, " ", cudf::test::get_default_stream()); + cudf::strings::zfill(view, 6, cudf::test::get_default_stream()); +} + +TEST_F(StringsTest, Wrap) +{ + auto input = cudf::test::strings_column_wrapper({"the quick brown fox jumped"}); + auto view = cudf::strings_column_view(input); + + cudf::strings::wrap(view, 6, cudf::test::get_default_stream()); +} + +TEST_F(StringsTest, Slice) +{ + auto input = cudf::test::strings_column_wrapper({"hello", "these", "are test strings"}); + auto view = cudf::strings_column_view(input); + + auto start = cudf::numeric_scalar(2, true, cudf::test::get_default_stream()); + auto stop = cudf::numeric_scalar(5, true, cudf::test::get_default_stream()); + auto step = cudf::numeric_scalar(1, true, cudf::test::get_default_stream()); + cudf::strings::slice_strings(view, start, stop, step, cudf::test::get_default_stream()); + + auto starts = cudf::test::fixed_width_column_wrapper({1, 2, 3}); + auto stops = cudf::test::fixed_width_column_wrapper({4, 5, 6}); + cudf::strings::slice_strings(view, starts, stops, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/text/replace_test.cpp b/cpp/tests/streams/text/replace_test.cpp new file mode 100644 index 00000000000..7617f886f9d --- /dev/null +++ b/cpp/tests/streams/text/replace_test.cpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include + +class TextReplaceTest : public cudf::test::BaseFixture {}; + +TEST_F(TextReplaceTest, Replace) +{ + auto const input = cudf::test::strings_column_wrapper({"the fox jumped over the dog"}); + auto const targets = cudf::test::strings_column_wrapper({"the", "dog"}); + auto const repls = cudf::test::strings_column_wrapper({"_", ""}); + auto const delimiter = cudf::string_scalar{" ", true, cudf::test::get_default_stream()}; + nvtext::replace_tokens(cudf::strings_column_view(input), + cudf::strings_column_view(targets), + cudf::strings_column_view(repls), + delimiter, + cudf::test::get_default_stream()); +} + +TEST_F(TextReplaceTest, Filter) +{ + auto const input = cudf::test::strings_column_wrapper({"one two three", "four five six"}); + auto const delimiter = cudf::string_scalar{" ", true, cudf::test::get_default_stream()}; + auto const repl = cudf::string_scalar{"_", true, cudf::test::get_default_stream()}; + nvtext::filter_tokens( + cudf::strings_column_view(input), 1, delimiter, repl, cudf::test::get_default_stream()); +} + +TEST_F(TextReplaceTest, NormalizeSpaces) +{ + auto input = + cudf::test::strings_column_wrapper({"the\tquick brown\nfox", "jumped\rover the lazy\r\t\n"}); + nvtext::normalize_spaces(cudf::strings_column_view(input), cudf::test::get_default_stream()); +} + +TEST_F(TextReplaceTest, NormalizeCharacters) +{ + auto input = cudf::test::strings_column_wrapper({"abc£def", "éè â îô\taeio", "\tĂĆĖÑ Ü"}); + nvtext::normalize_characters( + cudf::strings_column_view(input), false, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/text/tokenize_test.cpp b/cpp/tests/streams/text/tokenize_test.cpp new file mode 100644 index 00000000000..b281fbc2c0c --- /dev/null +++ b/cpp/tests/streams/text/tokenize_test.cpp @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include + +class TextTokenizeTest : public cudf::test::BaseFixture {}; + +TEST_F(TextTokenizeTest, Tokenize) +{ + auto const input = cudf::test::strings_column_wrapper({"the fox jumped", "over thé dog"}); + auto const view = cudf::strings_column_view(input); + auto const delimiter = cudf::string_scalar{" ", true, cudf::test::get_default_stream()}; + nvtext::tokenize(view, delimiter, cudf::test::get_default_stream()); + nvtext::count_tokens(view, delimiter, cudf::test::get_default_stream()); + auto const delimiters = cudf::test::strings_column_wrapper({" ", "o", "é"}); + nvtext::tokenize(view, cudf::strings_column_view(delimiters), cudf::test::get_default_stream()); + nvtext::count_tokens( + view, cudf::strings_column_view(delimiters), cudf::test::get_default_stream()); +} + +TEST_F(TextTokenizeTest, CharacterTokenize) +{ + auto const input = + cudf::test::strings_column_wrapper({"the", "fox", "jumped", "over", "thé", "dog"}); + nvtext::character_tokenize(cudf::strings_column_view(input), cudf::test::get_default_stream()); +} + +TEST_F(TextTokenizeTest, Detokenize) +{ + auto const input = + cudf::test::strings_column_wrapper({"the", "fox", "jumped", "over", "thé", "dog"}); + auto const view = cudf::strings_column_view(input); + auto const indices = cudf::test::fixed_width_column_wrapper({0, 0, 0, 1, 1, 1}); + auto const separator = cudf::string_scalar{" ", true, cudf::test::get_default_stream()}; + nvtext::detokenize(view, indices, separator, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/unary_test.cpp b/cpp/tests/streams/unary_test.cpp new file mode 100644 index 00000000000..1734c0c4e9f --- /dev/null +++ b/cpp/tests/streams/unary_test.cpp @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include + +class UnaryTest : public cudf::test::BaseFixture {}; + +TEST_F(UnaryTest, UnaryOperation) +{ + cudf::test::fixed_width_column_wrapper const column{10, 20, 30, 40, 50}; + + cudf::unary_operation(column, cudf::unary_operator::ABS, cudf::test::get_default_stream()); +} + +TEST_F(UnaryTest, IsNull) +{ + cudf::test::fixed_width_column_wrapper const column{10, 20, 30, 40, 50}; + + cudf::is_null(column, cudf::test::get_default_stream()); +} + +TEST_F(UnaryTest, IsValid) +{ + cudf::test::fixed_width_column_wrapper const column{10, 20, 30, 40, 50}; + + cudf::is_valid(column, cudf::test::get_default_stream()); +} + +TEST_F(UnaryTest, Cast) +{ + cudf::test::fixed_width_column_wrapper const column{10, 20, 30, 40, 50}; + + cudf::cast(column, cudf::data_type{cudf::type_id::INT64}, cudf::test::get_default_stream()); +} + +TEST_F(UnaryTest, IsNan) +{ + cudf::test::fixed_width_column_wrapper const column{10, 20, 30, 40, 50}; + + cudf::is_nan(column, cudf::test::get_default_stream()); +} + +TEST_F(UnaryTest, IsNotNan) +{ + cudf::test::fixed_width_column_wrapper const column{10, 20, 30, 40, 50}; + + cudf::is_not_nan(column, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/strings/booleans_tests.cpp b/cpp/tests/strings/booleans_tests.cpp index 0c7fc992065..469ca77a4c5 100644 --- a/cpp/tests/strings/booleans_tests.cpp +++ b/cpp/tests/strings/booleans_tests.cpp @@ -36,7 +36,8 @@ TEST_F(StringsConvertTest, ToBooleans) thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); auto strings_view = cudf::strings_column_view(strings); - auto results = cudf::strings::to_booleans(strings_view); + auto true_scalar = cudf::string_scalar("true"); + auto results = cudf::strings::to_booleans(strings_view, true_scalar); std::vector h_expected{false, false, false, true, false, false}; cudf::test::fixed_width_column_wrapper expected( @@ -60,26 +61,46 @@ TEST_F(StringsConvertTest, FromBooleans) h_column.end(), thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - auto results = cudf::strings::from_booleans(column); + auto true_scalar = cudf::string_scalar("true"); + auto false_scalar = cudf::string_scalar("false"); + auto results = cudf::strings::from_booleans(column, true_scalar, false_scalar); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings); } TEST_F(StringsConvertTest, ZeroSizeStringsColumnBoolean) { auto const zero_size_column = cudf::make_empty_column(cudf::type_id::BOOL8)->view(); - auto results = cudf::strings::from_booleans(zero_size_column); + auto true_scalar = cudf::string_scalar("true"); + auto false_scalar = cudf::string_scalar("false"); + auto results = cudf::strings::from_booleans(zero_size_column, true_scalar, false_scalar); cudf::test::expect_column_empty(results->view()); } TEST_F(StringsConvertTest, ZeroSizeBooleansColumn) { auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view(); - auto results = cudf::strings::to_booleans(zero_size_strings_column); + auto true_scalar = cudf::string_scalar("true"); + auto results = cudf::strings::to_booleans(zero_size_strings_column, true_scalar); EXPECT_EQ(0, results->size()); } TEST_F(StringsConvertTest, BooleanError) { - auto column = cudf::make_numeric_column(cudf::data_type{cudf::type_id::INT32}, 100); - EXPECT_THROW(cudf::strings::from_booleans(column->view()), cudf::logic_error); + auto int_column = cudf::test::fixed_width_column_wrapper({1, 2, 3}); + auto true_scalar = 
cudf::string_scalar("true"); + auto false_scalar = cudf::string_scalar("false"); + EXPECT_THROW(cudf::strings::from_booleans(int_column, true_scalar, false_scalar), + cudf::logic_error); + + auto bool_column = cudf::test::fixed_width_column_wrapper({1, 0, 1}); + auto null_scalar = cudf::string_scalar("", false); + EXPECT_THROW(cudf::strings::from_booleans(bool_column, null_scalar, false_scalar), + cudf::logic_error); + EXPECT_THROW(cudf::strings::from_booleans(bool_column, true_scalar, null_scalar), + cudf::logic_error); + auto empty_scalar = cudf::string_scalar("", true); + EXPECT_THROW(cudf::strings::from_booleans(int_column, empty_scalar, false_scalar), + cudf::logic_error); + EXPECT_THROW(cudf::strings::from_booleans(int_column, true_scalar, empty_scalar), + cudf::logic_error); } diff --git a/cpp/tests/strings/format_lists_tests.cpp b/cpp/tests/strings/format_lists_tests.cpp index 95dc9725afc..6196b8ed6ad 100644 --- a/cpp/tests/strings/format_lists_tests.cpp +++ b/cpp/tests/strings/format_lists_tests.cpp @@ -60,8 +60,9 @@ TEST_F(StringsFormatListsTest, WithNulls) cudf::test::iterators::null_at(1)}; auto const view = cudf::lists_column_view(input); - auto results = cudf::strings::format_list_column(view); - auto expected = cudf::test::strings_column_wrapper( + auto null_scalar = cudf::string_scalar("NULL"); + auto results = cudf::strings::format_list_column(view, null_scalar); + auto expected = cudf::test::strings_column_wrapper( {"[a,NULL,ccc]", "NULL", "[NULL,bb,ddd]", "[zzz,xxxxx]", "[v,,NULL,w]"}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } @@ -132,11 +133,13 @@ TEST_F(StringsFormatListsTest, SlicedLists) "[ééé,12345abcdef]", "[www,12345]"}); + auto null_scalar = cudf::string_scalar("NULL"); + // set of slice intervals: covers slicing the front, back, and middle std::vector> index_pairs({{0, 11}, {0, 4}, {3, 8}, {5, 11}}); for (auto indexes : index_pairs) { auto sliced = cudf::lists_column_view(cudf::slice(input, {indexes.first, indexes.second})[0]); - auto results = cudf::strings::format_list_column(sliced); + auto results = cudf::strings::format_list_column(sliced, null_scalar); auto expected = cudf::test::strings_column_wrapper(h_expected.begin() + indexes.first, h_expected.begin() + indexes.second); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); diff --git a/cpp/tests/strings/integers_tests.cpp b/cpp/tests/strings/integers_tests.cpp index 59805f9cb6d..c8f292f55b2 100644 --- a/cpp/tests/strings/integers_tests.cpp +++ b/cpp/tests/strings/integers_tests.cpp @@ -456,3 +456,29 @@ TEST_F(StringsConvertTest, IntegerToHexWithNull) auto results = cudf::strings::integers_to_hex(integers); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } + +TEST_F(StringsConvertTest, IntegerConvertErrors) +{ + cudf::test::fixed_width_column_wrapper bools( + {true, true, false, false, true, true, false, true}); + cudf::test::fixed_width_column_wrapper floats( + {123456.0, -1.0, 0.0, 0.0, 12.0, 12345.0, 123456789.0}); + EXPECT_THROW(cudf::strings::integers_to_hex(bools), cudf::logic_error); + EXPECT_THROW(cudf::strings::integers_to_hex(floats), cudf::logic_error); + EXPECT_THROW(cudf::strings::from_integers(bools), cudf::logic_error); + EXPECT_THROW(cudf::strings::from_integers(floats), cudf::logic_error); + + auto input = cudf::test::strings_column_wrapper({"123456", "-1", "0"}); + auto view = cudf::strings_column_view(input); + EXPECT_THROW(cudf::strings::to_integers(view, cudf::data_type(cudf::type_id::BOOL8)), + cudf::logic_error); + EXPECT_THROW(cudf::strings::to_integers(view, 
cudf::data_type(cudf::type_id::FLOAT32)), + cudf::logic_error); + EXPECT_THROW(cudf::strings::to_integers(view, cudf::data_type(cudf::type_id::TIMESTAMP_SECONDS)), + cudf::logic_error); + EXPECT_THROW( + cudf::strings::to_integers(view, cudf::data_type(cudf::type_id::DURATION_MILLISECONDS)), + cudf::logic_error); + EXPECT_THROW(cudf::strings::to_integers(view, cudf::data_type(cudf::type_id::DECIMAL32)), + cudf::logic_error); +} diff --git a/cpp/tests/strings/replace_tests.cpp b/cpp/tests/strings/replace_tests.cpp index f143983aded..f04bb832f09 100644 --- a/cpp/tests/strings/replace_tests.cpp +++ b/cpp/tests/strings/replace_tests.cpp @@ -246,6 +246,28 @@ TEST_F(StringsReplaceTest, ReplaceEndOfString) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } +TEST_F(StringsReplaceTest, ReplaceAdjacentMultiByteTarget) +{ + auto input = cudf::test::strings_column_wrapper({"ééééééé", "eéeéeée", "eeeeeee"}); + auto strings_view = cudf::strings_column_view(input); + // replace all occurrences of 'é' with 'e' + cudf::test::strings_column_wrapper expected({"eeeeeee", "eeeeeee", "eeeeeee"}); + + auto stream = cudf::get_default_stream(); + auto mr = rmm::mr::get_current_device_resource(); + + auto target = cudf::string_scalar("é", true, stream); + auto repl = cudf::string_scalar("e", true, stream); + auto results = cudf::strings::replace(strings_view, target, repl); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + results = cudf::strings::detail::replace( + strings_view, target, repl, -1, stream, mr); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + results = cudf::strings::detail::replace( + strings_view, target, repl, -1, stream, mr); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + TEST_F(StringsReplaceTest, ReplaceSlice) { std::vector h_strings{"Héllo", "thesé", nullptr, "ARE THE", "tést strings", ""}; diff --git a/cpp/tests/text/bpe_tests.cpp b/cpp/tests/text/bpe_tests.cpp index 234d8c4fecc..a13b61e0ba4 100644 --- a/cpp/tests/text/bpe_tests.cpp +++ b/cpp/tests/text/bpe_tests.cpp @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include +#include #include #include @@ -24,44 +24,41 @@ #include #include -struct TextBPETokenize : public cudf::test::BaseFixture {}; +struct TextBytePairEncoding : public cudf::test::BaseFixture {}; -TEST_F(TextBPETokenize, BytePairEncoding) +TEST_F(TextBytePairEncoding, BytePairEncoding) { // partial table based on values from https://huggingface.co/gpt2/raw/main/merges.txt auto mpt = cudf::test::strings_column_wrapper({ - "e n", // 12 - "i t", // 14 - "i s", // 15 - "e s", // 18 - "en t", // 42 - "c e", // 88 - "es t", // 139 - "en ce", // 338 - "T h", // 561 - "Th is", // 956 - "t est", // 9032 - "s ent", // 33830 + "e n", // 14 + "i t", // 16 + "i s", // 17 + "e s", // 20 + "en t", // 44 + "c e", // 90 + "es t", // 141 + "en ce", // 340 + "t h", // 146 + "h i", // 5049 + "th is", // 5407 + "t est", // 9034 + "s i", // 13142 + "s ent" // 33832 }); - nvtext::bpe_merge_pairs merge_pairs{cudf::strings_column_view(mpt)}; + auto merge_pairs = nvtext::load_merge_pairs(cudf::strings_column_view(mpt)); auto validity = cudf::test::iterators::null_at(4); - cudf::test::strings_column_wrapper input({" This\tis it\n", - "This is test-sentence-1", - "This is test sentence-2", - "This-is test sentence 3", - "", - ""}, - validity); + cudf::test::strings_column_wrapper input( + {"thisisit", "thisis test-sentence-1", "thisistestsentence-2", "this-istestsentence 3", "", ""}, + validity); auto sv = cudf::strings_column_view(input); - auto results = nvtext::byte_pair_encoding(sv, merge_pairs); - - auto expected = cudf::test::strings_column_wrapper({" This is it", - "This is test - sent ence - 1", - "This is test sent ence - 2", - "This - is test sent ence 3", + auto results = nvtext::byte_pair_encoding(sv, *merge_pairs); + auto expected = cudf::test::strings_column_wrapper({"this is it", + "this is test - sent ence - 1", + "this is test sent ence - 2", + "this - is test sent ence 3", "", ""}, validity); @@ -70,41 +67,68 @@ TEST_F(TextBPETokenize, BytePairEncoding) auto sliced = cudf::slice(input, {1, 4}).front(); auto sliced_expected = cudf::slice(expected, {1, 4}).front(); - results = nvtext::byte_pair_encoding(cudf::strings_column_view(sliced), merge_pairs); + sv = cudf::strings_column_view(sliced); + results = nvtext::byte_pair_encoding(sv, *merge_pairs); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), sliced_expected); } -TEST_F(TextBPETokenize, BytePairEncodingSeparator) +TEST_F(TextBytePairEncoding, BytePairEncodingSeparator) { auto mpt = cudf::test::strings_column_wrapper( - {"e n", "i t", "e s", "en t", "c e", "es t", "en ce", "t est", "s ent"}); - nvtext::bpe_merge_pairs merge_pairs{cudf::strings_column_view(mpt)}; + {"Ġ t", "Ġt he", "h e", "e n", "i t", "e s", "en t", "c e", "es t", "en ce", "t est", "s ent"}); + + auto merge_pairs = nvtext::load_merge_pairs(cudf::strings_column_view(mpt)); cudf::test::strings_column_wrapper input( - {"test-sentence-1", "test sentence-2", "test sentence 3", " test sentence 4 "}); + {"Ġthe test sentence", "test Ġthe sentence", "Ġthetest sentence", "testĠthesentence"}); auto sv = cudf::strings_column_view(input); - auto results = nvtext::byte_pair_encoding(sv, merge_pairs, std::string(" Ġ")); + auto results = nvtext::byte_pair_encoding(sv, *merge_pairs, std::string("$")); + + auto expected = cudf::test::strings_column_wrapper({"Ġthe$ $test$ $sent$ence", + "test$ $Ġthe$ $sent$ence", + "Ġthe$test$ $sent$ence", + "test$Ġthe$sent$ence"}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); +} + +TEST_F(TextBytePairEncoding, BPEAdjacentPairs) +{ 
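+  // Merge table is built from runs of "mm" and "?!" so that applying one merge creates a new adjacent pair that must also be considered by the encoder.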
+ auto mpt = cudf::test::strings_column_wrapper({ + "▁ H", // 157 + "m m", // 10742 + "? !", // 50675 + "▁H mm", // 174381 + "mm m", // 262776 + "?! !", // 352313 + "? !?", // 352314 + "mm mm", // 387733 + "▁H m", // 471269 + "?! ?!", // 506981 + "?!? !", // 506982 + }); + auto merge_pairs = nvtext::load_merge_pairs(cudf::strings_column_view(mpt)); + + cudf::test::strings_column_wrapper input({"▁Hmmmmm", "?!?!?!"}); - auto expected = cudf::test::strings_column_wrapper( - {"test - sent ence - 1", "test Ġsent ence - 2", "test Ġsent ence Ġ3", " Ġtest Ġsent ence Ġ4"}); + auto results = nvtext::byte_pair_encoding(cudf::strings_column_view(input), *merge_pairs); + auto expected = cudf::test::strings_column_wrapper({"▁Hmm mmm", "?!?! ?!"}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); } -TEST_F(TextBPETokenize, BPE_Empty) +TEST_F(TextBytePairEncoding, BPE_Empty) { - auto mpt = cudf::test::strings_column_wrapper({"i s", "i t"}); - nvtext::bpe_merge_pairs merge_pairs{mpt.release()}; - auto empty = cudf::make_empty_column(cudf::type_id::STRING); - auto results = nvtext::byte_pair_encoding(cudf::strings_column_view(empty->view()), merge_pairs); + auto mpt = cudf::test::strings_column_wrapper({"i s", "i t"}); + auto merge_pairs = nvtext::load_merge_pairs(cudf::strings_column_view(mpt)); + auto empty = cudf::make_empty_column(cudf::type_id::STRING); + auto results = nvtext::byte_pair_encoding(cudf::strings_column_view(empty->view()), *merge_pairs); EXPECT_EQ(0, results->size()); } -TEST_F(TextBPETokenize, BPE_Error) +TEST_F(TextBytePairEncoding, BPE_Error) { auto empty = cudf::make_empty_column(cudf::type_id::STRING); - nvtext::bpe_merge_pairs merge_pairs{std::move(empty)}; - cudf::test::strings_column_wrapper input({"isit"}); - EXPECT_THROW(nvtext::byte_pair_encoding(cudf::strings_column_view(input), merge_pairs), - cudf::logic_error); + EXPECT_THROW(nvtext::load_merge_pairs(cudf::strings_column_view(*empty)), cudf::logic_error); + auto null_pairs = cudf::test::strings_column_wrapper({"", ""}, {1, 0}); + EXPECT_THROW(nvtext::load_merge_pairs(cudf::strings_column_view(null_pairs)), cudf::logic_error); } diff --git a/cpp/tests/text/tokenize_tests.cpp b/cpp/tests/text/tokenize_tests.cpp index d78f2dfbdf3..ea36e13de6f 100644 --- a/cpp/tests/text/tokenize_tests.cpp +++ b/cpp/tests/text/tokenize_tests.cpp @@ -208,14 +208,16 @@ TEST_F(TextTokenizeTest, Vocabulary) {"ate", "chased", "cheese", "dog", "fox", "jumped", "mouse", "mousé", "over", "the"}); auto vocab = nvtext::load_vocabulary(cudf::strings_column_view(vocabulary)); - auto validity = cudf::test::iterators::null_at(1); - cudf::test::strings_column_wrapper input({"the fox jumped over the dog", - "the dog chased the cat", - "the cat chased the mouse", - "the mousé ate cheese", - "", - ""}, - validity); + auto validity = cudf::test::iterators::null_at(5); + auto input = cudf::test::strings_column_wrapper({" the fox jumped over the dog ", + " the dog chased the cat", + "", + "the cat chased the mouse ", + "the mousé ate cheese", + "", + "dog"}, + validity); + auto input_view = cudf::strings_column_view(input); auto delimiter = cudf::string_scalar(" "); auto default_id = -7; // should be the token for the missing 'cat' @@ -225,12 +227,55 @@ TEST_F(TextTokenizeTest, Vocabulary) // clang-format off LCW expected({LCW{ 9, 4, 5, 8, 9, 3}, LCW{ 9, 3, 1, 9,-7}, + LCW{}, LCW{ 9,-7, 1, 9, 6}, LCW{ 9, 7, 0, 2}, - LCW{}, LCW{}}, + LCW{}, LCW{3}}, validity); // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + auto sliced = 
cudf::slice(input, {1, 4}).front(); + auto sliced_expected = cudf::slice(expected, {1, 4}).front(); + + input_view = cudf::strings_column_view(sliced); + + results = nvtext::tokenize_with_vocabulary(input_view, *vocab, delimiter, default_id); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), sliced_expected); +} + +TEST_F(TextTokenizeTest, VocabularyLongStrings) +{ + cudf::test::strings_column_wrapper vocabulary( + {"ate", "chased", "cheese", "dog", "fox", "jumped", "mouse", "mousé", "over", "the"}); + auto vocab = nvtext::load_vocabulary(cudf::strings_column_view(vocabulary)); + + std::vector h_strings( + 4, + "the fox jumped chased the dog cheese mouse at the over there dog mouse cat plus the horse " + "jumped over the mousé house with the dog "); + cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end()); + auto input_view = cudf::strings_column_view(input); + auto delimiter = cudf::string_scalar(" "); + auto default_id = -1; + auto results = nvtext::tokenize_with_vocabulary(input_view, *vocab, delimiter, default_id); + + using LCW = cudf::test::lists_column_wrapper; + // clang-format off + LCW expected({LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 7, -1, -1, 9, 3}, + LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 7, -1, -1, 9, 3}, + LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 7, -1, -1, 9, 3}, + LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 7, -1, -1, 9, 3}}); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + auto sliced = cudf::slice(input, {1, 3}).front(); + auto sliced_expected = cudf::slice(expected, {1, 3}).front(); + + input_view = cudf::strings_column_view(sliced); + + results = nvtext::tokenize_with_vocabulary(input_view, *vocab, delimiter, default_id); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), sliced_expected); } TEST_F(TextTokenizeTest, TokenizeErrors) diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu index 620e0bfe8de..f54ea28d9b2 100644 --- a/cpp/tests/utilities/column_utilities.cu +++ b/cpp/tests/utilities/column_utilities.cu @@ -14,28 +14,24 @@ * limitations under the License. 
*/ +#include +#include +#include +#include +#include + #include #include #include #include #include -#include -#include -#include +#include #include -#include #include #include -#include #include #include -#include -#include -#include -#include -#include - #include #include @@ -928,396 +924,6 @@ std::vector bitmask_to_host(cudf::column_view const& c) } } -namespace { - -template >* = nullptr> -static auto numeric_to_string_precise(T value) -{ - return std::to_string(value); -} - -template >* = nullptr> -static auto numeric_to_string_precise(T value) -{ - std::ostringstream o; - o << std::setprecision(std::numeric_limits::max_digits10) << value; - return o.str(); -} - -static auto duration_suffix(cudf::duration_D) { return " days"; } - -static auto duration_suffix(cudf::duration_s) { return " seconds"; } - -static auto duration_suffix(cudf::duration_ms) { return " milliseconds"; } - -static auto duration_suffix(cudf::duration_us) { return " microseconds"; } - -static auto duration_suffix(cudf::duration_ns) { return " nanoseconds"; } - -std::string get_nested_type_str(cudf::column_view const& view) -{ - if (view.type().id() == cudf::type_id::LIST) { - lists_column_view lcv(view); - return cudf::type_to_name(view.type()) + "<" + (get_nested_type_str(lcv.child())) + ">"; - } - - if (view.type().id() == cudf::type_id::STRUCT) { - std::ostringstream out; - - out << cudf::type_to_name(view.type()) + "<"; - std::transform(view.child_begin(), - view.child_end(), - std::ostream_iterator(out, ","), - [&out](auto const col) { return get_nested_type_str(col); }); - out << ">"; - return out.str(); - } - - return cudf::type_to_name(view.type()); -} - -template -std::string nested_offsets_to_string(NestedColumnView const& c, std::string const& delimiter = ", ") -{ - column_view offsets = (c.parent()).child(NestedColumnView::offsets_column_index); - CUDF_EXPECTS(offsets.type().id() == type_id::INT32, - "Column does not appear to be an offsets column"); - CUDF_EXPECTS(offsets.offset() == 0, "Offsets column has an internal offset!"); - size_type output_size = c.size() + 1; - - // the first offset value to normalize everything against - size_type first = - cudf::detail::get_value(offsets, c.offset(), cudf::test::get_default_stream()); - rmm::device_uvector shifted_offsets(output_size, cudf::test::get_default_stream()); - - // normalize the offset values for the column offset - size_type const* d_offsets = offsets.head() + c.offset(); - thrust::transform( - rmm::exec_policy(cudf::test::get_default_stream()), - d_offsets, - d_offsets + output_size, - shifted_offsets.begin(), - [first] __device__(int32_t offset) { return static_cast(offset - first); }); - - auto const h_shifted_offsets = - cudf::detail::make_host_vector_sync(shifted_offsets, cudf::test::get_default_stream()); - std::ostringstream buffer; - for (size_t idx = 0; idx < h_shifted_offsets.size(); idx++) { - buffer << h_shifted_offsets[idx]; - if (idx < h_shifted_offsets.size() - 1) { buffer << delimiter; } - } - return buffer.str(); -} - -struct column_view_printer { - template ()>* = nullptr> - void operator()(cudf::column_view const& col, std::vector& out, std::string const&) - { - auto h_data = cudf::test::to_host(col); - - out.resize(col.size()); - - if (col.nullable()) { - std::transform(thrust::make_counting_iterator(size_type{0}), - thrust::make_counting_iterator(col.size()), - out.begin(), - [&h_data](auto idx) { - return bit_is_set(h_data.second.data(), idx) - ? 
numeric_to_string_precise(h_data.first[idx]) - : std::string("NULL"); - }); - - } else { - std::transform(h_data.first.begin(), h_data.first.end(), out.begin(), [](Element el) { - return numeric_to_string_precise(el); - }); - } - } - - template ()>* = nullptr> - void operator()(cudf::column_view const& col, - std::vector& out, - std::string const& indent) - { - // For timestamps, convert timestamp column to column of strings, then - // call string version - std::string format = [&]() { - if constexpr (std::is_same_v) { - return std::string{"%Y-%m-%dT%H:%M:%SZ"}; - } else if constexpr (std::is_same_v) { - return std::string{"%Y-%m-%dT%H:%M:%S.%3fZ"}; - } else if constexpr (std::is_same_v) { - return std::string{"%Y-%m-%dT%H:%M:%S.%6fZ"}; - } else if constexpr (std::is_same_v) { - return std::string{"%Y-%m-%dT%H:%M:%S.%9fZ"}; - } - return std::string{"%Y-%m-%d"}; - }(); - - auto col_as_strings = cudf::strings::from_timestamps(col, format); - if (col_as_strings->size() == 0) { return; } - - this->template operator()(*col_as_strings, out, indent); - } - - template ()>* = nullptr> - void operator()(cudf::column_view const& col, std::vector& out, std::string const&) - { - auto const h_data = cudf::test::to_host(col); - if (col.nullable()) { - std::transform(thrust::make_counting_iterator(size_type{0}), - thrust::make_counting_iterator(col.size()), - std::back_inserter(out), - [&h_data](auto idx) { - return h_data.second.empty() || bit_is_set(h_data.second.data(), idx) - ? static_cast(h_data.first[idx]) - : std::string("NULL"); - }); - } else { - std::transform(std::cbegin(h_data.first), - std::cend(h_data.first), - std::back_inserter(out), - [col](auto const& fp) { return static_cast(fp); }); - } - } - - template >* = nullptr> - void operator()(cudf::column_view const& col, std::vector& out, std::string const&) - { - // - // Implementation for strings, call special to_host variant - // - if (col.is_empty()) return; - auto h_data = cudf::test::to_host(col); - - // explicitly replace some special whitespace characters with their literal equivalents - auto cleaned = [](std::string_view in) { - std::string out(in); - auto replace_char = [](std::string& out, char c, std::string_view repl) { - for (std::string::size_type pos{}; out.npos != (pos = out.find(c, pos)); pos++) { - out.replace(pos, 1, repl); - } - }; - replace_char(out, '\a', "\\a"); - replace_char(out, '\b', "\\b"); - replace_char(out, '\f', "\\f"); - replace_char(out, '\r', "\\r"); - replace_char(out, '\t', "\\t"); - replace_char(out, '\n', "\\n"); - replace_char(out, '\v', "\\v"); - return out; - }; - - out.resize(col.size()); - std::transform(thrust::make_counting_iterator(size_type{0}), - thrust::make_counting_iterator(col.size()), - out.begin(), - [&](auto idx) { - return h_data.second.empty() || bit_is_set(h_data.second.data(), idx) - ? 
cleaned(h_data.first[idx]) - : std::string("NULL"); - }); - } - - template >* = nullptr> - void operator()(cudf::column_view const& col, std::vector& out, std::string const&) - { - cudf::dictionary_column_view dictionary(col); - if (col.is_empty()) return; - std::vector keys = to_strings(dictionary.keys()); - std::vector indices = to_strings({dictionary.indices().type(), - dictionary.size(), - dictionary.indices().head(), - dictionary.null_mask(), - dictionary.null_count(), - dictionary.offset()}); - out.insert(out.end(), keys.begin(), keys.end()); - if (!indices.empty()) { - std::string first = "\x08 : " + indices.front(); // use : as delimiter - out.push_back(first); // between keys and indices - out.insert(out.end(), indices.begin() + 1, indices.end()); - } - } - - // Print the tick counts with the units - template ()>* = nullptr> - void operator()(cudf::column_view const& col, std::vector& out, std::string const&) - { - auto h_data = cudf::test::to_host(col); - - out.resize(col.size()); - - if (col.nullable()) { - std::transform(thrust::make_counting_iterator(size_type{0}), - thrust::make_counting_iterator(col.size()), - out.begin(), - [&h_data](auto idx) { - return bit_is_set(h_data.second.data(), idx) - ? numeric_to_string_precise(h_data.first[idx].count()) + - duration_suffix(h_data.first[idx]) - : std::string("NULL"); - }); - - } else { - std::transform(h_data.first.begin(), h_data.first.end(), out.begin(), [](Element el) { - return numeric_to_string_precise(el.count()) + duration_suffix(el); - }); - } - } - - template >* = nullptr> - void operator()(cudf::column_view const& col, - std::vector& out, - std::string const& indent) - { - lists_column_view lcv(col); - - // propagate slicing to the child if necessary - column_view child = lcv.get_sliced_child(cudf::test::get_default_stream()); - bool const is_sliced = lcv.offset() > 0 || child.offset() > 0; - - std::string tmp = - get_nested_type_str(col) + (is_sliced ? "(sliced)" : "") + ":\n" + indent + - "Length : " + std::to_string(lcv.size()) + "\n" + indent + - "Offsets : " + (lcv.size() > 0 ? nested_offsets_to_string(lcv) : "") + "\n" + - (lcv.parent().nullable() - ? indent + "Null count: " + std::to_string(lcv.null_count()) + "\n" + - detail::to_string(bitmask_to_host(col), col.size(), indent) + "\n" - : "") + - // non-nested types don't typically display their null masks, so do it here for convenience. - (!is_nested(child.type()) && child.nullable() - ? " " + detail::to_string(bitmask_to_host(child), child.size(), indent) + "\n" - : "") + - (detail::to_string(child, ", ", indent + " ")) + "\n"; - - out.push_back(tmp); - } - - template >* = nullptr> - void operator()(cudf::column_view const& col, - std::vector& out, - std::string const& indent) - { - structs_column_view view{col}; - - std::ostringstream out_stream; - - out_stream << get_nested_type_str(col) << ":\n" - << indent << "Length : " << view.size() << ":\n"; - if (view.nullable()) { - out_stream << indent << "Null count: " << view.null_count() << "\n" - << detail::to_string(bitmask_to_host(col), col.size(), indent) << "\n"; - } - - auto iter = thrust::make_counting_iterator(0); - std::transform( - iter, - iter + view.num_children(), - std::ostream_iterator(out_stream, "\n"), - [&](size_type index) { - auto child = view.get_sliced_child(index, cudf::test::get_default_stream()); - - // non-nested types don't typically display their null masks, so do it here for convenience. - return (!is_nested(child.type()) && child.nullable() - ? 
" " + detail::to_string(bitmask_to_host(child), child.size(), indent) + "\n" - : "") + - detail::to_string(child, ", ", indent + " "); - }); - - out.push_back(out_stream.str()); - } -}; - -} // namespace - -namespace detail { - -/** - * @copydoc cudf::test::detail::to_strings - */ -std::vector to_strings(cudf::column_view const& col, std::string const& indent) -{ - std::vector reply; - cudf::type_dispatcher(col.type(), column_view_printer{}, col, reply, indent); - return reply; -} - -/** - * @copydoc cudf::test::detail::to_string(cudf::column_view, std::string, std::string) - * - * @param indent Indentation for all output - */ -std::string to_string(cudf::column_view const& col, - std::string const& delimiter, - std::string const& indent) -{ - std::ostringstream buffer; - std::vector h_data = to_strings(col, indent); - - buffer << indent; - std::copy(h_data.begin(), - h_data.end() - (!h_data.empty()), - std::ostream_iterator(buffer, delimiter.c_str())); - if (!h_data.empty()) buffer << h_data.back(); - - return buffer.str(); -} - -/** - * @copydoc cudf::test::detail::to_string(std::vector, size_type, std::string) - * - * @param indent Indentation for all output. See comment in `to_strings` for - * a detailed description. - */ -std::string to_string(std::vector const& null_mask, - size_type null_mask_size, - std::string const& indent) -{ - std::ostringstream buffer; - buffer << indent; - for (int idx = null_mask_size - 1; idx >= 0; idx--) { - buffer << (cudf::bit_is_set(null_mask.data(), idx) ? "1" : "0"); - } - return buffer.str(); -} - -} // namespace detail - -/** - * @copydoc cudf::test::to_strings - */ -std::vector to_strings(cudf::column_view const& col) -{ - return detail::to_strings(col); -} - -/** - * @copydoc cudf::test::to_string(cudf::column_view, std::string) - */ -std::string to_string(cudf::column_view const& col, std::string const& delimiter) -{ - return detail::to_string(col, delimiter); -} - -/** - * @copydoc cudf::test::to_string(std::vector, size_type) - */ -std::string to_string(std::vector const& null_mask, size_type null_mask_size) -{ - return detail::to_string(null_mask, null_mask_size); -} - -/** - * @copydoc cudf::test::print - */ -void print(cudf::column_view const& col, std::ostream& os, std::string const& delimiter) -{ - os << to_string(col, delimiter) << std::endl; -} - /** * @copydoc cudf::test::validate_host_masks */ diff --git a/cpp/tests/utilities/debug_utilities.cu b/cpp/tests/utilities/debug_utilities.cu new file mode 100644 index 00000000000..a8a43ffb4ca --- /dev/null +++ b/cpp/tests/utilities/debug_utilities.cu @@ -0,0 +1,480 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include +#include + +namespace cudf::test { + +// Forward declaration. 
+namespace detail { + +/** + * @brief Formats a column view as a string + * + * @param col The column view + * @param delimiter The delimiter to put between strings + * @param indent Indentation for all output + */ +std::string to_string(cudf::column_view const& col, + std::string const& delimiter, + std::string const& indent = ""); + +/** + * @brief Formats a null mask as a string + * + * @param null_mask The null mask buffer + * @param null_mask_size Size of the null mask (in rows) + * @param indent Indentation for all output + */ +std::string to_string(std::vector const& null_mask, + size_type null_mask_size, + std::string const& indent = ""); + +/** + * @brief Convert column values to a host vector of strings + * + * Supports indentation of all output. For example, if the displayed output of your column + * would be + * + * @code{.pseudo} + * "1,2,3,4,5" + * @endcode + * and the `indent` parameter was " ", that indentation would be prepended to + * result in the output + * @code{.pseudo} + * " 1,2,3,4,5" + * @endcode + * + * The can be useful for displaying complex types. An example use case would be for + * displaying the nesting of a LIST type column (via recursion). + * + * List>: + * Length : 3 + * Offsets : 0, 2, 5, 6 + * Children : + * List: + * Length : 6 + * Offsets : 0, 2, 4, 7, 8, 9, 11 + * Children : + * 1, 2, 3, 4, 5, 6, 7, 0, 8, 9, 10 + * + * @param col The column view + * @param indent Indentation for all output + */ +std::vector to_strings(cudf::column_view const& col, std::string const& indent = ""); + +} // namespace detail + +namespace { + +template >* = nullptr> +static auto numeric_to_string_precise(T value) +{ + return std::to_string(value); +} + +template >* = nullptr> +static auto numeric_to_string_precise(T value) +{ + std::ostringstream o; + o << std::setprecision(std::numeric_limits::max_digits10) << value; + return o.str(); +} + +static auto duration_suffix(cudf::duration_D) { return " days"; } + +static auto duration_suffix(cudf::duration_s) { return " seconds"; } + +static auto duration_suffix(cudf::duration_ms) { return " milliseconds"; } + +static auto duration_suffix(cudf::duration_us) { return " microseconds"; } + +static auto duration_suffix(cudf::duration_ns) { return " nanoseconds"; } + +std::string get_nested_type_str(cudf::column_view const& view) +{ + if (view.type().id() == cudf::type_id::LIST) { + lists_column_view lcv(view); + return cudf::type_to_name(view.type()) + "<" + (get_nested_type_str(lcv.child())) + ">"; + } + + if (view.type().id() == cudf::type_id::STRUCT) { + std::ostringstream out; + + out << cudf::type_to_name(view.type()) + "<"; + std::transform(view.child_begin(), + view.child_end(), + std::ostream_iterator(out, ","), + [&out](auto const col) { return get_nested_type_str(col); }); + out << ">"; + return out.str(); + } + + return cudf::type_to_name(view.type()); +} + +template +std::string nested_offsets_to_string(NestedColumnView const& c, std::string const& delimiter = ", ") +{ + column_view offsets = (c.parent()).child(NestedColumnView::offsets_column_index); + CUDF_EXPECTS(offsets.type().id() == type_id::INT32, + "Column does not appear to be an offsets column"); + CUDF_EXPECTS(offsets.offset() == 0, "Offsets column has an internal offset!"); + size_type output_size = c.size() + 1; + + // the first offset value to normalize everything against + size_type first = + cudf::detail::get_value(offsets, c.offset(), cudf::get_default_stream()); + rmm::device_uvector shifted_offsets(output_size, cudf::get_default_stream()); + + // 
normalize the offset values for the column offset + size_type const* d_offsets = offsets.head() + c.offset(); + thrust::transform( + rmm::exec_policy(cudf::get_default_stream()), + d_offsets, + d_offsets + output_size, + shifted_offsets.begin(), + [first] __device__(int32_t offset) { return static_cast(offset - first); }); + + auto const h_shifted_offsets = + cudf::detail::make_host_vector_sync(shifted_offsets, cudf::get_default_stream()); + std::ostringstream buffer; + for (size_t idx = 0; idx < h_shifted_offsets.size(); idx++) { + buffer << h_shifted_offsets[idx]; + if (idx < h_shifted_offsets.size() - 1) { buffer << delimiter; } + } + return buffer.str(); +} + +struct column_view_printer { + template ()>* = nullptr> + void operator()(cudf::column_view const& col, std::vector& out, std::string const&) + { + auto h_data = cudf::test::to_host(col); + + out.resize(col.size()); + + if (col.nullable()) { + std::transform(thrust::make_counting_iterator(size_type{0}), + thrust::make_counting_iterator(col.size()), + out.begin(), + [&h_data](auto idx) { + return bit_is_set(h_data.second.data(), idx) + ? numeric_to_string_precise(h_data.first[idx]) + : std::string("NULL"); + }); + + } else { + std::transform(h_data.first.begin(), h_data.first.end(), out.begin(), [](Element el) { + return numeric_to_string_precise(el); + }); + } + } + + template ()>* = nullptr> + void operator()(cudf::column_view const& col, + std::vector& out, + std::string const& indent) + { + // For timestamps, convert timestamp column to column of strings, then + // call string version + std::string format = [&]() { + if constexpr (std::is_same_v) { + return std::string{"%Y-%m-%dT%H:%M:%SZ"}; + } else if constexpr (std::is_same_v) { + return std::string{"%Y-%m-%dT%H:%M:%S.%3fZ"}; + } else if constexpr (std::is_same_v) { + return std::string{"%Y-%m-%dT%H:%M:%S.%6fZ"}; + } else if constexpr (std::is_same_v) { + return std::string{"%Y-%m-%dT%H:%M:%S.%9fZ"}; + } + return std::string{"%Y-%m-%d"}; + }(); + + auto col_as_strings = cudf::strings::from_timestamps(col, format); + if (col_as_strings->size() == 0) { return; } + + this->template operator()(*col_as_strings, out, indent); + } + + template ()>* = nullptr> + void operator()(cudf::column_view const& col, std::vector& out, std::string const&) + { + auto const h_data = cudf::test::to_host(col); + if (col.nullable()) { + std::transform(thrust::make_counting_iterator(size_type{0}), + thrust::make_counting_iterator(col.size()), + std::back_inserter(out), + [&h_data](auto idx) { + return h_data.second.empty() || bit_is_set(h_data.second.data(), idx) + ? 
static_cast(h_data.first[idx]) + : std::string("NULL"); + }); + } else { + std::transform(std::cbegin(h_data.first), + std::cend(h_data.first), + std::back_inserter(out), + [col](auto const& fp) { return static_cast(fp); }); + } + } + + template >* = nullptr> + void operator()(cudf::column_view const& col, std::vector& out, std::string const&) + { + // + // Implementation for strings, call special to_host variant + // + if (col.is_empty()) return; + auto h_data = cudf::test::to_host(col); + + // explicitly replace some special whitespace characters with their literal equivalents + auto cleaned = [](std::string_view in) { + std::string out(in); + auto replace_char = [](std::string& out, char c, std::string_view repl) { + for (std::string::size_type pos{}; out.npos != (pos = out.find(c, pos)); pos++) { + out.replace(pos, 1, repl); + } + }; + replace_char(out, '\a', "\\a"); + replace_char(out, '\b', "\\b"); + replace_char(out, '\f', "\\f"); + replace_char(out, '\r', "\\r"); + replace_char(out, '\t', "\\t"); + replace_char(out, '\n', "\\n"); + replace_char(out, '\v', "\\v"); + return out; + }; + + out.resize(col.size()); + std::transform(thrust::make_counting_iterator(size_type{0}), + thrust::make_counting_iterator(col.size()), + out.begin(), + [&](auto idx) { + return h_data.second.empty() || bit_is_set(h_data.second.data(), idx) + ? cleaned(h_data.first[idx]) + : std::string("NULL"); + }); + } + + template >* = nullptr> + void operator()(cudf::column_view const& col, std::vector& out, std::string const&) + { + cudf::dictionary_column_view dictionary(col); + if (col.is_empty()) return; + std::vector keys = to_strings(dictionary.keys()); + std::vector indices = to_strings({dictionary.indices().type(), + dictionary.size(), + dictionary.indices().head(), + dictionary.null_mask(), + dictionary.null_count(), + dictionary.offset()}); + out.insert(out.end(), keys.begin(), keys.end()); + if (!indices.empty()) { + std::string first = "\x08 : " + indices.front(); // use : as delimiter + out.push_back(first); // between keys and indices + out.insert(out.end(), indices.begin() + 1, indices.end()); + } + } + + // Print the tick counts with the units + template ()>* = nullptr> + void operator()(cudf::column_view const& col, std::vector& out, std::string const&) + { + auto h_data = cudf::test::to_host(col); + + out.resize(col.size()); + + if (col.nullable()) { + std::transform(thrust::make_counting_iterator(size_type{0}), + thrust::make_counting_iterator(col.size()), + out.begin(), + [&h_data](auto idx) { + return bit_is_set(h_data.second.data(), idx) + ? numeric_to_string_precise(h_data.first[idx].count()) + + duration_suffix(h_data.first[idx]) + : std::string("NULL"); + }); + + } else { + std::transform(h_data.first.begin(), h_data.first.end(), out.begin(), [](Element el) { + return numeric_to_string_precise(el.count()) + duration_suffix(el); + }); + } + } + + template >* = nullptr> + void operator()(cudf::column_view const& col, + std::vector& out, + std::string const& indent) + { + lists_column_view lcv(col); + + // propagate slicing to the child if necessary + column_view child = lcv.get_sliced_child(cudf::get_default_stream()); + bool const is_sliced = lcv.offset() > 0 || child.offset() > 0; + + std::string tmp = + get_nested_type_str(col) + (is_sliced ? "(sliced)" : "") + ":\n" + indent + + "Length : " + std::to_string(lcv.size()) + "\n" + indent + + "Offsets : " + (lcv.size() > 0 ? nested_offsets_to_string(lcv) : "") + "\n" + + (lcv.parent().nullable() + ? 
indent + "Null count: " + std::to_string(lcv.null_count()) + "\n" + + detail::to_string(cudf::test::bitmask_to_host(col), col.size(), indent) + "\n" + : "") + + // non-nested types don't typically display their null masks, so do it here for convenience. + (!is_nested(child.type()) && child.nullable() + ? " " + detail::to_string(cudf::test::bitmask_to_host(child), child.size(), indent) + + "\n" + : "") + + (detail::to_string(child, ", ", indent + " ")) + "\n"; + + out.push_back(tmp); + } + + template >* = nullptr> + void operator()(cudf::column_view const& col, + std::vector& out, + std::string const& indent) + { + structs_column_view view{col}; + + std::ostringstream out_stream; + + out_stream << get_nested_type_str(col) << ":\n" + << indent << "Length : " << view.size() << ":\n"; + if (view.nullable()) { + out_stream << indent << "Null count: " << view.null_count() << "\n" + << detail::to_string(cudf::test::bitmask_to_host(col), col.size(), indent) << "\n"; + } + + auto iter = thrust::make_counting_iterator(0); + std::transform( + iter, + iter + view.num_children(), + std::ostream_iterator(out_stream, "\n"), + [&](size_type index) { + auto child = view.get_sliced_child(index, cudf::get_default_stream()); + + // non-nested types don't typically display their null masks, so do it here for convenience. + return (!is_nested(child.type()) && child.nullable() + ? " " + + detail::to_string(cudf::test::bitmask_to_host(child), child.size(), indent) + + "\n" + : "") + + detail::to_string(child, ", ", indent + " "); + }); + + out.push_back(out_stream.str()); + } +}; + +} // namespace + +namespace detail { + +/** + * @copydoc cudf::test::detail::to_strings + */ +std::vector to_strings(cudf::column_view const& col, std::string const& indent) +{ + std::vector reply; + cudf::type_dispatcher(col.type(), column_view_printer{}, col, reply, indent); + return reply; +} + +/** + * @copydoc cudf::test::detail::to_string(cudf::column_view, std::string, std::string) + * + * @param indent Indentation for all output + */ +std::string to_string(cudf::column_view const& col, + std::string const& delimiter, + std::string const& indent) +{ + std::ostringstream buffer; + std::vector h_data = to_strings(col, indent); + + buffer << indent; + std::copy(h_data.begin(), + h_data.end() - (!h_data.empty()), + std::ostream_iterator(buffer, delimiter.c_str())); + if (!h_data.empty()) buffer << h_data.back(); + + return buffer.str(); +} + +/** + * @copydoc cudf::test::detail::to_string(std::vector, size_type, std::string) + * + * @param indent Indentation for all output. See comment in `to_strings` for + * a detailed description. + */ +std::string to_string(std::vector const& null_mask, + size_type null_mask_size, + std::string const& indent) +{ + std::ostringstream buffer; + buffer << indent; + for (int idx = null_mask_size - 1; idx >= 0; idx--) { + buffer << (cudf::bit_is_set(null_mask.data(), idx) ? 
"1" : "0"); + } + return buffer.str(); +} + +} // namespace detail + +std::vector to_strings(cudf::column_view const& col) +{ + return detail::to_strings(col); +} + +std::string to_string(cudf::column_view const& col, std::string const& delimiter) +{ + return detail::to_string(col, delimiter); +} + +std::string to_string(std::vector const& null_mask, size_type null_mask_size) +{ + return detail::to_string(null_mask, null_mask_size); +} + +void print(cudf::column_view const& col, std::ostream& os) +{ + os << to_string(col, ",") << std::endl; +} + +} // namespace cudf::test diff --git a/cpp/tests/utilities_tests/column_debug_tests.cpp b/cpp/tests/utilities_tests/column_debug_tests.cpp new file mode 100644 index 00000000000..0dae407ad21 --- /dev/null +++ b/cpp/tests/utilities_tests/column_debug_tests.cpp @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include +#include + +#include + +#include + +template +struct ColumnDebugTestIntegral : public cudf::test::BaseFixture {}; +template +struct ColumnDebugTestFloatingPoint : public cudf::test::BaseFixture {}; + +TYPED_TEST_SUITE(ColumnDebugTestIntegral, cudf::test::IntegralTypes); +TYPED_TEST_SUITE(ColumnDebugTestFloatingPoint, cudf::test::FloatingPointTypes); + +TYPED_TEST(ColumnDebugTestIntegral, PrintColumnNumeric) +{ + char const* delimiter = ","; + + cudf::test::fixed_width_column_wrapper cudf_col({1, 2, 3, 4, 5}); + auto std_col = cudf::test::make_type_param_vector({1, 2, 3, 4, 5}); + + std::stringstream tmp; + auto string_iter = + thrust::make_transform_iterator(std::begin(std_col), [](auto e) { return std::to_string(e); }); + + std::copy(string_iter, + string_iter + std_col.size() - 1, + std::ostream_iterator(tmp, delimiter)); + + tmp << std::to_string(std_col.back()); + + EXPECT_EQ(cudf::test::to_string(cudf_col, delimiter), tmp.str()); +} + +TYPED_TEST(ColumnDebugTestIntegral, PrintColumnWithInvalids) +{ + char const* delimiter = ","; + + cudf::test::fixed_width_column_wrapper cudf_col{{1, 2, 3, 4, 5}, {1, 0, 1, 0, 1}}; + auto std_col = cudf::test::make_type_param_vector({1, 2, 3, 4, 5}); + + std::ostringstream tmp; + tmp << std::to_string(std_col[0]) << delimiter << "NULL" << delimiter + << std::to_string(std_col[2]) << delimiter << "NULL" << delimiter + << std::to_string(std_col[4]); + + EXPECT_EQ(cudf::test::to_string(cudf_col, delimiter), tmp.str()); +} + +TYPED_TEST(ColumnDebugTestFloatingPoint, PrintColumnNumeric) +{ + char const* delimiter = ","; + + cudf::test::fixed_width_column_wrapper cudf_col( + {10001523.25, 2.0, 3.75, 0.000000034, 5.3}); + + auto expected = std::is_same_v + ? 
"10001523.25,2,3.75,3.4e-08,5.2999999999999998" + : "10001523,2,3.75,3.39999993e-08,5.30000019"; + + EXPECT_EQ(cudf::test::to_string(cudf_col, delimiter), expected); +} + +TYPED_TEST(ColumnDebugTestFloatingPoint, PrintColumnWithInvalids) +{ + char const* delimiter = ","; + + cudf::test::fixed_width_column_wrapper cudf_col( + {10001523.25, 2.0, 3.75, 0.000000034, 5.3}, {1, 0, 1, 0, 1}); + + auto expected = std::is_same_v + ? "10001523.25,NULL,3.75,NULL,5.2999999999999998" + : "10001523,NULL,3.75,NULL,5.30000019"; + + EXPECT_EQ(cudf::test::to_string(cudf_col, delimiter), expected); +} + +struct ColumnDebugStringsTest : public cudf::test::BaseFixture {}; + +TEST_F(ColumnDebugStringsTest, PrintColumnDuration) +{ + char const* delimiter = ","; + + cudf::test::fixed_width_column_wrapper cudf_col({100, 0, 7, 140000}); + + auto expected = "100 seconds,0 seconds,7 seconds,140000 seconds"; + + EXPECT_EQ(cudf::test::to_string(cudf_col, delimiter), expected); +} + +TEST_F(ColumnDebugStringsTest, StringsToString) +{ + char const* delimiter = ","; + + std::vector h_strings{"eee", "bb", nullptr, "", "aa", "bbb", "ééé"}; + cudf::test::strings_column_wrapper strings( + h_strings.begin(), + h_strings.end(), + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + + std::ostringstream tmp; + tmp << h_strings[0] << delimiter << h_strings[1] << delimiter << "NULL" << delimiter + << h_strings[3] << delimiter << h_strings[4] << delimiter << h_strings[5] << delimiter + << h_strings[6]; + + EXPECT_EQ(cudf::test::to_string(strings, delimiter), tmp.str()); +} + +TEST_F(ColumnDebugStringsTest, PrintEscapeStrings) +{ + char const* delimiter = ","; + cudf::test::strings_column_wrapper input({"e\te\ne", "é\bé\ré", "e\vé\fé\abell"}); + std::string expected{"e\\te\\ne,é\\bé\\ré,e\\vé\\fé\\abell"}; + EXPECT_EQ(cudf::test::to_string(input, delimiter), expected); +} diff --git a/cpp/tests/utilities_tests/column_utilities_tests.cpp b/cpp/tests/utilities_tests/column_utilities_tests.cpp index 90a7270cb29..07d2bea2b28 100644 --- a/cpp/tests/utilities_tests/column_utilities_tests.cpp +++ b/cpp/tests/utilities_tests/column_utilities_tests.cpp @@ -182,106 +182,6 @@ TEST_F(ColumnUtilitiesStringsTest, StringsToHostAllNulls) EXPECT_TRUE(std::all_of(results.begin(), results.end(), [](auto s) { return s.empty(); })); } -TEST_F(ColumnUtilitiesStringsTest, PrintColumnDuration) -{ - char const* delimiter = ","; - - cudf::test::fixed_width_column_wrapper cudf_col({100, 0, 7, 140000}); - - auto expected = "100 seconds,0 seconds,7 seconds,140000 seconds"; - - EXPECT_EQ(cudf::test::to_string(cudf_col, delimiter), expected); -} - -TYPED_TEST(ColumnUtilitiesTestIntegral, PrintColumnNumeric) -{ - char const* delimiter = ","; - - cudf::test::fixed_width_column_wrapper cudf_col({1, 2, 3, 4, 5}); - auto std_col = cudf::test::make_type_param_vector({1, 2, 3, 4, 5}); - - std::stringstream tmp; - auto string_iter = - thrust::make_transform_iterator(std::begin(std_col), [](auto e) { return std::to_string(e); }); - - std::copy(string_iter, - string_iter + std_col.size() - 1, - std::ostream_iterator(tmp, delimiter)); - - tmp << std::to_string(std_col.back()); - - EXPECT_EQ(cudf::test::to_string(cudf_col, delimiter), tmp.str()); -} - -TYPED_TEST(ColumnUtilitiesTestIntegral, PrintColumnWithInvalids) -{ - char const* delimiter = ","; - - cudf::test::fixed_width_column_wrapper cudf_col{{1, 2, 3, 4, 5}, {1, 0, 1, 0, 1}}; - auto std_col = cudf::test::make_type_param_vector({1, 2, 3, 4, 5}); - - std::ostringstream tmp; - 
tmp << std::to_string(std_col[0]) << delimiter << "NULL" << delimiter - << std::to_string(std_col[2]) << delimiter << "NULL" << delimiter - << std::to_string(std_col[4]); - - EXPECT_EQ(cudf::test::to_string(cudf_col, delimiter), tmp.str()); -} - -TYPED_TEST(ColumnUtilitiesTestFloatingPoint, PrintColumnNumeric) -{ - char const* delimiter = ","; - - cudf::test::fixed_width_column_wrapper cudf_col( - {10001523.25, 2.0, 3.75, 0.000000034, 5.3}); - - auto expected = std::is_same_v - ? "10001523.25,2,3.75,3.4e-08,5.2999999999999998" - : "10001523,2,3.75,3.39999993e-08,5.30000019"; - - EXPECT_EQ(cudf::test::to_string(cudf_col, delimiter), expected); -} - -TYPED_TEST(ColumnUtilitiesTestFloatingPoint, PrintColumnWithInvalids) -{ - char const* delimiter = ","; - - cudf::test::fixed_width_column_wrapper cudf_col( - {10001523.25, 2.0, 3.75, 0.000000034, 5.3}, {1, 0, 1, 0, 1}); - - auto expected = std::is_same_v - ? "10001523.25,NULL,3.75,NULL,5.2999999999999998" - : "10001523,NULL,3.75,NULL,5.30000019"; - - EXPECT_EQ(cudf::test::to_string(cudf_col, delimiter), expected); -} - -TEST_F(ColumnUtilitiesStringsTest, StringsToString) -{ - char const* delimiter = ","; - - std::vector h_strings{"eee", "bb", nullptr, "", "aa", "bbb", "ééé"}; - cudf::test::strings_column_wrapper strings( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - - std::ostringstream tmp; - tmp << h_strings[0] << delimiter << h_strings[1] << delimiter << "NULL" << delimiter - << h_strings[3] << delimiter << h_strings[4] << delimiter << h_strings[5] << delimiter - << h_strings[6]; - - EXPECT_EQ(cudf::test::to_string(strings, delimiter), tmp.str()); -} - -TEST_F(ColumnUtilitiesStringsTest, PrintEscapeStrings) -{ - char const* delimiter = ","; - cudf::test::strings_column_wrapper input({"e\te\ne", "é\bé\ré", "e\vé\fé\abell"}); - std::string expected{"e\\te\\ne,é\\bé\\ré,e\\vé\\fé\\abell"}; - EXPECT_EQ(cudf::test::to_string(input, delimiter), expected); -} - TYPED_TEST(ColumnUtilitiesTestFixedPoint, NonNullableToHost) { using namespace numeric; diff --git a/dependencies.yaml b/dependencies.yaml index 72aaaa1b3fc..97149a5e2ba 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -9,8 +9,8 @@ files: - build_all - build_cpp - build_wheels - - build_python - build_python_common + - build_python_cudf - cudatoolkit - develop - docs @@ -62,6 +62,7 @@ files: includes: - cudatoolkit - docs + - libarrow_run - py_version py_build_cudf: output: pyproject @@ -70,8 +71,8 @@ files: table: build-system includes: - build_all - - build_python - build_python_common + - build_python_cudf - build_wheels py_run_cudf: output: pyproject @@ -137,8 +138,8 @@ files: extras: table: build-system includes: - - build_wheels - build_python_common + - build_wheels py_run_cudf_kafka: output: pyproject pyproject_dir: python/cudf_kafka @@ -230,8 +231,8 @@ dependencies: common: - output_types: [conda, requirements] packages: - - librmm==23.10.* - - libkvikio==23.10.* + - librmm==23.12.* + - libkvikio==23.12.* - output_types: conda packages: - fmt>=9.1.0,<10 @@ -240,27 +241,11 @@ dependencies: - &gmock gmock>=1.13.0 # Hard pin the patch version used during the build. This must be kept # in sync with the version pinned in get_arrow.cmake. 
- - libarrow==12.0.1.* + - libarrow-all==14.0.1.* - librdkafka>=1.9.0,<1.10.0a0 + # Align nvcomp version with rapids-cmake + - nvcomp==3.0.4 - spdlog>=1.11.0,<1.12 - specific: - - output_types: conda - matrices: - - matrix: - arch: x86_64 - packages: - # Align nvcomp version with rapids-cmake - # TODO: not yet available for aarch64 CUDA 12 - - &nvcomp nvcomp==2.6.1 - - matrix: - arch: aarch64 - cuda: "11.8" - packages: - - *nvcomp - # TODO: Fallback matrix for aarch64 CUDA 12. After migrating to nvcomp 3, - # all CUDA/arch combinations should be supported by existing packages. - - matrix: - packages: build_wheels: common: - output_types: pyproject @@ -271,18 +256,20 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - cython>=3.0.0 - # Hard pin the patch version used during the build. This must be kept - # in sync with the version pinned in get_arrow.cmake. - - pyarrow==12.0.1.* + - cython>=3.0.3 # TODO: Pin to numpy<1.25 until cudf requires pandas 2 - &numpy numpy>=1.21,<1.25 - build_python: + - scikit-build>=0.13.1 + - output_types: [conda, requirements, pyproject] + packages: + # Hard pin the patch version used during the build. This must be kept + # in sync with the version pinned in get_arrow.cmake. + - pyarrow==14.0.1.* + build_python_cudf: common: - output_types: [conda, requirements, pyproject] packages: - - scikit-build>=0.13.1 - - rmm==23.10.* + - rmm==23.12.* - output_types: conda packages: - &protobuf protobuf>=4.21,<5 @@ -291,16 +278,18 @@ dependencies: - protoc-wheel libarrow_run: common: - - output_types: [conda, requirements] + - output_types: conda packages: # Allow runtime version to float up to minor version - - libarrow==12.* + # Disallow libarrow 14.0.0 due to a CVE + - libarrow-all>=14.0.1,<15.0.0a0 pyarrow_run: common: - output_types: [conda, requirements, pyproject] packages: # Allow runtime version to float up to minor version - - pyarrow==12.* + # Disallow pyarrow 14.0.0 due to a CVE + - pyarrow>=14.0.1,<15.0.0a0 cudatoolkit: specific: - output_types: conda @@ -401,15 +390,15 @@ dependencies: common: - output_types: [conda] packages: - - dask-cuda==23.10.* + - dask-cuda==23.12.* - *doxygen - - libarrow==12.0.1.* - make - myst-nb - nbsphinx - numpydoc - pandoc - - pydata-sphinx-theme + # https://github.com/pydata/pydata-sphinx-theme/issues/1539 + - pydata-sphinx-theme!=0.14.2 - scipy - sphinx - sphinx-autobuild @@ -455,7 +444,7 @@ dependencies: - nvtx>=0.2.1 - packaging - rich - - rmm==23.10.* + - rmm==23.12.* - typing_extensions>=4.0.0 - *protobuf - output_types: conda @@ -508,15 +497,13 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - dask==2023.9.2 - - distributed==2023.9.2 + - rapids-dask-dependency==23.12.* - output_types: conda packages: - cupy>=12.0.0 - - dask-core==2023.9.2 # dask-core in conda is the actual package & dask is the meta package - output_types: pyproject packages: - - &cudf cudf==23.10.* + - &cudf cudf==23.12.* - *cupy_pip run_cudf_kafka: common: @@ -535,7 +522,7 @@ dependencies: packages: - confluent-kafka>=1.9.0,<1.10.0a0 - *cudf - - cudf_kafka==23.10.* + - cudf_kafka==23.12.* test_cpp: common: - output_types: conda @@ -580,7 +567,6 @@ dependencies: - fastavro>=0.22.9 - hypothesis - mimesis>=4.1.0 - - pyorc - pytest-benchmark - pytest-cases - python-snappy>=0.6.0 @@ -618,7 +604,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - dask-cuda==23.10.* + - dask-cuda==23.12.* - *numba test_python_pandas_cudf: common: diff --git 
a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 03b1bb7039b..28e305b71cb 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -79,9 +79,9 @@ # built documents. # # The short X.Y version. -version = '23.10' +version = '23.12' # The full version, including alpha/beta/rc tags. -release = '23.10.00' +release = '23.12.00' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. @@ -106,6 +106,7 @@ "twitter_url": "https://twitter.com/rapidsai", "show_toc_level": 1, "navbar_align": "right", + "navigation_with_keys": True, } include_pandas_compat = True diff --git a/docs/cudf/source/user_guide/data-types.md b/docs/cudf/source/user_guide/data-types.md index 1f4cfbc7366..e6fe3109c57 100644 --- a/docs/cudf/source/user_guide/data-types.md +++ b/docs/cudf/source/user_guide/data-types.md @@ -136,7 +136,7 @@ dtype: struct StructDtype({'a': dtype('int64'), 'b': dtype('int64')}) ``` -Or by reading them from disk, using a [file format that supports nested data](io). +Or by reading them from disk, using a [file format that supports nested data](/user_guide/io/index.md). ```python >>> pdf = pd.DataFrame({"a": [[1, 2], [3, 4, 5], [6, 7, 8]]}) diff --git a/docs/dask_cudf/source/conf.py b/docs/dask_cudf/source/conf.py index ad629b5e949..00568a57431 100644 --- a/docs/dask_cudf/source/conf.py +++ b/docs/dask_cudf/source/conf.py @@ -11,8 +11,8 @@ project = "dask-cudf" copyright = "2018-2023, NVIDIA Corporation" author = "NVIDIA Corporation" -version = '23.10' -release = '23.10.00' +version = '23.12' +release = '23.12.00' language = "en" @@ -57,6 +57,7 @@ "twitter_url": "https://twitter.com/rapidsai", "show_toc_level": 1, "navbar_align": "right", + "navigation_with_keys": True, } include_pandas_compat = True diff --git a/fetch_rapids.cmake b/fetch_rapids.cmake index 4a68c7dbc60..e79d9d86fce 100644 --- a/fetch_rapids.cmake +++ b/fetch_rapids.cmake @@ -12,7 +12,7 @@ # the License. # ============================================================================= if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/CUDF_RAPIDS.cmake) - file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.10/RAPIDS.cmake + file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.12/RAPIDS.cmake ${CMAKE_CURRENT_BINARY_DIR}/CUDF_RAPIDS.cmake ) endif() diff --git a/java/ci/README.md b/java/ci/README.md index e9599b33bf1..12a2bb2dc51 100644 --- a/java/ci/README.md +++ b/java/ci/README.md @@ -34,7 +34,7 @@ nvidia-docker run -it cudf-build:11.8.0-devel-centos7 bash You can download the cuDF repo in the docker container or you can mount it into the container. Here I choose to download again in the container. ```bash -git clone --recursive https://github.com/rapidsai/cudf.git -b branch-23.10 +git clone --recursive https://github.com/rapidsai/cudf.git -b branch-23.12 ``` ### Build cuDF jar with devtoolset @@ -47,4 +47,4 @@ scl enable devtoolset-11 "java/ci/build-in-docker.sh" ### The output -You can find the cuDF jar in java/target/ like cudf-23.10.0-SNAPSHOT-cuda11.jar. +You can find the cuDF jar in java/target/ like cudf-23.12.0-SNAPSHOT-cuda11.jar. 
diff --git a/java/pom.xml b/java/pom.xml index afcc0e15a2c..cc880312d34 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -21,7 +21,7 @@ ai.rapids cudf - 23.10.0-SNAPSHOT + 23.12.0-SNAPSHOT cudfjni diff --git a/java/src/main/java/ai/rapids/cudf/Cuda.java b/java/src/main/java/ai/rapids/cudf/Cuda.java index e1298e29925..7cc3d30a9cf 100755 --- a/java/src/main/java/ai/rapids/cudf/Cuda.java +++ b/java/src/main/java/ai/rapids/cudf/Cuda.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,9 +15,6 @@ */ package ai.rapids.cudf; -import ai.rapids.cudf.NvtxColor; -import ai.rapids.cudf.NvtxRange; - import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -90,6 +87,21 @@ private Stream() { this.id = -1; } + private Stream(long id) { + this.cleaner = null; + this.id = id; + } + + /** + * Wrap a given stream ID to make it accessible. + */ + static Stream wrap(long id) { + if (id == -1) { + return DEFAULT_STREAM; + } + return new Stream(id); + } + /** * Have this stream not execute new work until the work recorded in event completes. * @param event the event to wait on. @@ -122,7 +134,9 @@ public synchronized void close() { cleaner.delRef(); } if (closed) { - cleaner.logRefCountDebug("double free " + this); + if (cleaner != null) { + cleaner.logRefCountDebug("double free " + this); + } throw new IllegalStateException("Close called too many times " + this); } if (cleaner != null) { diff --git a/java/src/main/java/ai/rapids/cudf/DataSource.java b/java/src/main/java/ai/rapids/cudf/DataSource.java new file mode 100644 index 00000000000..1e5893235df --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/DataSource.java @@ -0,0 +1,189 @@ +/* + * + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package ai.rapids.cudf; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.HashMap; + +/** + * Base class that can be used to provide data dynamically to CUDF. This follows somewhat + * closely with cudf::io::datasource. There are a few main differences. + *
+ * First this does not expose async device reads. It will call the non-async device read API + * instead. This might be added in the future, but there was no direct use case for it in java + * right now to warrant the added complexity. + *
+ * Second there is no implementation of the device read API that returns a buffer instead of + * writing into one. This is not used by CUDF yet so testing an implementation that isn't used + * didn't feel ideal. If it is needed we will add one in the future. + */ +public abstract class DataSource implements AutoCloseable { + private static final Logger log = LoggerFactory.getLogger(DataSource.class); + + /** + * This is used to keep track of the HostMemoryBuffers in java land so the C++ layer + * does not have to do it. + */ + private final HashMap cachedBuffers = new HashMap<>(); + + @Override + public void close() { + if (!cachedBuffers.isEmpty()) { + throw new IllegalStateException("DataSource closed before all returned host buffers were closed"); + } + } + + /** + * Get the size of the source in bytes. + */ + public abstract long size(); + + /** + * Read data from the source at the given offset. Return a HostMemoryBuffer for the data + * that was read. + * @param offset where to start reading from. + * @param amount the maximum number of bytes to read. + * @return a buffer that points to the data. + * @throws IOException on any error. + */ + public abstract HostMemoryBuffer hostRead(long offset, long amount) throws IOException; + + + /** + * Called when the buffer returned from hostRead is done. The default is to close the buffer. + */ + protected void onHostBufferDone(HostMemoryBuffer buffer) { + if (buffer != null) { + buffer.close(); + } + } + + /** + * Read data from the source at the given offset into dest. Note that dest should not be closed, + * and no reference to it can outlive the call to hostRead. The target amount to read is + * dest's length. + * @param offset the offset to start reading from in the source. + * @param dest where to write the data. + * @return the actual number of bytes written to dest. + */ + public abstract long hostRead(long offset, HostMemoryBuffer dest) throws IOException; + + /** + * Return true if this supports reading directly to the device else false. The default is + * no device support. This cannot change dynamically. It is typically read just once. + */ + public boolean supportsDeviceRead() { + return false; + } + + /** + * Get the size cutoff between device reads and host reads when device reads are supported. + * Anything larger than the cutoff will be a device read and anything smaller will be a + * host read. By default, the cutoff is 0 so all reads will be device reads if device reads + * are supported. + */ + public long getDeviceReadCutoff() { + return 0; + } + + /** + * Read data from the source at the given offset into dest. Note that dest should not be closed, + * and no reference to it can outlive the call to hostRead. The target amount to read is + * dest's length. + * @param offset the offset to start reading from + * @param dest where to write the data. + * @param stream the stream to do the copy on. + * @return the actual number of bytes written to dest. 
+ */ + public long deviceRead(long offset, DeviceMemoryBuffer dest, + Cuda.Stream stream) throws IOException { + throw new IllegalStateException("Device read is not implemented"); + } + + ///////////////////////////////////////////////// + // Internal methods called from JNI + ///////////////////////////////////////////////// + + private static class NoopCleaner extends MemoryBuffer.MemoryBufferCleaner { + @Override + protected boolean cleanImpl(boolean logErrorIfNotClean) { + return true; + } + + @Override + public boolean isClean() { + return true; + } + } + private static final NoopCleaner cleaner = new NoopCleaner(); + + // Called from JNI + private void onHostBufferDone(long bufferId) { + HostMemoryBuffer hmb = cachedBuffers.remove(bufferId); + if (hmb != null) { + onHostBufferDone(hmb); + } else { + // Called from C++ destructor so avoid throwing... + log.warn("Got a close callback for a buffer we could not find " + bufferId); + } + } + + // Called from JNI + private long hostRead(long offset, long amount, long dst) throws IOException { + if (amount < 0) { + throw new IllegalArgumentException("Cannot allocate more than " + Long.MAX_VALUE + " bytes"); + } + try (HostMemoryBuffer dstBuffer = new HostMemoryBuffer(dst, amount, cleaner)) { + return hostRead(offset, dstBuffer); + } + } + + // Called from JNI + private long[] hostReadBuff(long offset, long amount) throws IOException { + if (amount < 0) { + throw new IllegalArgumentException("Cannot read more than " + Long.MAX_VALUE + " bytes"); + } + HostMemoryBuffer buff = hostRead(offset, amount); + long[] ret = new long[3]; + if (buff != null) { + long id = buff.id; + if (cachedBuffers.put(id, buff) != null) { + throw new IllegalStateException("Already had a buffer cached for " + buff); + } + ret[0] = buff.address; + ret[1] = buff.length; + ret[2] = id; + } // else they are all 0 because java does that already + return ret; + } + + // Called from JNI + private long deviceRead(long offset, long amount, long dst, long stream) throws IOException { + if (amount < 0) { + throw new IllegalArgumentException("Cannot read more than " + Long.MAX_VALUE + " bytes"); + } + Cuda.Stream strm = Cuda.Stream.wrap(stream); + try (DeviceMemoryBuffer dstBuffer = new DeviceMemoryBuffer(dst, amount, cleaner)) { + return deviceRead(offset, dstBuffer, strm); + } + } +} diff --git a/java/src/main/java/ai/rapids/cudf/DataSourceHelper.java b/java/src/main/java/ai/rapids/cudf/DataSourceHelper.java new file mode 100644 index 00000000000..5d4dcb8e4e7 --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/DataSourceHelper.java @@ -0,0 +1,44 @@ +/* + * + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package ai.rapids.cudf; + +/** + * This is here because we need some JNI methods to work with a DataSource, but + * we also want to cache callback methods at startup for performance reasons. If + * we put both in the same class we will get a deadlock because of how we load + * the JNI. 
We have a static block that blocks loading the class until the JNI + * library is loaded and the JNI library cannot load until the class is loaded + * and cached. This breaks the loop. + */ +class DataSourceHelper { + static { + NativeDepsLoader.loadNativeDeps(); + } + + static long createWrapperDataSource(DataSource ds) { + return createWrapperDataSource(ds, ds.size(), ds.supportsDeviceRead(), + ds.getDeviceReadCutoff()); + } + + private static native long createWrapperDataSource(DataSource ds, long size, + boolean deviceReadSupport, + long deviceReadCutoff); + + static native void destroyWrapperDataSource(long handle); +} diff --git a/java/src/main/java/ai/rapids/cudf/DeviceMemoryBuffer.java b/java/src/main/java/ai/rapids/cudf/DeviceMemoryBuffer.java index c4d9bdb8f91..9eab607ed0b 100644 --- a/java/src/main/java/ai/rapids/cudf/DeviceMemoryBuffer.java +++ b/java/src/main/java/ai/rapids/cudf/DeviceMemoryBuffer.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -112,6 +112,10 @@ public static DeviceMemoryBuffer fromRmm(long address, long lengthInBytes, long return new DeviceMemoryBuffer(address, lengthInBytes, rmmBufferAddress); } + DeviceMemoryBuffer(long address, long lengthInBytes, MemoryBufferCleaner cleaner) { + super(address, lengthInBytes, cleaner); + } + DeviceMemoryBuffer(long address, long lengthInBytes, long rmmBufferAddress) { super(address, lengthInBytes, new RmmDeviceBufferCleaner(rmmBufferAddress)); } diff --git a/java/src/main/java/ai/rapids/cudf/MultiBufferDataSource.java b/java/src/main/java/ai/rapids/cudf/MultiBufferDataSource.java new file mode 100644 index 00000000000..6986b6a7fec --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/MultiBufferDataSource.java @@ -0,0 +1,230 @@ +/* + * + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package ai.rapids.cudf; + +/** + * This is a DataSource that can take multiple HostMemoryBuffers. They + * are treated as if they are all part of a single file connected end to end. + */ +public class MultiBufferDataSource extends DataSource { + private final long sizeInBytes; + private final HostMemoryBuffer[] hostBuffers; + private final long[] startOffsets; + private final HostMemoryAllocator allocator; + + // Metrics + private long hostReads = 0; + private long hostReadBytes = 0; + private long devReads = 0; + private long devReadBytes = 0; + + /** + * Create a new data source backed by multiple buffers. + * @param buffers the buffers that will back the data source. + */ + public MultiBufferDataSource(HostMemoryBuffer ... buffers) { + this(DefaultHostMemoryAllocator.get(), buffers); + } + + /** + * Create a new data source backed by multiple buffers. + * @param allocator the allocator to use for host buffers, if needed. 
+ * @param buffers the buffers that will back the data source. + */ + public MultiBufferDataSource(HostMemoryAllocator allocator, HostMemoryBuffer ... buffers) { + int numBuffers = buffers.length; + hostBuffers = new HostMemoryBuffer[numBuffers]; + startOffsets = new long[numBuffers]; + + long currentOffset = 0; + for (int i = 0; i < numBuffers; i++) { + HostMemoryBuffer hmb = buffers[i]; + hmb.incRefCount(); + hostBuffers[i] = hmb; + startOffsets[i] = currentOffset; + currentOffset += hmb.getLength(); + } + sizeInBytes = currentOffset; + this.allocator = allocator; + } + + @Override + public long size() { + return sizeInBytes; + } + + private int getStartBufferIndexForOffset(long offset) { + assert (offset >= 0); + + // It is super common to read from the start or end of a file (the header or footer) + // so special case them + if (offset == 0) { + return 0; + } + int startIndex = 0; + int endIndex = startOffsets.length - 1; + if (offset >= startOffsets[endIndex]) { + return endIndex; + } + while (startIndex != endIndex) { + int midIndex = (int)(((long)startIndex + endIndex) / 2); + long midStartOffset = startOffsets[midIndex]; + if (offset >= midStartOffset) { + // It is either in mid or after mid. + if (midIndex == endIndex || offset <= startOffsets[midIndex + 1]) { + // We found it in mid + return midIndex; + } else { + // It is after mid + startIndex = midIndex + 1; + } + } else { + // It is before mid + endIndex = midIndex - 1; + } + } + return startIndex; + } + + + interface DoCopy { + void copyFromHostBuffer(T dest, long destOffset, HostMemoryBuffer src, + long srcOffset, long srcAmount); + } + + private long read(long offset, T dest, DoCopy doCopy) { + assert (offset >= 0); + long realOffset = Math.min(offset, sizeInBytes); + long realAmount = Math.min(sizeInBytes - realOffset, dest.getLength()); + + int index = getStartBufferIndexForOffset(realOffset); + + HostMemoryBuffer buffer = hostBuffers[index]; + long bufferOffset = realOffset - startOffsets[index]; + long bufferAmount = Math.min(buffer.length - bufferOffset, realAmount); + long remainingAmount = realAmount; + long currentOffset = realOffset; + long outputOffset = 0; + + while (remainingAmount > 0) { + doCopy.copyFromHostBuffer(dest, outputOffset, buffer, + bufferOffset, bufferAmount); + remainingAmount -= bufferAmount; + outputOffset += bufferAmount; + currentOffset += bufferAmount; + index++; + if (index < hostBuffers.length) { + buffer = hostBuffers[index]; + bufferOffset = currentOffset - startOffsets[index]; + bufferAmount = Math.min(buffer.length - bufferOffset, remainingAmount); + } + } + + return realAmount; + } + + @Override + public HostMemoryBuffer hostRead(long offset, long amount) { + assert (offset >= 0); + assert (amount >= 0); + long realOffset = Math.min(offset, sizeInBytes); + long realAmount = Math.min(sizeInBytes - realOffset, amount); + + int index = getStartBufferIndexForOffset(realOffset); + + HostMemoryBuffer buffer = hostBuffers[index]; + long bufferOffset = realOffset - startOffsets[index]; + long bufferAmount = Math.min(buffer.length - bufferOffset, realAmount); + if (bufferAmount == realAmount) { + hostReads += 1; + hostReadBytes += realAmount; + // It all fits in a single buffer, so do a zero copy operation + return buffer.slice(bufferOffset, bufferAmount); + } else { + // We will have to allocate a new buffer and copy data into it. 
+ boolean success = false; + HostMemoryBuffer ret = allocator.allocate(realAmount, true); + try { + long amountRead = read(offset, ret, HostMemoryBuffer::copyFromHostBuffer); + assert(amountRead == realAmount); + hostReads += 1; + hostReadBytes += amountRead; + success = true; + return ret; + } finally { + if (!success) { + ret.close(); + } + } + } + } + + @Override + public long hostRead(long offset, HostMemoryBuffer dest) { + long ret = read(offset, dest, HostMemoryBuffer::copyFromHostBuffer); + hostReads += 1; + hostReadBytes += ret; + return ret; + } + + @Override + public boolean supportsDeviceRead() { + return true; + } + + @Override + public long deviceRead(long offset, DeviceMemoryBuffer dest, + Cuda.Stream stream) { + long ret = read(offset, dest, (destParam, destOffset, src, srcOffset, srcAmount) -> + destParam.copyFromHostBufferAsync(destOffset, src, srcOffset, srcAmount, stream)); + devReads += 1; + devReadBytes += ret; + return ret; + } + + + @Override + public void close() { + try { + super.close(); + } finally { + for (HostMemoryBuffer hmb: hostBuffers) { + if (hmb != null) { + hmb.close(); + } + } + } + } + + public long getHostReads() { + return hostReads; + } + + public long getHostReadBytes() { + return hostReadBytes; + } + + public long getDevReads() { + return devReads; + } + + public long getDevReadBytes() { + return devReadBytes; + } +} diff --git a/java/src/main/java/ai/rapids/cudf/ParquetChunkedReader.java b/java/src/main/java/ai/rapids/cudf/ParquetChunkedReader.java index c34336ac73f..17d59b757c3 100644 --- a/java/src/main/java/ai/rapids/cudf/ParquetChunkedReader.java +++ b/java/src/main/java/ai/rapids/cudf/ParquetChunkedReader.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -51,7 +51,7 @@ public ParquetChunkedReader(long chunkSizeByteLimit, ParquetOptions opts, File f handle = create(chunkSizeByteLimit, opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), filePath.getAbsolutePath(), 0, 0, opts.timeUnit().typeId.getNativeId()); - if(handle == 0) { + if (handle == 0) { throw new IllegalStateException("Cannot create native chunked Parquet reader object."); } } @@ -71,18 +71,45 @@ public ParquetChunkedReader(long chunkSizeByteLimit, ParquetOptions opts, HostMe handle = create(chunkSizeByteLimit, opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), null, buffer.getAddress() + offset, len, opts.timeUnit().typeId.getNativeId()); - if(handle == 0) { + if (handle == 0) { throw new IllegalStateException("Cannot create native chunked Parquet reader object."); } } + /** + * Construct a reader instance from a DataSource + * @param chunkSizeByteLimit Limit on total number of bytes to be returned per read, + * or 0 if there is no limit. + * @param opts The options for Parquet reading. 
+ * @param ds the data source to read from + */ + public ParquetChunkedReader(long chunkSizeByteLimit, ParquetOptions opts, DataSource ds) { + dataSourceHandle = DataSourceHelper.createWrapperDataSource(ds); + if (dataSourceHandle == 0) { + throw new IllegalStateException("Cannot create native datasource object"); + } + + boolean passed = false; + try { + handle = createWithDataSource(chunkSizeByteLimit, opts.getIncludeColumnNames(), + opts.getReadBinaryAsString(), opts.timeUnit().typeId.getNativeId(), + dataSourceHandle); + passed = true; + } finally { + if (!passed) { + DataSourceHelper.destroyWrapperDataSource(dataSourceHandle); + dataSourceHandle = 0; + } + } + } + /** * Check if the given file has anything left to read. * * @return A boolean value indicating if there is more data to read from file. */ public boolean hasNext() { - if(handle == 0) { + if (handle == 0) { throw new IllegalStateException("Native chunked Parquet reader object may have been closed."); } @@ -104,7 +131,7 @@ public boolean hasNext() { * @return A table of new rows reading from the given file. */ public Table readChunk() { - if(handle == 0) { + if (handle == 0) { throw new IllegalStateException("Native chunked Parquet reader object may have been closed."); } @@ -118,6 +145,10 @@ public void close() { close(handle); handle = 0; } + if (dataSourceHandle != 0) { + DataSourceHelper.destroyWrapperDataSource(dataSourceHandle); + dataSourceHandle = 0; + } } @@ -131,6 +162,7 @@ public void close() { */ private long handle; + private long dataSourceHandle = 0; /** * Create a native chunked Parquet reader object on heap and return its memory address. @@ -147,6 +179,9 @@ public void close() { private static native long create(long chunkSizeByteLimit, String[] filterColumnNames, boolean[] binaryToString, String filePath, long bufferAddrs, long length, int timeUnit); + private static native long createWithDataSource(long chunkedSizeByteLimit, + String[] filterColumnNames, boolean[] binaryToString, int timeUnit, long dataSourceHandle); + private static native boolean hasNext(long handle); private static native long[] readChunk(long handle); diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 51a33ebb72f..3bd1e3f25a7 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -235,6 +235,14 @@ private static native long[] readCSV(String[] columnNames, byte comment, String[] nullValues, String[] trueValues, String[] falseValues) throws CudfException; + private static native long[] readCSVFromDataSource(String[] columnNames, + int[] dTypeIds, int[] dTypeScales, + String[] filterColumnNames, + int headerRow, byte delim, int quoteStyle, byte quote, + byte comment, String[] nullValues, + String[] trueValues, String[] falseValues, + long dataSourceHandle) throws CudfException; + /** * read JSON data and return a pointer to a TableWithMeta object. 
*/ @@ -244,6 +252,12 @@ private static native long readJSON(String[] columnNames, boolean dayFirst, boolean lines, boolean recoverWithNulls) throws CudfException; + private static native long readJSONFromDataSource(String[] columnNames, + int[] dTypeIds, int[] dTypeScales, + boolean dayFirst, boolean lines, + boolean recoverWithNulls, + long dsHandle) throws CudfException; + private static native long readAndInferJSON(long address, long length, boolean dayFirst, boolean lines, boolean recoverWithNulls) throws CudfException; @@ -260,6 +274,10 @@ private static native long readAndInferJSON(long address, long length, private static native long[] readParquet(String[] filterColumnNames, boolean[] binaryToString, String filePath, long address, long length, int timeUnit) throws CudfException; + private static native long[] readParquetFromDataSource(String[] filterColumnNames, + boolean[] binaryToString, int timeUnit, + long dataSourceHandle) throws CudfException; + /** * Read in Avro formatted data. * @param filterColumnNames name of the columns to read, or an empty array if we want to read @@ -271,6 +289,9 @@ private static native long[] readParquet(String[] filterColumnNames, boolean[] b private static native long[] readAvro(String[] filterColumnNames, String filePath, long address, long length) throws CudfException; + private static native long[] readAvroFromDataSource(String[] filterColumnNames, + long dataSourceHandle) throws CudfException; + /** * Setup everything to write parquet formatted data to a file. * @param columnNames names that correspond to the table columns @@ -372,6 +393,11 @@ private static native long[] readORC(String[] filterColumnNames, boolean usingNumPyTypes, int timeUnit, String[] decimal128Columns) throws CudfException; + private static native long[] readORCFromDataSource(String[] filterColumnNames, + boolean usingNumPyTypes, int timeUnit, + String[] decimal128Columns, + long dataSourceHandle) throws CudfException; + /** * Setup everything to write ORC formatted data to a file. * @param columnNames names that correspond to the table columns @@ -881,6 +907,27 @@ public static Table readCSV(Schema schema, CSVOptions opts, HostMemoryBuffer buf opts.getFalseValues())); } + public static Table readCSV(Schema schema, CSVOptions opts, DataSource ds) { + long dsHandle = DataSourceHelper.createWrapperDataSource(ds); + try { + return new Table(readCSVFromDataSource(schema.getColumnNames(), + schema.getTypeIds(), + schema.getTypeScales(), + opts.getIncludeColumnNames(), + opts.getHeaderRow(), + opts.getDelim(), + opts.getQuoteStyle().nativeId, + opts.getQuote(), + opts.getComment(), + opts.getNullValues(), + opts.getTrueValues(), + opts.getFalseValues(), + dsHandle)); + } finally { + DataSourceHelper.destroyWrapperDataSource(dsHandle); + } + } + private static native void writeCSVToFile(long table, String[] columnNames, boolean includeHeader, @@ -1128,6 +1175,24 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b } } + /** + * Read JSON formatted data. + * @param schema the schema of the data. You may use Schema.INFERRED to infer the schema. + * @param opts various JSON parsing options. + * @param ds the DataSource to read from. + * @return the data parsed as a table on the GPU. 
+ */ + public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds) { + long dsHandle = DataSourceHelper.createWrapperDataSource(ds); + try (TableWithMeta twm = new TableWithMeta(readJSONFromDataSource(schema.getColumnNames(), + schema.getTypeIds(), schema.getTypeScales(), opts.isDayFirst(), opts.isLines(), + opts.isRecoverWithNull(), dsHandle))) { + return gatherJSONColumns(schema, twm); + } finally { + DataSourceHelper.destroyWrapperDataSource(dsHandle); + } + } + /** * Read a Parquet file using the default ParquetOptions. * @param path the local file to read. @@ -1214,6 +1279,17 @@ public static Table readParquet(ParquetOptions opts, HostMemoryBuffer buffer, null, buffer.getAddress() + offset, len, opts.timeUnit().typeId.getNativeId())); } + public static Table readParquet(ParquetOptions opts, DataSource ds) { + long dataSourceHandle = DataSourceHelper.createWrapperDataSource(ds); + try { + return new Table(readParquetFromDataSource(opts.getIncludeColumnNames(), + opts.getReadBinaryAsString(), opts.timeUnit().typeId.getNativeId(), + dataSourceHandle)); + } finally { + DataSourceHelper.destroyWrapperDataSource(dataSourceHandle); + } + } + /** * Read an Avro file using the default AvroOptions. * @param path the local file to read. @@ -1297,6 +1373,16 @@ public static Table readAvro(AvroOptions opts, HostMemoryBuffer buffer, null, buffer.getAddress() + offset, len)); } + public static Table readAvro(AvroOptions opts, DataSource ds) { + long dataSourceHandle = DataSourceHelper.createWrapperDataSource(ds); + try { + return new Table(readAvroFromDataSource(opts.getIncludeColumnNames(), + dataSourceHandle)); + } finally { + DataSourceHelper.destroyWrapperDataSource(dataSourceHandle); + } + } + /** * Read a ORC file using the default ORCOptions. * @param path the local file to read. @@ -1388,6 +1474,17 @@ public static Table readORC(ORCOptions opts, HostMemoryBuffer buffer, opts.getDecimal128Columns())); } + public static Table readORC(ORCOptions opts, DataSource ds) { + long dataSourceHandle = DataSourceHelper.createWrapperDataSource(ds); + try { + return new Table(readORCFromDataSource(opts.getIncludeColumnNames(), + opts.usingNumPyTypes(), opts.timeUnit().typeId.getNativeId(), + opts.getDecimal128Columns(), dataSourceHandle)); + } finally { + DataSourceHelper.destroyWrapperDataSource(dataSourceHandle); + } + } + private static class ParquetTableWriter extends TableWriter { HostBufferConsumer consumer; @@ -2262,7 +2359,7 @@ public Table dropDuplicates(int[] keyColumns, DuplicateKeepOption keep, boolean /** * Count how many rows in the table are distinct from one another. - * @param nullEqual if nulls should be considered equal to each other or not. + * @param nullsEqual if nulls should be considered equal to each other or not. */ public int distinctCount(NullEquality nullsEqual) { return distinctCount(nativeHandle, nullsEqual.nullsEqual); diff --git a/java/src/main/java/ai/rapids/cudf/ast/Literal.java b/java/src/main/java/ai/rapids/cudf/ast/Literal.java index 427dd286b0c..4e1e886c282 100644 --- a/java/src/main/java/ai/rapids/cudf/ast/Literal.java +++ b/java/src/main/java/ai/rapids/cudf/ast/Literal.java @@ -20,6 +20,7 @@ import java.nio.ByteBuffer; import java.nio.ByteOrder; +import java.nio.charset.StandardCharsets; /** A literal value in an AST expression. 
*/ public final class Literal extends AstExpression { @@ -205,7 +206,14 @@ public static Literal ofString(String value) { if (value == null) { return ofNull(DType.STRING); } - byte[] stringBytes = value.getBytes(); + return ofUTF8String(value.getBytes(StandardCharsets.UTF_8)); + } + + /** Construct a string literal directly with byte array to skip transcoding. */ + public static Literal ofUTF8String(byte[] stringBytes) { + if (stringBytes == null) { + return ofNull(DType.STRING); + } byte[] serializedValue = new byte[stringBytes.length + Integer.BYTES]; ByteBuffer.wrap(serializedValue).order(ByteOrder.nativeOrder()).putInt(stringBytes.length); System.arraycopy(stringBytes, 0, serializedValue, Integer.BYTES, stringBytes.length); diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index 128989fe77f..01161a03dd4 100644 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -28,7 +28,7 @@ rapids_cuda_init_architectures(CUDF_JNI) project( CUDF_JNI - VERSION 23.10.00 + VERSION 23.12.00 LANGUAGES C CXX CUDA ) @@ -135,6 +135,7 @@ add_library( src/ColumnViewJni.cu src/CompiledExpression.cpp src/ContiguousTableJni.cpp + src/DataSourceHelperJni.cpp src/HashJoinJni.cpp src/HostMemoryBufferNativeUtilsJni.cpp src/NvcompJni.cpp diff --git a/java/src/main/native/src/ChunkedReaderJni.cpp b/java/src/main/native/src/ChunkedReaderJni.cpp index 8d0a8bdbfe7..0044385f267 100644 --- a/java/src/main/native/src/ChunkedReaderJni.cpp +++ b/java/src/main/native/src/ChunkedReaderJni.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -85,6 +85,40 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_create( CATCH_STD(env, 0); } +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_createWithDataSource( + JNIEnv *env, jclass, jlong chunk_read_limit, jobjectArray filter_col_names, + jbooleanArray j_col_binary_read, jint unit, jlong ds_handle) { + JNI_NULL_CHECK(env, j_col_binary_read, "Null col_binary_read", 0); + JNI_NULL_CHECK(env, ds_handle, "Null DataSouurce", 0); + + try { + cudf::jni::auto_set_device(env); + + cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names); + + // TODO: This variable is unused now, but we still don't know what to do with it yet. + // As such, it needs to stay here for a little more time before we decide to use it again, + // or remove it completely. 
+ cudf::jni::native_jbooleanArray n_col_binary_read(env, j_col_binary_read); + (void)n_col_binary_read; + + auto ds = reinterpret_cast(ds_handle); + cudf::io::source_info source{ds}; + + auto opts_builder = cudf::io::parquet_reader_options::builder(source); + if (n_filter_col_names.size() > 0) { + opts_builder = opts_builder.columns(n_filter_col_names.as_cpp_vector()); + } + auto const read_opts = opts_builder.convert_strings_to_categories(false) + .timestamp_type(cudf::data_type(static_cast(unit))) + .build(); + + return reinterpret_cast(new cudf::io::chunked_parquet_reader( + static_cast(chunk_read_limit), read_opts)); + } + CATCH_STD(env, 0); +} + JNIEXPORT jboolean JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_hasNext(JNIEnv *env, jclass, jlong handle) { JNI_NULL_CHECK(env, handle, "handle is null", false); diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 0ddaa2c15b5..7a626daff1f 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -62,7 +63,6 @@ #include #include #include -#include #include #include #include @@ -1130,7 +1130,11 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_castTo(JNIEnv *env, jclas } if (n_data_type.id() == cudf::type_id::STRING) { switch (column->type().id()) { - case cudf::type_id::BOOL8: return release_as_jlong(cudf::strings::from_booleans(*column)); + case cudf::type_id::BOOL8: { + auto const true_scalar = cudf::string_scalar("true"); + auto const false_scalar = cudf::string_scalar("false"); + return release_as_jlong(cudf::strings::from_booleans(*column, true_scalar, false_scalar)); + } case cudf::type_id::FLOAT32: case cudf::type_id::FLOAT64: return release_as_jlong(cudf::strings::from_floats(*column)); case cudf::type_id::INT8: @@ -1149,7 +1153,10 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_castTo(JNIEnv *env, jclas } } else if (column->type().id() == cudf::type_id::STRING) { switch (n_data_type.id()) { - case cudf::type_id::BOOL8: return release_as_jlong(cudf::strings::to_booleans(*column)); + case cudf::type_id::BOOL8: { + auto const true_scalar = cudf::string_scalar("true"); + return release_as_jlong(cudf::strings::to_booleans(*column, true_scalar)); + } case cudf::type_id::FLOAT32: case cudf::type_id::FLOAT64: return release_as_jlong(cudf::strings::to_floats(*column, n_data_type)); @@ -2436,7 +2443,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getJSONObject(JNIEnv *env cudf::column_view *n_column_view = reinterpret_cast(j_view_handle); cudf::strings_column_view n_strings_col_view(*n_column_view); cudf::string_scalar *n_scalar_path = reinterpret_cast(j_scalar_handle); - return release_as_jlong(cudf::strings::get_json_object(n_strings_col_view, *n_scalar_path)); + return release_as_jlong(cudf::get_json_object(n_strings_col_view, *n_scalar_path)); } CATCH_STD(env, 0) } diff --git a/java/src/main/native/src/CudfJni.cpp b/java/src/main/native/src/CudfJni.cpp index 0f143086451..d0a25d449a6 100644 --- a/java/src/main/native/src/CudfJni.cpp +++ b/java/src/main/native/src/CudfJni.cpp @@ -175,6 +175,14 @@ JNIEXPORT jint JNI_OnLoad(JavaVM *vm, void *) { return JNI_ERR; } + if (!cudf::jni::cache_data_source_jni(env)) { + if (!env->ExceptionCheck()) { + env->ThrowNew(env->FindClass("java/lang/RuntimeException"), + "Unable to locate data source helper methods needed by JNI"); + } + return JNI_ERR; + } + return 
cudf::jni::MINIMUM_JNI_VERSION; } diff --git a/java/src/main/native/src/DataSourceHelperJni.cpp b/java/src/main/native/src/DataSourceHelperJni.cpp new file mode 100644 index 00000000000..8d0e4d36413 --- /dev/null +++ b/java/src/main/native/src/DataSourceHelperJni.cpp @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "cudf_jni_apis.hpp" +#include "jni_utils.hpp" + +namespace { + +#define DATA_SOURCE_CLASS "ai/rapids/cudf/DataSource" + +jclass DataSource_jclass; +jmethodID hostRead_method; +jmethodID hostReadBuff_method; +jmethodID onHostBufferDone_method; +jmethodID deviceRead_method; + +} // anonymous namespace + +namespace cudf { +namespace jni { +bool cache_data_source_jni(JNIEnv *env) { + jclass cls = env->FindClass(DATA_SOURCE_CLASS); + if (cls == nullptr) { + return false; + } + + hostRead_method = env->GetMethodID(cls, "hostRead", "(JJJ)J"); + if (hostRead_method == nullptr) { + return false; + } + + hostReadBuff_method = env->GetMethodID(cls, "hostReadBuff", "(JJ)[J"); + if (hostReadBuff_method == nullptr) { + return false; + } + + onHostBufferDone_method = env->GetMethodID(cls, "onHostBufferDone", "(J)V"); + if (onHostBufferDone_method == nullptr) { + return false; + } + + deviceRead_method = env->GetMethodID(cls, "deviceRead", "(JJJJ)J"); + if (deviceRead_method == nullptr) { + return false; + } + + // Convert local reference to global so it cannot be garbage collected. + DataSource_jclass = static_cast(env->NewGlobalRef(cls)); + if (DataSource_jclass == nullptr) { + return false; + } + return true; +} + +void release_data_source_jni(JNIEnv *env) { + DataSource_jclass = cudf::jni::del_global_ref(env, DataSource_jclass); +} + +class host_buffer_done_callback { +public: + explicit host_buffer_done_callback(JavaVM *jvm, jobject ds, long id) : jvm(jvm), ds(ds), id(id) {} + + host_buffer_done_callback(host_buffer_done_callback const &other) = delete; + host_buffer_done_callback(host_buffer_done_callback &&other) + : jvm(other.jvm), ds(other.ds), id(other.id) { + other.jvm = nullptr; + other.ds = nullptr; + other.id = -1; + } + + host_buffer_done_callback &operator=(host_buffer_done_callback &&other) = delete; + host_buffer_done_callback &operator=(host_buffer_done_callback const &other) = delete; + + ~host_buffer_done_callback() { + // because we are in a destructor we cannot throw an exception, so for now we are + // just going to keep the java exceptions around and have them be thrown when this + // thread returns to the JVM. It might be kind of confusing, but we will not lose + // them. 
+ if (jvm != nullptr) { + // We cannot throw an exception in the destructor, so this is really best effort + JNIEnv *env = nullptr; + if (jvm->GetEnv(reinterpret_cast(&env), cudf::jni::MINIMUM_JNI_VERSION) == JNI_OK) { + env->CallVoidMethod(this->ds, onHostBufferDone_method, id); + } + } + } + +private: + JavaVM *jvm; + jobject ds; + long id; +}; + +class jni_datasource : public cudf::io::datasource { +public: + explicit jni_datasource(JNIEnv *env, jobject ds, size_t ds_size, bool device_read_supported, + size_t device_read_cutoff) + : ds_size(ds_size), device_read_supported(device_read_supported), + device_read_cutoff(device_read_cutoff) { + if (env->GetJavaVM(&jvm) < 0) { + throw std::runtime_error("GetJavaVM failed"); + } + this->ds = add_global_ref(env, ds); + } + + virtual ~jni_datasource() { + JNIEnv *env = nullptr; + if (jvm->GetEnv(reinterpret_cast(&env), cudf::jni::MINIMUM_JNI_VERSION) == JNI_OK) { + ds = del_global_ref(env, ds); + } + ds = nullptr; + } + + std::unique_ptr host_read(size_t offset, size_t size) override { + JNIEnv *env = nullptr; + if (jvm->GetEnv(reinterpret_cast(&env), cudf::jni::MINIMUM_JNI_VERSION) != JNI_OK) { + throw cudf::jni::jni_exception("Could not load JNIEnv"); + } + + jlongArray jbuffer_info = + static_cast(env->CallObjectMethod(this->ds, hostReadBuff_method, offset, size)); + if (env->ExceptionOccurred()) { + throw cudf::jni::jni_exception("Java exception in hostRead"); + } + + cudf::jni::native_jlongArray buffer_info(env, jbuffer_info); + auto ptr = reinterpret_cast(buffer_info[0]); + size_t length = buffer_info[1]; + long id = buffer_info[2]; + + cudf::jni::host_buffer_done_callback cb(this->jvm, this->ds, id); + return std::make_unique>(std::move(cb), ptr, + length); + } + + size_t host_read(size_t offset, size_t size, uint8_t *dst) override { + JNIEnv *env = nullptr; + if (jvm->GetEnv(reinterpret_cast(&env), cudf::jni::MINIMUM_JNI_VERSION) != JNI_OK) { + throw cudf::jni::jni_exception("Could not load JNIEnv"); + } + + jlong amount_read = + env->CallLongMethod(this->ds, hostRead_method, offset, size, reinterpret_cast(dst)); + if (env->ExceptionOccurred()) { + throw cudf::jni::jni_exception("Java exception in hostRead"); + } + return amount_read; + } + + size_t size() const override { return ds_size; } + + bool supports_device_read() const override { return device_read_supported; } + + bool is_device_read_preferred(size_t size) const override { + return device_read_supported && size >= device_read_cutoff; + } + + size_t device_read(size_t offset, size_t size, uint8_t *dst, + rmm::cuda_stream_view stream) override { + JNIEnv *env = nullptr; + if (jvm->GetEnv(reinterpret_cast(&env), cudf::jni::MINIMUM_JNI_VERSION) != JNI_OK) { + throw cudf::jni::jni_exception("Could not load JNIEnv"); + } + + jlong amount_read = + env->CallLongMethod(this->ds, deviceRead_method, offset, size, reinterpret_cast(dst), + reinterpret_cast(stream.value())); + if (env->ExceptionOccurred()) { + throw cudf::jni::jni_exception("Java exception in deviceRead"); + } + return amount_read; + } + + std::future device_read_async(size_t offset, size_t size, uint8_t *dst, + rmm::cuda_stream_view stream) override { + auto amount_read = device_read(offset, size, dst, stream); + // This is a bit ugly, but we don't have a good way or a need to return + // a future for the read + std::promise ret; + ret.set_value(amount_read); + return ret.get_future(); + } + +private: + size_t ds_size; + bool device_read_supported; + size_t device_read_cutoff; + JavaVM *jvm; + jobject ds; +}; +} // 
namespace jni +} // namespace cudf + +extern "C" { + +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_DataSourceHelper_createWrapperDataSource( + JNIEnv *env, jclass, jobject ds, jlong ds_size, jboolean device_read_supported, + jlong device_read_cutoff) { + JNI_NULL_CHECK(env, ds, "Null data source", 0); + try { + cudf::jni::auto_set_device(env); + auto source = + new cudf::jni::jni_datasource(env, ds, ds_size, device_read_supported, device_read_cutoff); + return reinterpret_cast(source); + } + CATCH_STD(env, 0); +} + +JNIEXPORT void JNICALL Java_ai_rapids_cudf_DataSourceHelper_destroyWrapperDataSource(JNIEnv *env, + jclass, + jlong handle) { + try { + cudf::jni::auto_set_device(env); + if (handle != 0) { + auto source = reinterpret_cast(handle); + delete (source); + } + } + CATCH_STD(env, ); +} + +} // extern "C" diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index b208ef8f381..fad19bdf895 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1135,6 +1135,67 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_merge(JNIEnv *env, jclass CATCH_STD(env, NULL); } +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readCSVFromDataSource( + JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales, + jobjectArray filter_col_names, jint header_row, jbyte delim, jint j_quote_style, jbyte quote, + jbyte comment, jobjectArray null_values, jobjectArray true_values, jobjectArray false_values, + jlong ds_handle) { + JNI_NULL_CHECK(env, null_values, "null_values must be supplied, even if it is empty", NULL); + JNI_NULL_CHECK(env, ds_handle, "no data source handle given", NULL); + + try { + cudf::jni::auto_set_device(env); + cudf::jni::native_jstringArray n_col_names(env, col_names); + cudf::jni::native_jintArray n_types(env, j_types); + cudf::jni::native_jintArray n_scales(env, j_scales); + if (n_types.is_null() != n_scales.is_null()) { + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "types and scales must match null", + NULL); + } + std::vector data_types; + if (!n_types.is_null()) { + if (n_types.size() != n_scales.size()) { + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "types and scales must match size", + NULL); + } + data_types.reserve(n_types.size()); + std::transform(n_types.begin(), n_types.end(), n_scales.begin(), + std::back_inserter(data_types), [](auto type, auto scale) { + return cudf::data_type{static_cast(type), scale}; + }); + } + + cudf::jni::native_jstringArray n_null_values(env, null_values); + cudf::jni::native_jstringArray n_true_values(env, true_values); + cudf::jni::native_jstringArray n_false_values(env, false_values); + cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names); + + auto ds = reinterpret_cast(ds_handle); + cudf::io::source_info source{ds}; + + auto const quote_style = static_cast(j_quote_style); + + cudf::io::csv_reader_options opts = cudf::io::csv_reader_options::builder(source) + .delimiter(delim) + .header(header_row) + .names(n_col_names.as_cpp_vector()) + .dtypes(data_types) + .use_cols_names(n_filter_col_names.as_cpp_vector()) + .true_values(n_true_values.as_cpp_vector()) + .false_values(n_false_values.as_cpp_vector()) + .na_values(n_null_values.as_cpp_vector()) + .keep_default_na(false) + .na_filter(n_null_values.size() > 0) + .quoting(quote_style) + .quotechar(quote) + .comment(comment) + .build(); + + return convert_table_for_return(env, cudf::io::read_csv(opts).tbl); + } + CATCH_STD(env, NULL); +} 
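
The JNI entry points above are the native half of the `DataSource` support added to the Java API earlier in this patch. As a quick orientation, here is a minimal usage sketch that is not part of the patch itself: it stages a Parquet file in host memory and reads it through the new `MultiBufferDataSource` path, mirroring the new tests in `TableTest.java`. The class name, file path argument, and the 240000-byte chunk limit are illustrative placeholders, not names introduced by this change.

```java
import java.nio.file.Files;
import java.nio.file.Paths;

import ai.rapids.cudf.HostMemoryBuffer;
import ai.rapids.cudf.MultiBufferDataSource;
import ai.rapids.cudf.ParquetChunkedReader;
import ai.rapids.cudf.ParquetOptions;
import ai.rapids.cudf.Table;

public class DataSourceReadSketch {
  public static void main(String[] args) throws Exception {
    // Stage the file in host memory; MultiBufferDataSource treats any set of
    // HostMemoryBuffers as one contiguous stream.
    byte[] bytes = Files.readAllBytes(Paths.get(args[0]));
    try (HostMemoryBuffer buf = HostMemoryBuffer.allocate(bytes.length)) {
      buf.setBytes(0, bytes, 0, bytes.length);

      // Whole-table read through the new readParquet(ParquetOptions, DataSource) overload.
      try (MultiBufferDataSource source = new MultiBufferDataSource(buf);
           Table table = Table.readParquet(ParquetOptions.DEFAULT, source)) {
        System.out.println("rows: " + table.getRowCount());
      }

      // Chunked read through the new ParquetChunkedReader(long, ParquetOptions, DataSource)
      // constructor; 240000 bytes is only an example limit.
      try (MultiBufferDataSource source = new MultiBufferDataSource(buf);
           ParquetChunkedReader reader =
               new ParquetChunkedReader(240000, ParquetOptions.DEFAULT, source)) {
        while (reader.hasNext()) {
          try (Table chunk = reader.readChunk()) {
            System.out.println("chunk rows: " + chunk.getRowCount());
          }
        }
      }
    }
  }
}
```

A fully custom source only needs to extend `DataSource` and implement `size()` plus the two `hostRead` overloads; device reads are optional and are gated by `supportsDeviceRead()` and `getDeviceReadCutoff()`.
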
+ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readCSV( JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales, jobjectArray filter_col_names, jstring inputfilepath, jlong buffer, jlong buffer_length, @@ -1407,6 +1468,72 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_TableWithMeta_releaseTable(JNIE CATCH_STD(env, nullptr); } +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource( + JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales, + jboolean day_first, jboolean lines, jboolean recover_with_null, jlong ds_handle) { + + JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0); + + try { + cudf::jni::auto_set_device(env); + cudf::jni::native_jstringArray n_col_names(env, col_names); + cudf::jni::native_jintArray n_types(env, j_types); + cudf::jni::native_jintArray n_scales(env, j_scales); + if (n_types.is_null() != n_scales.is_null()) { + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "types and scales must match null", + 0); + } + std::vector data_types; + if (!n_types.is_null()) { + if (n_types.size() != n_scales.size()) { + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "types and scales must match size", + 0); + } + data_types.reserve(n_types.size()); + std::transform(n_types.begin(), n_types.end(), n_scales.begin(), + std::back_inserter(data_types), [](auto const &type, auto const &scale) { + return cudf::data_type{static_cast(type), scale}; + }); + } + + auto ds = reinterpret_cast(ds_handle); + cudf::io::source_info source{ds}; + + cudf::io::json_recovery_mode_t recovery_mode = + recover_with_null ? cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL : + cudf::io::json_recovery_mode_t::FAIL; + cudf::io::json_reader_options_builder opts = cudf::io::json_reader_options::builder(source) + .dayfirst(static_cast(day_first)) + .lines(static_cast(lines)) + .recovery_mode(recovery_mode); + + if (!n_col_names.is_null() && data_types.size() > 0) { + if (n_col_names.size() != n_types.size()) { + JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", + "types and column names must match size", 0); + } + + std::map map; + + auto col_names_vec = n_col_names.as_cpp_vector(); + std::transform(col_names_vec.begin(), col_names_vec.end(), data_types.begin(), + std::inserter(map, map.end()), + [](std::string a, cudf::data_type b) { return std::make_pair(a, b); }); + opts.dtypes(map); + } else if (data_types.size() > 0) { + opts.dtypes(data_types); + } else { + // should infer the types + } + + auto result = + std::make_unique(cudf::io::read_json(opts.build())); + + return reinterpret_cast(result.release()); + } + CATCH_STD(env, 0); +} + JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON( JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales, jstring inputfilepath, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines, @@ -1489,6 +1616,36 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON( CATCH_STD(env, 0); } +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquetFromDataSource( + JNIEnv *env, jclass, jobjectArray filter_col_names, jbooleanArray j_col_binary_read, jint unit, + jlong ds_handle) { + + JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0); + JNI_NULL_CHECK(env, j_col_binary_read, "null col_binary_read", 0); + + try { + cudf::jni::auto_set_device(env); + + cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names); + cudf::jni::native_jbooleanArray n_col_binary_read(env, 
j_col_binary_read); + + auto ds = reinterpret_cast(ds_handle); + cudf::io::source_info source{ds}; + + auto builder = cudf::io::parquet_reader_options::builder(source); + if (n_filter_col_names.size() > 0) { + builder = builder.columns(n_filter_col_names.as_cpp_vector()); + } + + cudf::io::parquet_reader_options opts = + builder.convert_strings_to_categories(false) + .timestamp_type(cudf::data_type(static_cast(unit))) + .build(); + return convert_table_for_return(env, cudf::io::read_parquet(opts).tbl); + } + CATCH_STD(env, NULL); +} + JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet( JNIEnv *env, jclass, jobjectArray filter_col_names, jbooleanArray j_col_binary_read, jstring inputfilepath, jlong buffer, jlong buffer_length, jint unit) { @@ -1535,10 +1692,31 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet( CATCH_STD(env, NULL); } +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readAvroFromDataSource( + JNIEnv *env, jclass, jobjectArray filter_col_names, jlong ds_handle) { + + JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0); + + try { + cudf::jni::auto_set_device(env); + + cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names); + + auto ds = reinterpret_cast(ds_handle); + cudf::io::source_info source{ds}; + + cudf::io::avro_reader_options opts = cudf::io::avro_reader_options::builder(source) + .columns(n_filter_col_names.as_cpp_vector()) + .build(); + return convert_table_for_return(env, cudf::io::read_avro(opts).tbl); + } + CATCH_STD(env, NULL); +} + JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readAvro(JNIEnv *env, jclass, jobjectArray filter_col_names, jstring inputfilepath, jlong buffer, - jlong buffer_length, jint unit) { + jlong buffer_length) { const bool read_buffer = (buffer != 0); if (!read_buffer) { @@ -1715,6 +1893,38 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeParquetEnd(JNIEnv *env, jc CATCH_STD(env, ) } +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readORCFromDataSource( + JNIEnv *env, jclass, jobjectArray filter_col_names, jboolean usingNumPyTypes, jint unit, + jobjectArray dec128_col_names, jlong ds_handle) { + + JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0); + + try { + cudf::jni::auto_set_device(env); + + cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names); + + cudf::jni::native_jstringArray n_dec128_col_names(env, dec128_col_names); + + auto ds = reinterpret_cast(ds_handle); + cudf::io::source_info source{ds}; + + auto builder = cudf::io::orc_reader_options::builder(source); + if (n_filter_col_names.size() > 0) { + builder = builder.columns(n_filter_col_names.as_cpp_vector()); + } + + cudf::io::orc_reader_options opts = + builder.use_index(false) + .use_np_dtypes(static_cast(usingNumPyTypes)) + .timestamp_type(cudf::data_type(static_cast(unit))) + .decimal128_columns(n_dec128_col_names.as_cpp_vector()) + .build(); + return convert_table_for_return(env, cudf::io::read_orc(opts).tbl); + } + CATCH_STD(env, NULL); +} + JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readORC( JNIEnv *env, jclass, jobjectArray filter_col_names, jstring inputfilepath, jlong buffer, jlong buffer_length, jboolean usingNumPyTypes, jint unit, jobjectArray dec128_col_names) { diff --git a/java/src/main/native/src/cudf_jni_apis.hpp b/java/src/main/native/src/cudf_jni_apis.hpp index 867df80b722..bd82bbd2899 100644 --- a/java/src/main/native/src/cudf_jni_apis.hpp +++ b/java/src/main/native/src/cudf_jni_apis.hpp @@ -134,5 +134,13 @@ void 
auto_set_device(JNIEnv *env); */ void device_memset_async(JNIEnv *env, rmm::device_buffer &buf, char value); +// +// DataSource APIs +// + +bool cache_data_source_jni(JNIEnv *env); + +void release_data_source_jni(JNIEnv *env); + } // namespace jni } // namespace cudf diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index faa73ac4322..b0dd4122b0e 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -327,6 +327,25 @@ void testReadJSONFile() { } } + @Test + void testReadJSONFromDataSource() throws IOException { + Schema schema = Schema.builder() + .column(DType.STRING, "name") + .column(DType.INT32, "age") + .build(); + JSONOptions opts = JSONOptions.builder() + .withLines(true) + .build(); + try (Table expected = new Table.TestBuilder() + .column("Michael", "Andy", "Justin") + .column(null, 30, 19) + .build(); + MultiBufferDataSource source = sourceFrom(TEST_SIMPLE_JSON_FILE); + Table table = Table.readJSON(schema, opts, source)) { + assertTablesAreEqual(expected, table); + } + } + @Test void testReadJSONFileWithInvalidLines() { Schema schema = Schema.builder() @@ -560,6 +579,126 @@ void testReadCSVBuffer() { } } + byte[][] sliceBytes(byte[] data, int slices) { + slices = Math.min(data.length, slices); + // We are not going to worry about making it super even here. + // The last one gets the extras. + int bytesPerSlice = data.length / slices; + byte[][] ret = new byte[slices][]; + int startingAt = 0; + for (int i = 0; i < (slices - 1); i++) { + ret[i] = new byte[bytesPerSlice]; + System.arraycopy(data, startingAt, ret[i], 0, bytesPerSlice); + startingAt += bytesPerSlice; + } + // Now for the last one + ret[slices - 1] = new byte[data.length - startingAt]; + System.arraycopy(data, startingAt, ret[slices - 1], 0, data.length - startingAt); + return ret; + } + + @Test + void testReadCSVBufferMultiBuffer() { + CSVOptions opts = CSVOptions.builder() + .includeColumn("A") + .includeColumn("B") + .hasHeader() + .withDelim('|') + .withQuote('\'') + .withNullValue("NULL") + .build(); + byte[][] data = sliceBytes(CSV_DATA_BUFFER, 10); + try (Table expected = new Table.TestBuilder() + .column(0, 1, 2, 3, 4, 5, 6, 7, 8, 9) + .column(110.0, 111.0, 112.0, 113.0, 114.0, 115.0, 116.0, null, 118.2, 119.8) + .build(); + MultiBufferDataSource source = sourceFrom(data); + Table table = Table.readCSV(TableTest.CSV_DATA_BUFFER_SCHEMA, opts, source)) { + assertTablesAreEqual(expected, table); + } + } + + public static byte[] arrayFrom(File f) throws IOException { + long len = f.length(); + if (len > Integer.MAX_VALUE) { + throw new IllegalArgumentException("Sorry cannot read " + f + + " into an array it does not fit"); + } + int remaining = (int)len; + byte[] ret = new byte[remaining]; + try (java.io.FileInputStream fin = new java.io.FileInputStream(f)) { + int at = 0; + while (remaining > 0) { + int amount = fin.read(ret, at, remaining); + at += amount; + remaining -= amount; + } + } + return ret; + } + + public static MultiBufferDataSource sourceFrom(File f) throws IOException { + long len = f.length(); + byte[] tmp = new byte[(int)Math.min(32 * 1024, len)]; + try (HostMemoryBuffer buffer = HostMemoryBuffer.allocate(len)) { + try (java.io.FileInputStream fin = new java.io.FileInputStream(f)) { + long at = 0; + while (at < len) { + int amount = fin.read(tmp); + buffer.setBytes(at, tmp, 0, amount); + at += amount; + } + } + return new MultiBufferDataSource(buffer); + } + } + + public 
static MultiBufferDataSource sourceFrom(byte[] data) { + long len = data.length; + try (HostMemoryBuffer buffer = HostMemoryBuffer.allocate(len)) { + buffer.setBytes(0, data, 0, len); + return new MultiBufferDataSource(buffer); + } + } + + public static MultiBufferDataSource sourceFrom(byte[][] data) { + HostMemoryBuffer[] buffers = new HostMemoryBuffer[data.length]; + try { + for (int i = 0; i < data.length; i++) { + byte[] subData = data[i]; + buffers[i] = HostMemoryBuffer.allocate(subData.length); + buffers[i].setBytes(0, subData, 0, subData.length); + } + return new MultiBufferDataSource(buffers); + } finally { + for (HostMemoryBuffer buffer: buffers) { + if (buffer != null) { + buffer.close(); + } + } + } + } + + @Test + void testReadCSVDataSource() { + CSVOptions opts = CSVOptions.builder() + .includeColumn("A") + .includeColumn("B") + .hasHeader() + .withDelim('|') + .withQuote('\'') + .withNullValue("NULL") + .build(); + try (Table expected = new Table.TestBuilder() + .column(0, 1, 2, 3, 4, 5, 6, 7, 8, 9) + .column(110.0, 111.0, 112.0, 113.0, 114.0, 115.0, 116.0, null, 118.2, 119.8) + .build(); + MultiBufferDataSource source = sourceFrom(TableTest.CSV_DATA_BUFFER); + Table table = Table.readCSV(TableTest.CSV_DATA_BUFFER_SCHEMA, opts, source)) { + assertTablesAreEqual(expected, table); + } + } + @Test void testReadCSVWithOffset() { CSVOptions opts = CSVOptions.builder() @@ -864,6 +1003,37 @@ void testReadParquet() { } } + @Test + void testReadParquetFromDataSource() throws IOException { + ParquetOptions opts = ParquetOptions.builder() + .includeColumn("loan_id") + .includeColumn("zip") + .includeColumn("num_units") + .build(); + try (MultiBufferDataSource source = sourceFrom(TEST_PARQUET_FILE); + Table table = Table.readParquet(opts, source)) { + long rows = table.getRowCount(); + assertEquals(1000, rows); + assertTableTypes(new DType[]{DType.INT64, DType.INT32, DType.INT32}, table); + } + } + + @Test + void testReadParquetMultiBuffer() throws IOException { + ParquetOptions opts = ParquetOptions.builder() + .includeColumn("loan_id") + .includeColumn("zip") + .includeColumn("num_units") + .build(); + byte [][] data = sliceBytes(arrayFrom(TEST_PARQUET_FILE), 10); + try (MultiBufferDataSource source = sourceFrom(data); + Table table = Table.readParquet(opts, source)) { + long rows = table.getRowCount(); + assertEquals(1000, rows); + assertTableTypes(new DType[]{DType.INT64, DType.INT32, DType.INT32}, table); + } + } + @Test void testReadParquetBinary() { ParquetOptions opts = ParquetOptions.builder() @@ -1018,6 +1188,23 @@ void testChunkedReadParquet() { } } + @Test + void testChunkedReadParquetFromDataSource() throws IOException { + try (MultiBufferDataSource source = sourceFrom(TEST_PARQUET_FILE_CHUNKED_READ); + ParquetChunkedReader reader = new ParquetChunkedReader(240000, ParquetOptions.DEFAULT, source)) { + int numChunks = 0; + long totalRows = 0; + while(reader.hasNext()) { + ++numChunks; + try(Table chunk = reader.readChunk()) { + totalRows += chunk.getRowCount(); + } + } + assertEquals(2, numChunks); + assertEquals(40000, totalRows); + } + } + @Test void testReadAvro() { AvroOptions opts = AvroOptions.builder() @@ -1037,6 +1224,26 @@ void testReadAvro() { } } + @Test + void testReadAvroFromDataSource() throws IOException { + AvroOptions opts = AvroOptions.builder() + .includeColumn("bool_col") + .includeColumn("int_col") + .includeColumn("timestamp_col") + .build(); + + try (Table expected = new Table.TestBuilder() + .column(true, false, true, false, true, false, true, false) + 
.column(0, 1, 0, 1, 0, 1, 0, 1) + .column(1235865600000000L, 1235865660000000L, 1238544000000000L, 1238544060000000L, + 1233446400000000L, 1233446460000000L, 1230768000000000L, 1230768060000000L) + .build(); + MultiBufferDataSource source = sourceFrom(TEST_ALL_TYPES_PLAIN_AVRO_FILE); + Table table = Table.readAvro(opts, source)) { + assertTablesAreEqual(expected, table); + } + } + @Test void testReadAvroBuffer() throws IOException{ AvroOptions opts = AvroOptions.builder() @@ -1094,6 +1301,24 @@ void testReadORC() { } } + @Test + void testReadORCFromDataSource() throws IOException { + ORCOptions opts = ORCOptions.builder() + .includeColumn("string1") + .includeColumn("float1") + .includeColumn("int1") + .build(); + try (Table expected = new Table.TestBuilder() + .column("hi","bye") + .column(1.0f,2.0f) + .column(65536,65536) + .build(); + MultiBufferDataSource source = sourceFrom(TEST_ORC_FILE); + Table table = Table.readORC(opts, source)) { + assertTablesAreEqual(expected, table); + } + } + @Test void testReadORCBuffer() throws IOException { ORCOptions opts = ORCOptions.builder() diff --git a/python/cudf/CMakeLists.txt b/python/cudf/CMakeLists.txt index 6f3e428d291..a8b91c27095 100644 --- a/python/cudf/CMakeLists.txt +++ b/python/cudf/CMakeLists.txt @@ -14,7 +14,7 @@ cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR) -set(cudf_version 23.10.00) +set(cudf_version 23.12.00) include(../../fetch_rapids.cmake) include(rapids-cuda) diff --git a/python/cudf/cudf/VERSION b/python/cudf/cudf/VERSION new file mode 120000 index 00000000000..d62dc733efd --- /dev/null +++ b/python/cudf/cudf/VERSION @@ -0,0 +1 @@ +../../../VERSION \ No newline at end of file diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index e5c78fca893..02274a5fdd1 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -17,6 +17,7 @@ from rmm.allocators.numba import RMMNumbaManager from cudf import api, core, datasets, testing +from cudf._version import __git_commit__, __version__ from cudf.api.extensions import ( register_dataframe_accessor, register_index_accessor, @@ -99,8 +100,6 @@ rmm.register_reinitialize_hook(clear_cache) -__version__ = "23.10.00" - __all__ = [ "BaseIndex", "CategoricalDtype", diff --git a/python/cudf/cudf/_fuzz_testing/orc.py b/python/cudf/cudf/_fuzz_testing/orc.py index 65d2e09988f..ecddc72fa85 100644 --- a/python/cudf/cudf/_fuzz_testing/orc.py +++ b/python/cudf/cudf/_fuzz_testing/orc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. 
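The orc.py fuzz-harness hunks that follow replace the pyorc dependency with pyarrow's ORC module: test files are now written with pa.orc.write_table and individual stripes are read back through pa.orc.ORCFile. As a minimal, standalone sketch of those pyarrow.orc calls (plain pyarrow, nothing cudf-specific; the harness additionally passes a stripe_size= keyword when writing):

import io

import pyarrow as pa
import pyarrow.orc as orc

# Write a small table to an in-memory buffer, then read it back one stripe
# at a time and reassemble it, mirroring the updated fuzz harness.
buf = io.BytesIO()
table = pa.table({"a": [1, 2, 3], "b": ["x", "y", "z"]})
orc.write_table(table, buf)

buf.seek(0)
orc_file = orc.ORCFile(buf)
batches = [orc_file.read_stripe(i) for i in range(orc_file.nstripes)]
round_tripped = pa.Table.from_batches(batches).to_pandas()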
import copy import io @@ -6,14 +6,13 @@ import random import numpy as np -import pyorc +import pyarrow as pa import cudf from cudf._fuzz_testing.io import IOFuzz from cudf._fuzz_testing.utils import ( ALL_POSSIBLE_VALUES, _generate_rand_meta, - pandas_to_orc, pyarrow_to_pandas, ) from cudf.testing import dataset_generator as dg @@ -82,12 +81,7 @@ def generate_input(self): logging.info(f"Shape of DataFrame generated: {table.shape}") self._df = df file_obj = io.BytesIO() - pandas_to_orc( - df, - file_io_obj=file_obj, - stripe_size=self._rand(len(df)), - arrow_table_schema=table.schema, - ) + pa.orc.write_table(table, file_obj, stripe_size=self._rand(len(df))) file_obj.seek(0) buf = file_obj.read() self._current_buffer = copy.copy(buf) @@ -109,8 +103,8 @@ def set_rand_params(self, params): ) elif param == "stripes": f = io.BytesIO(self._current_buffer) - reader = pyorc.Reader(f) - stripes = [i for i in range(reader.num_of_stripes)] + orcFile = pa.orc.ORCFile(f) + stripes = list(range(orcFile.nstripes)) params_dict[param] = np.random.choice( [ None, @@ -119,7 +113,7 @@ def set_rand_params(self, params): int, np.unique( np.random.choice( - stripes, reader.num_of_stripes + stripes, orcFile.nstripes ) ), ) diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py index 03418e00cde..0c88c1aeacd 100644 --- a/python/cudf/cudf/_fuzz_testing/utils.py +++ b/python/cudf/cudf/_fuzz_testing/utils.py @@ -1,13 +1,11 @@ # Copyright (c) 2020-2023, NVIDIA CORPORATION. import random -from collections import OrderedDict import fastavro import numpy as np import pandas as pd import pyarrow as pa -import pyorc import cudf from cudf.testing._utils import assert_eq @@ -41,40 +39,6 @@ cudf.dtype(" can result in incorrect dtype by pandas. - df = df.astype(dtypes) + orc_file = pa.orc.ORCFile(f) + records = [orc_file.read_stripe(i) for i in stripes] + pa_table = pa.Table.from_batches(records) + df = pa_table.to_pandas() return df diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index 947659c290a..c041c7f4842 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -81,12 +81,6 @@ target_link_libraries(strings_udf cudf_strings_udf) # necessary. The relevant command is tar -xf /opt/_internal/static-libs-for-embedding-only.tar.xz -C # /opt/_internal" find_package(NumPy REQUIRED) -set(targets_using_numpy interop avro csv orc json parquet) -foreach(target IN LISTS targets_using_numpy) - target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}") - # Switch to the line below when we switch back to FindPython.cmake in CMake 3.24. - # target_include_directories(${target} PRIVATE "${Python_NumPy_INCLUDE_DIRS}") -endforeach() set(targets_using_dlpack interop) foreach(target IN LISTS targets_using_dlpack) @@ -107,8 +101,12 @@ if(${PYARROW_RESULT}) message(FATAL_ERROR "Error while trying to obtain pyarrow include directory:\n${PYARROW_ERROR}") endif() -set(targets_using_arrow_headers interop avro csv orc json parquet) -foreach(target IN LISTS targets_using_arrow_headers) +# TODO: Due to cudf's scalar.pyx needing to cimport pylibcudf's scalar.pyx (because there are parts +# of cudf Cython that need to directly access the c_obj underlying the pylibcudf Scalar) the +# requirement for arrow headers infects all of cudf. That in turn requires including numpy headers. +# These requirements will go away once all scalar-related Cython code is removed from cudf. 
+foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) + target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}") target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}") endforeach() diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index f751d73b142..0edf9f8aa95 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -24,7 +24,7 @@ from cudf.utils.dtypes import _get_base_dtype from cpython.buffer cimport PyObject_CheckBuffer from libc.stdint cimport uintptr_t -from libcpp.memory cimport unique_ptr +from libcpp.memory cimport make_unique, unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector @@ -47,7 +47,6 @@ from cudf._lib.cpp.column.column_factories cimport ( make_numeric_column, ) from cudf._lib.cpp.column.column_view cimport column_view -from cudf._lib.cpp.libcpp.memory cimport make_unique from cudf._lib.cpp.null_mask cimport null_count as cpp_null_count from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.scalar cimport DeviceScalar diff --git a/python/cudf/cudf/_lib/concat.pyx b/python/cudf/cudf/_lib/concat.pyx index feaf75ef237..1ec4719631e 100644 --- a/python/cudf/cudf/_lib/concat.pyx +++ b/python/cudf/cudf/_lib/concat.pyx @@ -1,7 +1,7 @@ # Copyright (c) 2020-2023, NVIDIA CORPORATION. from libcpp cimport bool -from libcpp.memory cimport unique_ptr +from libcpp.memory cimport make_unique, unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector @@ -12,7 +12,6 @@ from cudf._lib.cpp.concatenate cimport ( concatenate_masks as libcudf_concatenate_masks, concatenate_tables as libcudf_concatenate_tables, ) -from cudf._lib.cpp.libcpp.memory cimport make_unique from cudf._lib.cpp.table.table cimport table, table_view from cudf._lib.utils cimport ( data_from_unique_ptr, diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index f57bc15ed57..ea6ee76c14a 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -24,12 +24,13 @@ from cudf._lib.utils cimport table_view_from_columns, table_view_from_table from cudf._lib.reduce import minmax from cudf.core.abc import Serializable +from libcpp.functional cimport reference_wrapper +from libcpp.memory cimport make_unique + cimport cudf._lib.cpp.contiguous_split as cpp_contiguous_split cimport cudf._lib.cpp.copying as cpp_copying from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view, mutable_column_view -from cudf._lib.cpp.libcpp.functional cimport reference_wrapper -from cudf._lib.cpp.libcpp.memory cimport make_unique from cudf._lib.cpp.lists.gather cimport ( segmented_gather as cpp_segmented_gather, ) diff --git a/python/cudf/cudf/_lib/cpp/copying.pxd b/python/cudf/cudf/_lib/cpp/copying.pxd index 20725c252fc..5637b55ac1c 100644 --- a/python/cudf/cudf/_lib/cpp/copying.pxd +++ b/python/cudf/cudf/_lib/cpp/copying.pxd @@ -2,6 +2,7 @@ from libc.stdint cimport int32_t, int64_t, uint8_t from libcpp cimport bool +from libcpp.functional cimport reference_wrapper from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector @@ -9,7 +10,6 @@ from rmm._lib.device_buffer cimport device_buffer from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view, mutable_column_view -from cudf._lib.cpp.libcpp.functional cimport reference_wrapper from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.table.table cimport table from 
cudf._lib.cpp.table.table_view cimport table_view diff --git a/python/cudf/cudf/_lib/cpp/groupby.pxd b/python/cudf/cudf/_lib/cpp/groupby.pxd index 2ecdf76842f..0266404fc50 100644 --- a/python/cudf/cudf/_lib/cpp/groupby.pxd +++ b/python/cudf/cudf/_lib/cpp/groupby.pxd @@ -1,6 +1,7 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from libcpp cimport bool +from libcpp.functional cimport reference_wrapper from libcpp.memory cimport unique_ptr from libcpp.pair cimport pair from libcpp.vector cimport vector @@ -11,7 +12,6 @@ from cudf._lib.cpp.aggregation cimport ( ) from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view -from cudf._lib.cpp.libcpp.functional cimport reference_wrapper from cudf._lib.cpp.replace cimport replace_policy from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.table.table cimport table diff --git a/python/cudf/cudf/_lib/cpp/io/orc.pxd b/python/cudf/cudf/_lib/cpp/io/orc.pxd index dd6f919a74d..d5ac8574fe4 100644 --- a/python/cudf/cudf/_lib/cpp/io/orc.pxd +++ b/python/cudf/cudf/_lib/cpp/io/orc.pxd @@ -4,12 +4,12 @@ from libc.stdint cimport uint8_t from libcpp cimport bool from libcpp.map cimport map from libcpp.memory cimport shared_ptr, unique_ptr +from libcpp.optional cimport optional from libcpp.string cimport string from libcpp.vector cimport vector cimport cudf._lib.cpp.io.types as cudf_io_types cimport cudf._lib.cpp.table.table_view as cudf_table_view -from cudf._lib.cpp.libcpp.optional cimport optional from cudf._lib.cpp.types cimport data_type, size_type diff --git a/python/cudf/cudf/_lib/cpp/io/parquet.pxd b/python/cudf/cudf/_lib/cpp/io/parquet.pxd index 2b92b9b58d3..cdd1bde0274 100644 --- a/python/cudf/cudf/_lib/cpp/io/parquet.pxd +++ b/python/cudf/cudf/_lib/cpp/io/parquet.pxd @@ -2,16 +2,16 @@ from libc.stdint cimport uint8_t from libcpp cimport bool +from libcpp.functional cimport reference_wrapper from libcpp.map cimport map from libcpp.memory cimport shared_ptr, unique_ptr +from libcpp.optional cimport optional from libcpp.string cimport string from libcpp.vector cimport vector cimport cudf._lib.cpp.io.types as cudf_io_types cimport cudf._lib.cpp.table.table_view as cudf_table_view from cudf._lib.cpp.expressions cimport expression -from cudf._lib.cpp.libcpp.functional cimport reference_wrapper -from cudf._lib.cpp.libcpp.optional cimport optional from cudf._lib.cpp.types cimport data_type, size_type @@ -90,10 +90,18 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: void set_column_chunks_file_paths( vector[string] column_chunks_file_paths ) except + + void set_int96_timestamps( + bool enabled + ) except + + void set_utc_timestamps( + bool enabled + ) except + void set_row_group_size_bytes(size_t val) except + void set_row_group_size_rows(size_type val) except + void set_max_page_size_bytes(size_t val) except + void set_max_page_size_rows(size_type val) except + + void enable_write_v2_headers(bool val) except + + void set_dictionary_policy(cudf_io_types.dictionary_policy policy)except + @staticmethod parquet_writer_options_builder builder( @@ -129,6 +137,9 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: parquet_writer_options_builder& int96_timestamps( bool enabled ) except + + parquet_writer_options_builder& utc_timestamps( + bool enabled + ) except + parquet_writer_options_builder& row_group_size_bytes( size_t val ) except + @@ -141,6 +152,12 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" 
nogil: parquet_writer_options_builder& max_page_size_rows( size_type val ) except + + parquet_writer_options_builder& write_v2_headers( + bool val + ) except + + parquet_writer_options_builder& dictionary_policy( + cudf_io_types.dictionary_policy val + ) except + parquet_writer_options build() except + @@ -172,10 +189,18 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: void set_compression( cudf_io_types.compression_type compression ) except + + void set_int96_timestamps( + bool enabled + ) except + + void set_utc_timestamps( + bool enabled + ) except + void set_row_group_size_bytes(size_t val) except + void set_row_group_size_rows(size_type val) except + void set_max_page_size_bytes(size_t val) except + void set_max_page_size_rows(size_type val) except + + void enable_write_v2_headers(bool val) except + + void set_dictionary_policy(cudf_io_types.dictionary_policy policy)except + @staticmethod chunked_parquet_writer_options_builder builder( @@ -199,6 +224,12 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: chunked_parquet_writer_options_builder& compression( cudf_io_types.compression_type compression ) except + + chunked_parquet_writer_options_builder& int96_timestamps( + bool enabled + ) except + + chunked_parquet_writer_options_builder& utc_timestamps( + bool enabled + ) except + chunked_parquet_writer_options_builder& row_group_size_bytes( size_t val ) except + @@ -211,6 +242,12 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: chunked_parquet_writer_options_builder& max_page_size_rows( size_type val ) except + + parquet_writer_options_builder& write_v2_headers( + bool val + ) except + + parquet_writer_options_builder& dictionary_policy( + cudf_io_types.dictionary_policy val + ) except + chunked_parquet_writer_options build() except + diff --git a/python/cudf/cudf/_lib/cpp/io/timezone.pxd b/python/cudf/cudf/_lib/cpp/io/timezone.pxd index ba481d9a1d3..927c2118473 100644 --- a/python/cudf/cudf/_lib/cpp/io/timezone.pxd +++ b/python/cudf/cudf/_lib/cpp/io/timezone.pxd @@ -2,9 +2,9 @@ from libcpp cimport bool from libcpp.memory cimport unique_ptr +from libcpp.optional cimport optional from libcpp.string cimport string -from cudf._lib.cpp.libcpp.optional cimport optional from cudf._lib.cpp.table.table cimport table diff --git a/python/cudf/cudf/_lib/cpp/io/types.pxd b/python/cudf/cudf/_lib/cpp/io/types.pxd index 01eaca82692..d8cc329b0a0 100644 --- a/python/cudf/cudf/_lib/cpp/io/types.pxd +++ b/python/cudf/cudf/_lib/cpp/io/types.pxd @@ -52,6 +52,11 @@ cdef extern from "cudf/io/types.hpp" \ STATISTICS_PAGE = 2, STATISTICS_COLUMN = 3, + ctypedef enum dictionary_policy: + NEVER = 0, + ADAPTIVE = 1, + ALWAYS = 2, + cdef cppclass column_name_info: string name vector[column_name_info] children diff --git a/python/cudf/cudf/_lib/cpp/libcpp/__init__.pxd b/python/cudf/cudf/_lib/cpp/libcpp/__init__.pxd deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_lib/cpp/libcpp/__init__.py b/python/cudf/cudf/_lib/cpp/libcpp/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_lib/cpp/libcpp/functional.pxd b/python/cudf/cudf/_lib/cpp/libcpp/functional.pxd deleted file mode 100644 index f3e2d6d0878..00000000000 --- a/python/cudf/cudf/_lib/cpp/libcpp/functional.pxd +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. 
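The parquet writer options plumbed through above (utc_timestamps, write_v2_headers, dictionary_policy) surface later in this diff as two new keywords on write_parquet and DataFrame.to_parquet: header_version ("1.0" or "2.0") and use_dictionary, which maps to the ALWAYS/NEVER dictionary_policy values. A hedged user-level sketch, assuming the keywords are forwarded as the later hunks indicate:

import cudf

df = cudf.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

# header_version selects V1 vs V2 data page headers; use_dictionary=False
# disables dictionary encoding via the NEVER dictionary policy.
df.to_parquet("out.parquet", header_version="2.0", use_dictionary=False)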
- - -cdef extern from "" namespace "std" nogil: - cdef cppclass reference_wrapper[T]: - reference_wrapper() - reference_wrapper(T) diff --git a/python/cudf/cudf/_lib/cpp/libcpp/memory.pxd b/python/cudf/cudf/_lib/cpp/libcpp/memory.pxd deleted file mode 100644 index 2178f1a940c..00000000000 --- a/python/cudf/cudf/_lib/cpp/libcpp/memory.pxd +++ /dev/null @@ -1,12 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. - -from libcpp.memory cimport unique_ptr - - -cdef extern from "" namespace "std" nogil: - # The Cython standard header does not have except +, so C++ - # exceptions from make_unique are not caught and translated to - # Python ones. This is not perfectly ergonomic, we always have to - # wrap make_unique in move, but at least we can catch exceptions. - # See https://github.com/cython/cython/issues/5560 - unique_ptr[T] make_unique[T](...) except + diff --git a/python/cudf/cudf/_lib/cpp/libcpp/optional.pxd b/python/cudf/cudf/_lib/cpp/libcpp/optional.pxd deleted file mode 100644 index a78c18f3f7a..00000000000 --- a/python/cudf/cudf/_lib/cpp/libcpp/optional.pxd +++ /dev/null @@ -1,50 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023, NVIDIA CORPORATION & -# AFFILIATES. All rights reserved. SPDX-License-Identifier: -# Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from libcpp cimport bool - - -cdef extern from "" namespace "std" nogil: - cdef cppclass nullopt_t: - nullopt_t() - - cdef nullopt_t nullopt - - cdef cppclass optional[T]: - ctypedef T value_type - optional() - optional(nullopt_t) - optional(optional&) except + - optional(T&) except + - bool has_value() - T& value() - T& value_or[U](U& default_value) - void swap(optional&) - void reset() - T& emplace(...) - T& operator*() - optional& operator=(optional&) - optional& operator=[U](U&) - bool operator bool() - bool operator!() - bool operator==[U](optional&, U&) - bool operator!=[U](optional&, U&) - bool operator<[U](optional&, U&) - bool operator>[U](optional&, U&) - bool operator<=[U](optional&, U&) - bool operator>=[U](optional&, U&) - - optional[T] make_optional[T](...) except + diff --git a/python/cudf/cudf/_lib/cpp/nvtext/byte_pair_encode.pxd b/python/cudf/cudf/_lib/cpp/nvtext/byte_pair_encode.pxd new file mode 100644 index 00000000000..e678e4e84db --- /dev/null +++ b/python/cudf/cudf/_lib/cpp/nvtext/byte_pair_encode.pxd @@ -0,0 +1,24 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. 
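The new nvtext byte_pair_encode.pxd, whose declarations continue below, binds load_merge_pairs and byte_pair_encoding; the Python-facing wrapper built on them, cudf.core.byte_pair_encoding.BytePairEncoder, is added further down in this diff. A usage sketch mirroring that class's docstring (assumes a cudf build that includes these changes):

import cudf
from cudf.core.byte_pair_encoding import BytePairEncoder

# Merge pairs are supplied as a strings column, one "left right" pair per row.
merge_pairs = cudf.Series(
    ["e n", "i t", "i s", "e s", "en t", "c e", "es t",
     "en ce", "T h", "Th is", "t est", "s ent", "t h", "th is"]
)
bpe = BytePairEncoder(merge_pairs)

strings = cudf.Series(["This is the sentence", "thisisit"])
encoded = bpe(strings)                     # pieces joined with the default " "
encoded_alt = bpe(strings, separator="_")  # or a caller-supplied separator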
+ +from libcpp.memory cimport unique_ptr +from libcpp.string cimport string + +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.scalar.scalar cimport string_scalar + + +cdef extern from "nvtext/byte_pair_encoding.hpp" namespace "nvtext" nogil: + + cdef struct bpe_merge_pairs "nvtext::bpe_merge_pairs": + pass + + cdef unique_ptr[bpe_merge_pairs] load_merge_pairs( + const column_view &merge_pairs + ) except + + + cdef unique_ptr[column] byte_pair_encoding( + const column_view &strings, + const bpe_merge_pairs &merge_pairs, + const string_scalar &separator + ) except + diff --git a/python/cudf/cudf/_lib/cpp/strings/json.pxd b/python/cudf/cudf/_lib/cpp/strings/json.pxd index a017e1c5382..eed627c96b5 100644 --- a/python/cudf/cudf/_lib/cpp/strings/json.pxd +++ b/python/cudf/cudf/_lib/cpp/strings/json.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.memory cimport unique_ptr @@ -9,7 +9,7 @@ from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.scalar.scalar cimport scalar, string_scalar -cdef extern from "cudf/strings/json.hpp" namespace "cudf::strings" nogil: +cdef extern from "cudf/json/json.hpp" namespace "cudf" nogil: cdef cppclass get_json_object_options: get_json_object_options() except + # getters diff --git a/python/cudf/cudf/_lib/datetime.pyx b/python/cudf/cudf/_lib/datetime.pyx index 81949dbaa20..3d96f59c4d6 100644 --- a/python/cudf/cudf/_lib/datetime.pyx +++ b/python/cudf/cudf/_lib/datetime.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from cudf.core.buffer import acquire_spill_lock @@ -10,6 +10,7 @@ from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.filling cimport calendrical_month_sequence +from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.types cimport size_type from cudf._lib.scalar cimport DeviceScalar @@ -166,10 +167,11 @@ def date_range(DeviceScalar start, size_type n, offset): + offset.kwds.get("months", 0) ) + cdef const scalar* c_start = start.c_value.get() with nogil: c_result = move(calendrical_month_sequence( n, - start.c_value.get()[0], + c_start[0], months )) return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/expressions.pyx b/python/cudf/cudf/_lib/expressions.pyx index 8d7545ffe15..01a080f635f 100644 --- a/python/cudf/cudf/_lib/expressions.pyx +++ b/python/cudf/cudf/_lib/expressions.pyx @@ -4,12 +4,11 @@ from enum import Enum from cython.operator cimport dereference from libc.stdint cimport int64_t -from libcpp.memory cimport unique_ptr +from libcpp.memory cimport make_unique, unique_ptr from libcpp.string cimport string from libcpp.utility cimport move from cudf._lib.cpp cimport expressions as libcudf_exp -from cudf._lib.cpp.libcpp.memory cimport make_unique from cudf._lib.cpp.types cimport size_type # Necessary for proper casting, see below. 
diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index a26d820de6f..b3778e45cde 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -24,6 +24,8 @@ from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns from cudf._lib.scalar import as_device_scalar +from libcpp.functional cimport reference_wrapper + cimport cudf._lib.cpp.groupby as libcudf_groupby cimport cudf._lib.cpp.types as libcudf_types from cudf._lib.aggregation cimport ( @@ -33,7 +35,6 @@ from cudf._lib.aggregation cimport ( make_groupby_scan_aggregation, ) from cudf._lib.cpp.column.column cimport column -from cudf._lib.cpp.libcpp.functional cimport reference_wrapper from cudf._lib.cpp.replace cimport replace_policy from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.table.table cimport table, table_view diff --git a/python/cudf/cudf/_lib/interop.pyx b/python/cudf/cudf/_lib/interop.pyx index 639754fc54f..8fd2a409d90 100644 --- a/python/cudf/cudf/_lib/interop.pyx +++ b/python/cudf/cudf/_lib/interop.pyx @@ -4,14 +4,7 @@ from cpython cimport pycapsule from libcpp.memory cimport shared_ptr, unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector -from pyarrow.lib cimport ( - CScalar, - CTable, - pyarrow_unwrap_scalar, - pyarrow_unwrap_table, - pyarrow_wrap_scalar, - pyarrow_wrap_table, -) +from pyarrow.lib cimport CTable, pyarrow_unwrap_table, pyarrow_wrap_table from cudf._lib.cpp.interop cimport ( DLManagedTensor, @@ -21,22 +14,12 @@ from cudf._lib.cpp.interop cimport ( to_arrow as cpp_to_arrow, to_dlpack as cpp_to_dlpack, ) -from cudf._lib.cpp.scalar.scalar cimport fixed_point_scalar, scalar from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view -from cudf._lib.cpp.types cimport type_id -from cudf._lib.cpp.wrappers.decimals cimport ( - decimal32, - decimal64, - decimal128, - scale_type, -) -from cudf._lib.scalar cimport DeviceScalar from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns from cudf.api.types import is_list_dtype, is_struct_dtype from cudf.core.buffer import acquire_spill_lock -from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype def from_dlpack(dlpack_capsule): @@ -199,79 +182,3 @@ def from_arrow(object input_table): c_result = move(cpp_from_arrow(cpp_arrow_table.get()[0])) return columns_from_unique_ptr(move(c_result)) - - -@acquire_spill_lock() -def to_arrow_scalar(DeviceScalar source_scalar): - """Convert a scalar to a PyArrow scalar. - - Parameters - ---------- - source_scalar : the scalar to convert - - Returns - ------- - pyarrow.lib.Scalar - """ - cdef vector[column_metadata] cpp_metadata = gather_metadata( - [("", source_scalar.dtype)] - ) - cdef const scalar* source_scalar_ptr = source_scalar.get_raw_ptr() - - cdef shared_ptr[CScalar] cpp_arrow_scalar - with nogil: - cpp_arrow_scalar = cpp_to_arrow( - source_scalar_ptr[0], cpp_metadata[0] - ) - - return pyarrow_wrap_scalar(cpp_arrow_scalar) - - -@acquire_spill_lock() -def from_arrow_scalar(object input_scalar, output_dtype=None): - """Convert from PyArrow scalar to a cudf scalar. 
- - Parameters - ---------- - input_scalar : PyArrow scalar - output_dtype : output type to cast to, ignored except for decimals - - Returns - ------- - cudf._lib.DeviceScalar - """ - cdef shared_ptr[CScalar] cpp_arrow_scalar = ( - pyarrow_unwrap_scalar(input_scalar) - ) - cdef unique_ptr[scalar] c_result - - with nogil: - c_result = move(cpp_from_arrow(cpp_arrow_scalar.get()[0])) - - cdef type_id ctype = c_result.get().type().id() - if ctype == type_id.DECIMAL128: - if output_dtype is None: - # Decimals must be cast to the cudf dtype of the right width - raise ValueError( - "Decimal scalars must be constructed with a dtype" - ) - - if isinstance(output_dtype, Decimal32Dtype): - c_result.reset( - new fixed_point_scalar[decimal32]( - ( c_result.get()).value(), - scale_type(-input_scalar.type.scale), - c_result.get().is_valid() - ) - ) - elif isinstance(output_dtype, Decimal64Dtype): - c_result.reset( - new fixed_point_scalar[decimal64]( - ( c_result.get()).value(), - scale_type(-input_scalar.type.scale), - c_result.get().is_valid() - ) - ) - # Decimal128Dtype is a no-op, no conversion needed. - - return DeviceScalar.from_unique_ptr(move(c_result), output_dtype) diff --git a/python/cudf/cudf/_lib/join.pyx b/python/cudf/cudf/_lib/join.pyx index 416680aae24..378be978cc0 100644 --- a/python/cudf/cudf/_lib/join.pyx +++ b/python/cudf/cudf/_lib/join.pyx @@ -2,7 +2,7 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr +from libcpp.memory cimport make_unique, unique_ptr from libcpp.pair cimport pair from libcpp.utility cimport move @@ -11,7 +11,6 @@ from rmm._lib.device_buffer cimport device_buffer cimport cudf._lib.cpp.join as cpp_join from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column -from cudf._lib.cpp.libcpp.memory cimport make_unique from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport data_type, size_type, type_id from cudf._lib.utils cimport table_view_from_columns diff --git a/python/cudf/cudf/_lib/null_mask.pyx b/python/cudf/cudf/_lib/null_mask.pyx index 5b4538629f6..1f98140d9e4 100644 --- a/python/cudf/cudf/_lib/null_mask.pyx +++ b/python/cudf/cudf/_lib/null_mask.pyx @@ -6,13 +6,12 @@ from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer from cudf.core.buffer import acquire_spill_lock, as_buffer -from libcpp.memory cimport unique_ptr +from libcpp.memory cimport make_unique, unique_ptr from libcpp.pair cimport pair from libcpp.utility cimport move from cudf._lib.column cimport Column from cudf._lib.cpp.column.column_view cimport column_view -from cudf._lib.cpp.libcpp.memory cimport make_unique from cudf._lib.cpp.null_mask cimport ( bitmask_allocation_size_bytes as cpp_bitmask_allocation_size_bytes, bitmask_and as cpp_bitmask_and, diff --git a/python/cudf/cudf/_lib/nvtext/CMakeLists.txt b/python/cudf/cudf/_lib/nvtext/CMakeLists.txt index 515b9c1d6e4..d7cbdeb5bda 100644 --- a/python/cudf/cudf/_lib/nvtext/CMakeLists.txt +++ b/python/cudf/cudf/_lib/nvtext/CMakeLists.txt @@ -13,8 +13,8 @@ # ============================================================================= set(cython_sources - edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx ngrams_tokenize.pyx normalize.pyx - replace.pyx stemmer.pyx subword_tokenize.pyx tokenize.pyx + byte_pair_encode.pyx edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx + ngrams_tokenize.pyx normalize.pyx replace.pyx stemmer.pyx subword_tokenize.pyx tokenize.pyx ) set(linked_libraries cudf::cudf) rapids_cython_create_modules( 
@@ -22,3 +22,11 @@ rapids_cython_create_modules( SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX nvtext_ ASSOCIATED_TARGETS cudf ) +# TODO: Due to cudf's scalar.pyx needing to cimport pylibcudf's scalar.pyx (because there are parts +# of cudf Cython that need to directly access the c_obj underlying the pylibcudf Scalar) the +# requirement for arrow headers infects all of cudf. That in turn requires including numpy headers. +# These requirements will go away once all scalar-related Cython code is removed from cudf. +foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) + target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}") + target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}") +endforeach() diff --git a/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx b/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx new file mode 100644 index 00000000000..cfc76afa8a5 --- /dev/null +++ b/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx @@ -0,0 +1,50 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + + +from cudf.core.buffer import acquire_spill_lock + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +from cudf._lib.column cimport Column +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.nvtext.byte_pair_encode cimport ( + bpe_merge_pairs as cpp_bpe_merge_pairs, + byte_pair_encoding as cpp_byte_pair_encoding, + load_merge_pairs as cpp_load_merge_pairs, +) +from cudf._lib.cpp.scalar.scalar cimport string_scalar +from cudf._lib.scalar cimport DeviceScalar + + +cdef class BPEMergePairs: + cdef unique_ptr[cpp_bpe_merge_pairs] c_obj + + def __cinit__(self, Column merge_pairs): + cdef column_view c_pairs = merge_pairs.view() + with nogil: + self.c_obj = move(cpp_load_merge_pairs(c_pairs)) + + +@acquire_spill_lock() +def byte_pair_encoding( + Column strings, + BPEMergePairs merge_pairs, + object separator +): + cdef column_view c_strings = strings.view() + cdef DeviceScalar d_separator = separator.device_value + cdef const string_scalar* c_separator = d_separator\ + .get_raw_ptr() + cdef unique_ptr[column] c_result + with nogil: + c_result = move( + cpp_byte_pair_encoding( + c_strings, + merge_pairs.c_obj.get()[0], + c_separator[0] + ) + ) + + return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 85fd25cf1a9..4acb1ce10b1 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -32,7 +32,7 @@ from cudf._lib.utils import _index_level_name, generate_pandas_metadata from libc.stdint cimport uint8_t from libcpp cimport bool from libcpp.map cimport map -from libcpp.memory cimport unique_ptr +from libcpp.memory cimport make_unique, unique_ptr from libcpp.string cimport string from libcpp.unordered_map cimport unordered_map from libcpp.utility cimport move @@ -52,7 +52,6 @@ from cudf._lib.cpp.io.parquet cimport ( write_parquet as parquet_writer, ) from cudf._lib.cpp.io.types cimport column_in_metadata, table_input_metadata -from cudf._lib.cpp.libcpp.memory cimport make_unique from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport data_type, size_type from cudf._lib.io.datasource cimport NativeFileDatasource @@ -321,6 +320,8 @@ def write_parquet( object max_page_size_rows=None, object partitions_info=None, object force_nullable_schema=False, + header_version="1.0", + use_dictionary=True, ): """ Cython 
function to call into libcudf API, see `write_parquet`. @@ -383,6 +384,18 @@ def write_parquet( tmp_user_data[str.encode("pandas")] = str.encode(pandas_metadata) user_data.push_back(tmp_user_data) + if header_version not in ("1.0", "2.0"): + raise ValueError( + f"Invalid parquet header version: {header_version}. " + "Valid values are '1.0' and '2.0'" + ) + + dict_policy = ( + cudf_io_types.dictionary_policy.ALWAYS + if use_dictionary + else cudf_io_types.dictionary_policy.NEVER + ) + cdef cudf_io_types.compression_type comp_type = _get_comp_type(compression) cdef cudf_io_types.statistics_freq stat_freq = _get_stat_freq(statistics) @@ -399,6 +412,9 @@ def write_parquet( .compression(comp_type) .stats_level(stat_freq) .int96_timestamps(_int96_timestamps) + .write_v2_headers(header_version == "2.0") + .dictionary_policy(dict_policy) + .utc_timestamps(False) .build() ) if partitions_info is not None: diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index 0ce42dc43ff..5185b2d4bb5 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -12,10 +12,33 @@ # the License. # ============================================================================= -set(cython_sources column.pyx copying.pyx gpumemoryview.pyx table.pyx types.pyx utils.pyx) +set(cython_sources column.pyx copying.pyx gpumemoryview.pyx interop.pyx scalar.pyx table.pyx + types.pyx utils.pyx +) set(linked_libraries cudf::cudf) rapids_cython_create_modules( CXX SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_ ASSOCIATED_TARGETS cudf ) + +find_package(Python 3.9 REQUIRED COMPONENTS Interpreter) + +execute_process( + COMMAND "${Python_EXECUTABLE}" -c "import pyarrow; print(pyarrow.get_include())" + OUTPUT_VARIABLE PYARROW_INCLUDE_DIR + OUTPUT_STRIP_TRAILING_WHITESPACE +) + +foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) + target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}") +endforeach() + +# TODO: Clean up this include when switching to scikit-build-core. See cudf/_lib/CMakeLists.txt for +# more info +find_package(NumPy REQUIRED) +foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) + target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}") + # Switch to the line below when we switch back to FindPython.cmake in CMake 3.24. + # target_include_directories(${target} PRIVATE "${Python_NumPy_INCLUDE_DIRS}") +endforeach() diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index ba7822b0a54..7a35854392c 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -1,9 +1,10 @@ # Copyright (c) 2023, NVIDIA CORPORATION. # TODO: Verify consistent usage of relative/absolute imports in pylibcudf. -from . cimport copying +from . 
cimport copying, interop from .column cimport Column from .gpumemoryview cimport gpumemoryview +from .scalar cimport Scalar from .table cimport Table # TODO: cimport type_id once # https://github.com/cython/cython/issues/5609 is resolved @@ -12,7 +13,9 @@ from .types cimport DataType __all__ = [ "Column", "DataType", + "Scalar", "Table", "copying", "gpumemoryview", + "interop", ] diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py index 3edff9a53e8..72b74a57b87 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py @@ -1,16 +1,19 @@ # Copyright (c) 2023, NVIDIA CORPORATION. -from . import copying +from . import copying, interop from .column import Column from .gpumemoryview import gpumemoryview +from .scalar import Scalar from .table import Table from .types import DataType, TypeId __all__ = [ "Column", "DataType", + "Scalar", "Table", "TypeId", "copying", "gpumemoryview", + "interop", ] diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pxd b/python/cudf/cudf/_lib/pylibcudf/interop.pxd new file mode 100644 index 00000000000..3a79e5425d4 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/interop.pxd @@ -0,0 +1,9 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +from cudf._lib.cpp.interop cimport column_metadata + + +cdef class ColumnMetadata: + cdef public object name + cdef public object children_meta + cdef column_metadata to_libcudf(self) diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pyx b/python/cudf/cudf/_lib/pylibcudf/interop.pyx new file mode 100644 index 00000000000..0cdca275027 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/interop.pyx @@ -0,0 +1,23 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +from cudf._lib.cpp.interop cimport column_metadata + + +cdef class ColumnMetadata: + def __init__(self, name): + self.name = name + self.children_meta = [] + + cdef column_metadata to_libcudf(self): + """Convert to C++ column_metadata. + + Since this class is mutable and cheap, it is easier to create the C++ + object on the fly rather than have it directly backing the storage for + the Cython class. + """ + cdef column_metadata c_metadata + cdef ColumnMetadata child_meta + c_metadata.name = self.name.encode() + for child_meta in self.children_meta: + c_metadata.children_meta.push_back(child_meta.to_libcudf()) + return c_metadata diff --git a/python/cudf/cudf/_lib/pylibcudf/scalar.pxd b/python/cudf/cudf/_lib/pylibcudf/scalar.pxd new file mode 100644 index 00000000000..09d853d832f --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/scalar.pxd @@ -0,0 +1,32 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +from libcpp cimport bool +from libcpp.memory cimport unique_ptr +from pyarrow cimport lib as pa + +from rmm._lib.memory_resource cimport DeviceMemoryResource + +from cudf._lib.cpp.scalar.scalar cimport scalar + +from .interop cimport ColumnMetadata +from .types cimport DataType + + +cdef class Scalar: + cdef unique_ptr[scalar] c_obj + cdef DataType _data_type + + # Holds a reference to the DeviceMemoryResource used for allocation. + # Ensures the MR does not get destroyed before this DeviceBuffer. 
`mr` is + # needed for deallocation + cdef DeviceMemoryResource mr + + cdef const scalar* get(self) except * + + cpdef DataType type(self) + cpdef bool is_valid(self) + + @staticmethod + cdef Scalar from_libcudf(unique_ptr[scalar] libcudf_scalar, dtype=*) + + cpdef pa.Scalar to_arrow(self, ColumnMetadata metadata) diff --git a/python/cudf/cudf/_lib/pylibcudf/scalar.pyx b/python/cudf/cudf/_lib/pylibcudf/scalar.pyx new file mode 100644 index 00000000000..04f588bd3e6 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/scalar.pyx @@ -0,0 +1,133 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +from cython cimport no_gc_clear +from cython.operator cimport dereference +from libcpp.memory cimport shared_ptr, unique_ptr +from libcpp.utility cimport move +from pyarrow cimport lib as pa + +from rmm._lib.memory_resource cimport get_current_device_resource + +from cudf._lib.cpp.interop cimport ( + column_metadata, + from_arrow as cpp_from_arrow, + to_arrow as cpp_to_arrow, +) +from cudf._lib.cpp.scalar.scalar cimport fixed_point_scalar, scalar +from cudf._lib.cpp.wrappers.decimals cimport ( + decimal32, + decimal64, + decimal128, + scale_type, +) + +from .interop cimport ColumnMetadata +from .types cimport DataType, type_id + + +# The DeviceMemoryResource attribute could be released prematurely +# by the gc if the Scalar is in a reference cycle. Removing the tp_clear +# function with the no_gc_clear decoration prevents that. See +# https://github.com/rapidsai/rmm/pull/931 for details. +@no_gc_clear +cdef class Scalar: + """A scalar value in device memory.""" + # Unlike for columns, libcudf does not support scalar views. All APIs that + # accept scalar values accept references to the owning object rather than a + # special view type. As a result, pylibcudf.Scalar has a simpler structure + # than pylibcudf.Column because it can be a true wrapper around a libcudf + # column + + def __cinit__(self, *args, **kwargs): + self.mr = get_current_device_resource() + + def __init__(self, pa.Scalar value=None): + # TODO: This case is not something we really want to + # support, but it here for now to ease the transition of + # DeviceScalar. 
+ if value is not None: + raise ValueError("Scalar should be constructed with a factory") + + @staticmethod + def from_arrow(pa.Scalar value, DataType data_type=None): + # Allow passing a dtype, but only for the purpose of decimals for now + + cdef shared_ptr[pa.CScalar] cscalar = ( + pa.pyarrow_unwrap_scalar(value) + ) + cdef unique_ptr[scalar] c_result + + with nogil: + c_result = move(cpp_from_arrow(cscalar.get()[0])) + + cdef Scalar s = Scalar.from_libcudf(move(c_result)) + + if s.type().id() != type_id.DECIMAL128: + if data_type is not None: + raise ValueError( + "dtype may not be passed for non-decimal types" + ) + return s + + if data_type is None: + raise ValueError( + "Decimal scalars must be constructed with a dtype" + ) + + cdef type_id tid = data_type.id() + + if tid == type_id.DECIMAL32: + s.c_obj.reset( + new fixed_point_scalar[decimal32]( + ( s.c_obj.get()).value(), + scale_type(-value.type.scale), + s.c_obj.get().is_valid() + ) + ) + elif tid == type_id.DECIMAL64: + s.c_obj.reset( + new fixed_point_scalar[decimal64]( + ( s.c_obj.get()).value(), + scale_type(-value.type.scale), + s.c_obj.get().is_valid() + ) + ) + elif tid != type_id.DECIMAL128: + raise ValueError( + "Decimal scalars may only be cast to decimals" + ) + + return s + + cpdef pa.Scalar to_arrow(self, ColumnMetadata metadata): + cdef shared_ptr[pa.CScalar] c_result + cdef column_metadata c_metadata = metadata.to_libcudf() + + with nogil: + c_result = move(cpp_to_arrow(dereference(self.c_obj.get()), c_metadata)) + + return pa.pyarrow_wrap_scalar(c_result) + + cdef const scalar* get(self) except *: + return self.c_obj.get() + + cpdef DataType type(self): + """The type of data in the column.""" + return self._data_type + + cpdef bool is_valid(self): + """True if the scalar is valid, false if not""" + return self.get().is_valid() + + @staticmethod + cdef Scalar from_libcudf(unique_ptr[scalar] libcudf_scalar, dtype=None): + """Construct a Scalar object from a libcudf scalar. + + This method is for pylibcudf's functions to use to ingest outputs of + calling libcudf algorithms, and should generally not be needed by users + (even direct pylibcudf Cython users). + """ + cdef Scalar s = Scalar.__new__(Scalar) + s.c_obj.swap(libcudf_scalar) + s._data_type = DataType.from_libcudf(s.get().type()) + return s diff --git a/python/cudf/cudf/_lib/pylibcudf/table.pxd b/python/cudf/cudf/_lib/pylibcudf/table.pxd index 95f197b13eb..a9e2874232a 100644 --- a/python/cudf/cudf/_lib/pylibcudf/table.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/table.pxd @@ -1,6 +1,7 @@ # Copyright (c) 2023, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr +from pyarrow cimport lib as pa from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view @@ -16,3 +17,5 @@ cdef class Table: cdef Table from_libcudf(unique_ptr[table] libcudf_tbl) cpdef list columns(self) + + cpdef pa.Table to_arrow(self, list metadata) diff --git a/python/cudf/cudf/_lib/pylibcudf/table.pyx b/python/cudf/cudf/_lib/pylibcudf/table.pyx index 720f9815bd6..c41eb82e4a1 100644 --- a/python/cudf/cudf/_lib/pylibcudf/table.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/table.pyx @@ -1,15 +1,22 @@ # Copyright (c) 2023, NVIDIA CORPORATION. 
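The pylibcudf Scalar defined above takes over the arrow interop that the earlier hunks removed from cudf._lib.interop, including the decimal32/decimal64 narrowing. A small sketch of how it is driven, using only names introduced in this diff (assumes a built cudf so cudf._lib.pylibcudf is importable):

import decimal

import pyarrow as pa

from cudf._lib import pylibcudf as plc

# Plain scalars round-trip with no extra type information.
s = plc.Scalar.from_arrow(pa.scalar(42))
assert s.is_valid()
meta = plc.interop.ColumnMetadata("")      # unnamed, no children
assert s.to_arrow(meta).as_py() == 42

# Decimal scalars arrive from arrow as 128-bit values and must be narrowed
# explicitly; the scale is passed negated, following libcudf convention.
d = plc.Scalar.from_arrow(
    pa.scalar(decimal.Decimal("1.23")),
    plc.DataType(plc.TypeId.DECIMAL64, -2),
)
assert d.is_valid()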
from cython.operator cimport dereference -from libcpp.memory cimport unique_ptr +from libcpp.memory cimport shared_ptr, unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector +from pyarrow cimport lib as pa from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.interop cimport ( + column_metadata, + from_arrow as cpp_from_arrow, + to_arrow as cpp_to_arrow, +) from cudf._lib.cpp.table.table cimport table from .column cimport Column +from .interop cimport ColumnMetadata cdef class Table: @@ -60,3 +67,27 @@ cdef class Table: cpdef list columns(self): return self._columns + + @staticmethod + def from_arrow(pa.Table pyarrow_table): + cdef shared_ptr[pa.CTable] ctable = ( + pa.pyarrow_unwrap_table(pyarrow_table) + ) + cdef unique_ptr[table] c_result + + with nogil: + c_result = move(cpp_from_arrow(ctable.get()[0])) + + return Table.from_libcudf(move(c_result)) + + cpdef pa.Table to_arrow(self, list metadata): + cdef shared_ptr[pa.CTable] c_result + cdef vector[column_metadata] c_metadata + cdef ColumnMetadata meta + for meta in metadata: + c_metadata.push_back(meta.to_libcudf()) + + with nogil: + c_result = move(cpp_to_arrow(self.view(), c_metadata)) + + return pa.pyarrow_wrap_table(c_result) diff --git a/python/cudf/cudf/_lib/scalar.pxd b/python/cudf/cudf/_lib/scalar.pxd index 1deed60d67d..77733f59c3d 100644 --- a/python/cudf/cudf/_lib/scalar.pxd +++ b/python/cudf/cudf/_lib/scalar.pxd @@ -1,20 +1,19 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.memory cimport unique_ptr from rmm._lib.memory_resource cimport DeviceMemoryResource +# TODO: Would like to remove this cimport, but it will require some more work +# to excise all C code in scalar.pyx that relies on using the C API of the +# pylibcudf Scalar underlying the DeviceScalar. +from cudf._lib cimport pylibcudf from cudf._lib.cpp.scalar.scalar cimport scalar cdef class DeviceScalar: - cdef unique_ptr[scalar] c_value - - # Holds a reference to the DeviceMemoryResource used for allocation. - # Ensures the MR does not get destroyed before this DeviceBuffer. `mr` is - # needed for deallocation - cdef DeviceMemoryResource mr + cdef pylibcudf.Scalar c_value cdef object _dtype diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 5ab286c5701..0b64c75f7b6 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -1,7 +1,5 @@ # Copyright (c) 2020-2023, NVIDIA CORPORATION. 
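Table gains matching from_arrow/to_arrow entry points in the hunk above; to_arrow expects one ColumnMetadata per top-level column (with child metadata for nested types, as the gather_metadata helper added below constructs it). A round-trip sketch under the same assumptions as the Scalar example:

import pyarrow as pa

from cudf._lib import pylibcudf as plc

pa_table = pa.table({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})
tbl = plc.Table.from_arrow(pa_table)

# Flat columns need no child metadata, only their names.
metadata = [plc.interop.ColumnMetadata(name) for name in pa_table.column_names]
assert tbl.to_arrow(metadata).to_pydict() == pa_table.to_pydict()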
-cimport cython - import copy import numpy as np @@ -13,17 +11,17 @@ from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.utility cimport move -from rmm._lib.memory_resource cimport get_current_device_resource - import cudf +from cudf._lib import pylibcudf from cudf._lib.types import LIBCUDF_TO_SUPPORTED_NUMPY_TYPES -from cudf.core.dtypes import ListDtype, StructDtype +from cudf.core.dtypes import ( + ListDtype, + StructDtype, + is_list_dtype, + is_struct_dtype, +) from cudf.core.missing import NA, NaT -from cudf._lib.types cimport dtype_from_column_view, underlying_type_t_type_id - -from cudf._lib.interop import from_arrow_scalar, to_arrow_scalar - cimport cudf._lib.cpp.types as libcudf_types from cudf._lib.cpp.scalar.scalar cimport ( duration_scalar, @@ -44,6 +42,7 @@ from cudf._lib.cpp.wrappers.timestamps cimport ( timestamp_s, timestamp_us, ) +from cudf._lib.types cimport dtype_from_column_view, underlying_type_t_type_id def _replace_nested(obj, check, replacement): @@ -61,15 +60,44 @@ def _replace_nested(obj, check, replacement): _replace_nested(v, check, replacement) -# The DeviceMemoryResource attribute could be released prematurely -# by the gc if the DeviceScalar is in a reference cycle. Removing -# the tp_clear function with the no_gc_clear decoration prevents that. -# See https://github.com/rapidsai/rmm/pull/931 for details. -@cython.no_gc_clear +def gather_metadata(dtypes): + """Convert a dict of dtypes to a list of ColumnMetadata objects. + + The metadata is constructed recursively so that nested types are + represented as nested ColumnMetadata objects. + + Parameters + ---------- + dtypes : dict + A dict mapping column names to dtypes. + + Returns + ------- + List[ColumnMetadata] + A list of ColumnMetadata objects. + """ + out = [] + for name, dtype in dtypes.items(): + v = pylibcudf.interop.ColumnMetadata(name) + if is_struct_dtype(dtype): + v.children_meta = gather_metadata(dtype.fields) + elif is_list_dtype(dtype): + # Offsets column is unnamed and has no children + v.children_meta.append(pylibcudf.interop.ColumnMetadata("")) + v.children_meta.extend( + gather_metadata({"": dtype.element_type}) + ) + out.append(v) + return out + + cdef class DeviceScalar: + # TODO: I think this should be removable, except that currently the way + # that from_unique_ptr is implemented is probably dereferencing this in an + # invalid state. See what the best way to fix that is. def __cinit__(self, *args, **kwargs): - self.mr = get_current_device_resource() + self.c_value = pylibcudf.Scalar() def __init__(self, value, dtype): """ @@ -85,7 +113,7 @@ cdef class DeviceScalar: dtype : dtype A NumPy dtype. """ - self._dtype = dtype if dtype.kind != 'U' else cudf.dtype('object') + dtype = dtype if dtype.kind != 'U' else cudf.dtype('object') if cudf.utils.utils.is_na_like(value): value = None @@ -108,10 +136,17 @@ cdef class DeviceScalar: pa_scalar = pa.scalar(value, type=pa_type) - # Note: This factory-like behavior in __init__ will be removed when - # migrating to pylibcudf. 
- cdef DeviceScalar obj = from_arrow_scalar(pa_scalar, self._dtype) - self.c_value.swap(obj.c_value) + data_type = None + if isinstance(dtype, cudf.core.dtypes.DecimalDtype): + tid = pylibcudf.TypeId.DECIMAL128 + if isinstance(dtype, cudf.core.dtypes.Decimal32Dtype): + tid = pylibcudf.TypeId.DECIMAL32 + elif isinstance(dtype, cudf.core.dtypes.Decimal64Dtype): + tid = pylibcudf.TypeId.DECIMAL64 + data_type = pylibcudf.DataType(tid, -dtype.scale) + + self.c_value = pylibcudf.Scalar.from_arrow(pa_scalar, data_type) + self._dtype = dtype def _to_host_scalar(self): is_datetime = self.dtype.kind == "M" @@ -119,7 +154,8 @@ cdef class DeviceScalar: null_type = NaT if is_datetime or is_timedelta else NA - ps = to_arrow_scalar(self) + metadata = gather_metadata({"": self.dtype})[0] + ps = self.c_value.to_arrow(metadata) if not ps.is_valid: return null_type @@ -158,13 +194,13 @@ cdef class DeviceScalar: return self._to_host_scalar() cdef const scalar* get_raw_ptr(self) except *: - return self.c_value.get() + return self.c_value.c_obj.get() cpdef bool is_valid(self): """ Returns if the Scalar is valid or not(i.e., ). """ - return self.get_raw_ptr()[0].is_valid() + return self.c_value.is_valid() def __repr__(self): if cudf.utils.utils.is_na_like(self.value): @@ -183,7 +219,7 @@ cdef class DeviceScalar: cdef DeviceScalar s = DeviceScalar.__new__(DeviceScalar) cdef libcudf_types.data_type cdtype - s.c_value = move(ptr) + s.c_value = pylibcudf.Scalar.from_libcudf(move(ptr)) cdtype = s.get_raw_ptr()[0].type() if dtype is not None: @@ -310,9 +346,9 @@ def _create_proxy_nat_scalar(dtype): if dtype.char in 'mM': nat = dtype.type('NaT').astype(dtype) if dtype.type == np.datetime64: - _set_datetime64_from_np_scalar(result.c_value, nat, dtype, True) + _set_datetime64_from_np_scalar(result.c_value.c_obj, nat, dtype, True) elif dtype.type == np.timedelta64: - _set_timedelta64_from_np_scalar(result.c_value, nat, dtype, True) + _set_timedelta64_from_np_scalar(result.c_value.c_obj, nat, dtype, True) return result else: raise TypeError('NAT only valid for datetime and timedelta') diff --git a/python/cudf/cudf/_lib/strings/CMakeLists.txt b/python/cudf/cudf/_lib/strings/CMakeLists.txt index a5e87a456cb..fc11f047ab4 100644 --- a/python/cudf/cudf/_lib/strings/CMakeLists.txt +++ b/python/cudf/cudf/_lib/strings/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -40,6 +40,14 @@ rapids_cython_create_modules( SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX strings_ ASSOCIATED_TARGETS cudf ) +# TODO: Due to cudf's scalar.pyx needing to cimport pylibcudf's scalar.pyx (because there are parts +# of cudf Cython that need to directly access the c_obj underlying the pylibcudf Scalar) the +# requirement for arrow headers infects all of cudf. That requirement will go away once all +# scalar-related Cython code is removed from cudf. 
+foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) + target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}") + target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}") +endforeach() add_subdirectory(convert) add_subdirectory(split) diff --git a/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt b/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt index 434f79d3b5f..f55bb1fb780 100644 --- a/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt +++ b/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -22,3 +22,11 @@ rapids_cython_create_modules( SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX strings_ ASSOCIATED_TARGETS cudf ) +# TODO: Due to cudf's scalar.pyx needing to cimport pylibcudf's scalar.pyx (because there are parts +# of cudf Cython that need to directly access the c_obj underlying the pylibcudf Scalar) the +# requirement for arrow headers infects all of cudf. That requirement will go away once all +# scalar-related Cython code is removed from cudf. +foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) + target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}") + target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}") +endforeach() diff --git a/python/cudf/cudf/_lib/strings/split/CMakeLists.txt b/python/cudf/cudf/_lib/strings/split/CMakeLists.txt index 59a22c06e85..2f2063482af 100644 --- a/python/cudf/cudf/_lib/strings/split/CMakeLists.txt +++ b/python/cudf/cudf/_lib/strings/split/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -20,3 +20,11 @@ rapids_cython_create_modules( SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX strings_ ASSOCIATED_TARGETS cudf ) +# TODO: Due to cudf's scalar.pyx needing to cimport pylibcudf's scalar.pyx (because there are parts +# of cudf Cython that need to directly access the c_obj underlying the pylibcudf Scalar) the +# requirement for arrow headers infects all of cudf. That requirement will go away once all +# scalar-related Cython code is removed from cudf. +foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) + target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}") + target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}") +endforeach() diff --git a/python/cudf/cudf/_lib/timezone.pyx b/python/cudf/cudf/_lib/timezone.pyx index 4d76cbfcdb5..808d1321b0b 100644 --- a/python/cudf/cudf/_lib/timezone.pyx +++ b/python/cudf/cudf/_lib/timezone.pyx @@ -1,13 +1,13 @@ # Copyright (c) 2023, NVIDIA CORPORATION. 
from libcpp.memory cimport unique_ptr +from libcpp.optional cimport make_optional from libcpp.string cimport string from libcpp.utility cimport move from cudf._lib.cpp.io.timezone cimport ( make_timezone_transition_table as cpp_make_timezone_transition_table, ) -from cudf._lib.cpp.libcpp.optional cimport make_optional from cudf._lib.cpp.table.table cimport table from cudf._lib.utils cimport columns_from_unique_ptr diff --git a/python/cudf/cudf/_version.py b/python/cudf/cudf/_version.py new file mode 100644 index 00000000000..ecf6ddd8e3b --- /dev/null +++ b/python/cudf/cudf/_version.py @@ -0,0 +1,20 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import importlib.resources + +__version__ = ( + importlib.resources.files("cudf").joinpath("VERSION").read_text().strip() +) +__git_commit__ = "" diff --git a/python/cudf/cudf/core/buffer/spill_manager.py b/python/cudf/cudf/core/buffer/spill_manager.py index f056a0fd592..91f3b2cd544 100644 --- a/python/cudf/cudf/core/buffer/spill_manager.py +++ b/python/cudf/cudf/core/buffer/spill_manager.py @@ -11,14 +11,20 @@ import weakref from collections import defaultdict from dataclasses import dataclass +from functools import partial from typing import Dict, List, Optional, Tuple import rmm.mr from cudf.core.buffer.spillable_buffer import SpillableBuffer from cudf.options import get_option +from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate from cudf.utils.string import format_bytes +_spill_cudf_nvtx_annotate = partial( + _cudf_nvtx_annotate, domain="cudf_python-spill" +) + def get_traceback() -> str: """Pretty print current traceback to a string""" @@ -329,6 +335,7 @@ def buffers( ret = tuple(sorted(ret, key=lambda b: b.last_accessed)) return ret + @_spill_cudf_nvtx_annotate def spill_device_memory(self, nbytes: int) -> int: """Try to spill device memory diff --git a/python/cudf/cudf/core/buffer/spillable_buffer.py b/python/cudf/cudf/core/buffer/spillable_buffer.py index 84fb2044c62..1856bec1876 100644 --- a/python/cudf/cudf/core/buffer/spillable_buffer.py +++ b/python/cudf/cudf/core/buffer/spillable_buffer.py @@ -20,6 +20,7 @@ get_ptr_and_size, host_memory_allocation, ) +from cudf.utils.nvtx_annotation import _get_color_for_nvtx, annotate from cudf.utils.string import format_bytes if TYPE_CHECKING: @@ -291,8 +292,15 @@ def spill(self, target: str = "cpu") -> None: ) if (ptr_type, target) == ("gpu", "cpu"): - host_mem = host_memory_allocation(self.size) - rmm._lib.device_buffer.copy_ptr_to_host(self._ptr, host_mem) + with annotate( + message="SpillDtoH", + color=_get_color_for_nvtx("SpillDtoH"), + domain="cudf_python-spill", + ): + host_mem = host_memory_allocation(self.size) + rmm._lib.device_buffer.copy_ptr_to_host( + self._ptr, host_mem + ) self._ptr_desc["memoryview"] = host_mem self._ptr = 0 self._owner = None @@ -302,9 +310,15 @@ def spill(self, target: str = "cpu") -> None: # trigger a new call to this buffer's `spill()`. 
# Therefore, it is important that spilling-on-demand doesn't # try to unspill an already locked buffer! - dev_mem = rmm.DeviceBuffer.to_device( - self._ptr_desc.pop("memoryview") - ) + with annotate( + message="SpillHtoD", + color=_get_color_for_nvtx("SpillHtoD"), + domain="cudf_python-spill", + ): + + dev_mem = rmm.DeviceBuffer.to_device( + self._ptr_desc.pop("memoryview") + ) self._ptr = dev_mem.ptr self._owner = dev_mem assert self._size == dev_mem.size diff --git a/python/cudf/cudf/core/byte_pair_encoding.py b/python/cudf/cudf/core/byte_pair_encoding.py new file mode 100644 index 00000000000..4c881022ecf --- /dev/null +++ b/python/cudf/cudf/core/byte_pair_encoding.py @@ -0,0 +1,59 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +from __future__ import annotations + +import cudf +from cudf._lib.nvtext.byte_pair_encode import ( + BPEMergePairs as cpp_merge_pairs, + byte_pair_encoding as cpp_byte_pair_encoding, +) + + +class BytePairEncoder: + """ + Given a merge pairs strings series, performs byte pair encoding on + a strings series using the provided separator. + + Parameters + ---------- + merges_pairs : str + Strings column of merge pairs + + Returns + ------- + BytePairEncoder + """ + + def __init__(self, merges_pair: "cudf.Series"): + self.merge_pairs = cpp_merge_pairs(merges_pair._column) + + def __call__(self, text, separator: str = " "): + """ + + Parameters + ---------- + text : cudf string series + The strings to be encoded. + + Returns + ------- + Encoded strings + + Examples + -------- + >>> import cudf + >>> from cudf.core.byte_pair_encoding import BytePairEncoder + >>> mps = cudf.Series(["e n", "i t", "i s", "e s", "en t", + ... "c e", "es t", "en ce", "T h", "Th is", + ... "t est", "s ent", "t h", "th is"]) + >>> bpe = BytePairEncoder(mps) + >>> str_series = cudf.Series(['This is the sentence', 'thisisit']) + >>> bpe(str_series) + 0 This is a sent ence + 1 this is it + dtype: object + """ + sep = cudf.Scalar(separator, dtype="str") + result = cpp_byte_pair_encoding(text._column, self.merge_pairs, sep) + + return cudf.Series(result) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index a5e99abd79e..b4f65693d85 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2102,7 +2102,10 @@ def as_column( elif isinstance(arbitrary, (pd.Timestamp, pd.Timedelta)): # This will always treat NaTs as nulls since it's not technically a # discrete value like NaN - data = as_column(pa.array(pd.Series([arbitrary]), from_pandas=True)) + length = length or 1 + data = as_column( + pa.array(pd.Series([arbitrary] * length), from_pandas=True) + ) if dtype is not None: data = data.astype(dtype) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index e3d4b20f141..16eead6ea81 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -97,11 +97,8 @@ min_scalar_type, numeric_normalize_types, ) -from cudf.utils.utils import ( - GetAttrGetItemMixin, - _cudf_nvtx_annotate, - _external_only_api, -) +from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +from cudf.utils.utils import GetAttrGetItemMixin, _external_only_api _cupy_nan_methods_map = { "min": "nanmin", @@ -6495,6 +6492,8 @@ def to_parquet( max_page_size_rows=None, storage_options=None, return_metadata=False, + use_dictionary=True, + header_version="1.0", *args, **kwargs, ): @@ -6519,6 +6518,8 @@ def to_parquet( max_page_size_rows=max_page_size_rows, storage_options=storage_options, 
return_metadata=return_metadata, + use_dictionary=use_dictionary, + header_version=header_version, *args, **kwargs, ) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index f7329d459e9..b2f0651d576 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -46,11 +46,8 @@ from cudf.utils import ioutils from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import can_convert_to_column, find_common_type -from cudf.utils.utils import ( - _array_ufunc, - _cudf_nvtx_annotate, - _warn_no_dask_cudf, -) +from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +from cudf.utils.utils import _array_ufunc, _warn_no_dask_cudf # TODO: It looks like Frame is missing a declaration of `copy`, need to add diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 4b715e962e7..c48e5109ff2 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -29,7 +29,8 @@ from cudf.core.mixins import Reducible, Scannable from cudf.core.multiindex import MultiIndex from cudf.core.udf.groupby_utils import _can_be_jitted, jit_groupby_apply -from cudf.utils.utils import GetAttrGetItemMixin, _cudf_nvtx_annotate +from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +from cudf.utils.utils import GetAttrGetItemMixin # The three functions below return the quantiles [25%, 50%, 75%] diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 51a7e9dfe8e..9f0c66a5c74 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -66,11 +66,8 @@ is_mixed_with_object_dtype, numeric_normalize_types, ) -from cudf.utils.utils import ( - _cudf_nvtx_annotate, - _warn_no_dask_cudf, - search_range, -) +from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +from cudf.utils.utils import _warn_no_dask_cudf, search_range def _lexsorted_equal_range( diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index fef62594fb8..4211a8c24bf 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -69,7 +69,8 @@ ) from cudf.utils import docutils from cudf.utils._numba import _CUDFNumbaConfig -from cudf.utils.utils import _cudf_nvtx_annotate, _warn_no_dask_cudf +from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +from cudf.utils.utils import _warn_no_dask_cudf doc_reset_index_template = """ Reset the index of the {klass}, or a level of it. 
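The import rewrites in the files above and in the files that follow all track one refactor: `_cudf_nvtx_annotate` (and its dask variant) moved out of `cudf.utils.utils` into the new `cudf/utils/nvtx_annotation.py` module added later in this diff. A minimal usage sketch under that assumption; the decorated function below is a hypothetical example, not cudf code:

    # Hypothetical usage sketch, not part of the diff: the decorator is the one
    # defined in the new cudf/utils/nvtx_annotation.py added further down.
    from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate

    @_cudf_nvtx_annotate
    def example_read(path):
        # Emits an NVTX range named after the function's qualified name in the
        # default "cudf_python" domain, so the call is visible when profiling
        # with Nsight Systems.
        ...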
diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 6a6e37180ca..b94f8f583f4 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -203,6 +203,7 @@ def perform_merge(self) -> cudf.DataFrame: if left_rows is not None else cudf.DataFrame._from_data({}) ) + del left_rows right_result = ( self.rhs._gather( GatherMap.from_column_unchecked( @@ -213,7 +214,7 @@ def perform_merge(self) -> cudf.DataFrame: if right_rows is not None else cudf.DataFrame._from_data({}) ) - + del right_rows result = cudf.DataFrame._from_data( *self._merge_results(left_result, right_result) ) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index b4bbd0a8c3c..d0c8a513686 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -26,12 +26,8 @@ from cudf.core._compat import PANDAS_GE_150 from cudf.core.frame import Frame from cudf.core.index import BaseIndex, _lexsorted_equal_range, as_index -from cudf.utils.utils import ( - NotIterable, - _cudf_nvtx_annotate, - _external_only_api, - _is_same_name, -) +from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +from cudf.utils.utils import NotIterable, _external_only_api, _is_same_name def _maybe_indices_to_slice(indices: cp.ndarray) -> Union[slice, cp.ndarray]: diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 6fa5a8fd44b..04a7ed3abf7 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -92,7 +92,7 @@ is_mixed_with_object_dtype, to_cudf_compatible_scalar, ) -from cudf.utils.utils import _cudf_nvtx_annotate +from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate def _format_percentile_names(percentiles): diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index d35762c8481..73464238dd4 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -20,7 +20,8 @@ ) from cudf.core.column import ColumnBase, as_column from cudf.core.frame import Frame -from cudf.utils.utils import NotIterable, _cudf_nvtx_annotate +from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +from cudf.utils.utils import NotIterable class SingleColumnFrame(Frame, NotIterable): diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py index b18720f5db5..5dbcf455e33 100644 --- a/python/cudf/cudf/core/udf/groupby_utils.py +++ b/python/cudf/cudf/core/udf/groupby_utils.py @@ -28,7 +28,7 @@ _supported_dtypes_from_frame, ) from cudf.utils._numba import _CUDFNumbaConfig -from cudf.utils.utils import _cudf_nvtx_annotate +from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate def _get_frame_groupby_type(dtype, index_dtype): diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py index 35a3f6c1ffd..7b7ac2b3070 100644 --- a/python/cudf/cudf/core/udf/utils.py +++ b/python/cudf/cudf/core/udf/utils.py @@ -39,7 +39,8 @@ STRING_TYPES, TIMEDELTA_TYPES, ) -from cudf.utils.utils import _cudf_nvtx_annotate, initfunc +from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +from cudf.utils.utils import initfunc # Maximum size of a string column is 2 GiB _STRINGS_UDF_DEFAULT_HEAP_SIZE = os.environ.get( diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index bacc0641639..764885dd7b6 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -11,7 +11,7 @@ from cudf.api.types import 
is_scalar from cudf.utils import ioutils from cudf.utils.dtypes import _maybe_convert_to_default_type -from cudf.utils.utils import _cudf_nvtx_annotate +from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index f51952d23bf..d135a31438e 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -5,7 +5,6 @@ import pyarrow as pa from fsspec.utils import stringify_path -from pyarrow import orc as orc import cudf from cudf._lib import orc as liborc @@ -17,6 +16,8 @@ def _make_empty_df(filepath_or_buffer, columns): + from pyarrow import orc + orc_file = orc.ORCFile(filepath_or_buffer) schema = orc_file.schema col_names = schema.names if columns is None else columns @@ -150,6 +151,7 @@ def _parse_column_statistics(cs, column_statistics_blob): @ioutils.doc_read_orc_metadata() def read_orc_metadata(path): """{docstring}""" + from pyarrow import orc orc_file = orc.ORCFile(path) @@ -380,6 +382,7 @@ def read_orc( ) ) else: + from pyarrow import orc def read_orc_stripe(orc_file, stripe, columns): pa_table = orc_file.read_stripe(stripe, columns) diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 81021a5d578..bcc24a85cf9 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -15,14 +15,14 @@ import numpy as np import pandas as pd -from pyarrow import dataset as ds, parquet as pq +from pyarrow import dataset as ds import cudf from cudf._lib import parquet as libparquet from cudf.api.types import is_list_like from cudf.core.column import build_categorical_column, column_empty, full from cudf.utils import ioutils -from cudf.utils.utils import _cudf_nvtx_annotate +from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate BYTE_SIZES = { "kb": 1000, @@ -66,6 +66,8 @@ def _write_parquet( partitions_info=None, storage_options=None, force_nullable_schema=False, + header_version="1.0", + use_dictionary=True, ): if is_list_like(paths) and len(paths) > 1: if partitions_info is None: @@ -96,6 +98,8 @@ def _write_parquet( "max_page_size_rows": max_page_size_rows, "partitions_info": partitions_info, "force_nullable_schema": force_nullable_schema, + "header_version": header_version, + "use_dictionary": use_dictionary, } if all(ioutils.is_fsspec_open_file(buf) for buf in paths_or_bufs): with ExitStack() as stack: @@ -204,7 +208,6 @@ def write_to_dataset( fs.mkdirs(root_path, exist_ok=True) if partition_cols is not None and len(partition_cols) > 0: - ( full_paths, metadata_file_paths, @@ -266,6 +269,7 @@ def write_to_dataset( @_cudf_nvtx_annotate def read_parquet_metadata(path): """{docstring}""" + import pyarrow.parquet as pq pq_file = pq.ParquetFile(path) @@ -303,7 +307,9 @@ def _process_dataset( # Convert filters to ds.Expression if filters is not None: - filters = pq.filters_to_expression(filters) + from pyarrow.parquet import filters_to_expression + + filters = filters_to_expression(filters) # Initialize ds.FilesystemDataset # TODO: Remove the if len(paths) workaround after following bug is fixed: @@ -709,7 +715,6 @@ def _parquet_to_frame( dataset_kwargs=None, **kwargs, ): - # If this is not a partitioned read, only need # one call to `_read_parquet` if not partition_keys: @@ -753,7 +758,7 @@ def _parquet_to_frame( ) ) # Add partition columns to the last DataFrame - for (name, value) in part_key: + for name, value in part_key: _len = len(dfs[-1]) if partition_categories and name in partition_categories: # Build the categorical column from 
`codes` @@ -866,6 +871,8 @@ def to_parquet( storage_options=None, return_metadata=False, force_nullable_schema=False, + header_version="1.0", + use_dictionary=True, *args, **kwargs, ): @@ -940,9 +947,13 @@ def to_parquet( partitions_info=partition_info, storage_options=storage_options, force_nullable_schema=force_nullable_schema, + header_version=header_version, + use_dictionary=use_dictionary, ) else: + import pyarrow.parquet as pq + if partition_offsets is not None: warnings.warn( "partition_offsets will be ignored when engine is not cudf" @@ -1040,7 +1051,6 @@ def _get_groups_and_offsets( preserve_index=False, **kwargs, ): - if not (set(df._data) - set(partition_cols)): warnings.warn("No data left to save outside partition columns") diff --git a/python/cudf/cudf/io/text.py b/python/cudf/cudf/io/text.py index eb2c7fa7ef6..0e19972f6e0 100644 --- a/python/cudf/cudf/io/text.py +++ b/python/cudf/cudf/io/text.py @@ -1,11 +1,11 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. from io import BytesIO, StringIO import cudf from cudf._lib import text as libtext from cudf.utils import ioutils -from cudf.utils.utils import _cudf_nvtx_annotate +from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/pandas/__main__.py b/python/cudf/cudf/pandas/__main__.py index 02e8e960678..fb8569fa1d0 100644 --- a/python/cudf/cudf/pandas/__main__.py +++ b/python/cudf/cudf/pandas/__main__.py @@ -33,7 +33,7 @@ def profile(function_profile, line_profile, fn): elif function_profile: with Profiler() as profiler: yield fn - profiler.print_per_func_stats() + profiler.print_per_function_stats() else: yield fn diff --git a/python/cudf/cudf/pandas/module_accelerator.py b/python/cudf/cudf/pandas/module_accelerator.py index eb35c4adaaf..180d75d96e8 100644 --- a/python/cudf/cudf/pandas/module_accelerator.py +++ b/python/cudf/cudf/pandas/module_accelerator.py @@ -10,6 +10,7 @@ import importlib.abc import importlib.machinery import os +import pathlib import sys import threading import warnings @@ -554,9 +555,10 @@ def getattr_real_or_wrapped( frame = sys._getframe() # We cannot possibly be at the top level. 
assert frame.f_back - calling_module = frame.f_back.f_code.co_filename + calling_module = pathlib.PurePath(frame.f_back.f_code.co_filename) use_real = any( - calling_module.startswith(path) for path in loader._denylist + calling_module.is_relative_to(path) + for path in loader._denylist ) try: if use_real: diff --git a/python/cudf/cudf/tests/data/parquet/bad_dict.parquet b/python/cudf/cudf/tests/data/parquet/bad_dict.parquet new file mode 100644 index 00000000000..5008ac0b22b Binary files /dev/null and b/python/cudf/cudf/tests/data/parquet/bad_dict.parquet differ diff --git a/python/cudf/cudf/tests/data/parquet/delta_byte_arr.parquet b/python/cudf/cudf/tests/data/parquet/delta_byte_arr.parquet new file mode 100644 index 00000000000..7f6006a75bf Binary files /dev/null and b/python/cudf/cudf/tests/data/parquet/delta_byte_arr.parquet differ diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index db0446d506c..0546638f388 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -193,12 +193,15 @@ def test_column_mixed_dtype(data, error): @pytest.mark.parametrize("nan_as_null", [True, False]) -def test_as_column_scalar_with_nan(nan_as_null): - size = 10 - scalar = np.nan - +@pytest.mark.parametrize( + "scalar", + [np.nan, pd.Timedelta(days=1), pd.Timestamp(2020, 1, 1)], + ids=repr, +) +@pytest.mark.parametrize("size", [1, 10]) +def test_as_column_scalar_with_nan(nan_as_null, scalar, size): expected = ( - cudf.Series([np.nan] * size, nan_as_null=nan_as_null) + cudf.Series([scalar] * size, nan_as_null=nan_as_null) .dropna() .to_numpy() ) diff --git a/python/cudf/cudf/tests/test_decimal.py b/python/cudf/cudf/tests/test_decimal.py index e4b2af90448..0745e5aba48 100644 --- a/python/cudf/cudf/tests/test_decimal.py +++ b/python/cudf/cudf/tests/test_decimal.py @@ -6,6 +6,7 @@ import numpy as np import pyarrow as pa import pytest +from packaging import version import cudf from cudf.core.column import Decimal32Column, Decimal64Column, NumericalColumn @@ -91,7 +92,15 @@ def test_from_arrow_max_precision_decimal32(): "to_dtype", [Decimal64Dtype(7, 2), Decimal64Dtype(11, 4), Decimal64Dtype(18, 9)], ) -def test_typecast_from_float_to_decimal(data, from_dtype, to_dtype): +def test_typecast_from_float_to_decimal(request, data, from_dtype, to_dtype): + request.applymarker( + pytest.mark.xfail( + condition=version.parse(pa.__version__) >= version.parse("13.0.0") + and from_dtype == np.dtype("float32") + and to_dtype.precision > 7, + reason="https://github.com/rapidsai/cudf/issues/14169", + ) + ) got = data.astype(from_dtype) pa_arr = got.to_arrow().cast( diff --git a/python/cudf/cudf/tests/test_mvc.py b/python/cudf/cudf/tests/test_mvc.py new file mode 100644 index 00000000000..7dd25ebc500 --- /dev/null +++ b/python/cudf/cudf/tests/test_mvc.py @@ -0,0 +1,99 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. 
+import subprocess +import sys + +import pytest + +IS_CUDA_11 = False +IS_CUDA_12 = False +try: + from ptxcompiler.patch import safe_get_versions +except ModuleNotFoundError: + from cudf.utils._ptxcompiler import safe_get_versions + +# do not test cuda 12 if pynvjitlink isn't present +HAVE_PYNVJITLINK = False +try: + import pynvjitlink # noqa: F401 + + HAVE_PYNVJITLINK = True +except ModuleNotFoundError: + pass + + +versions = safe_get_versions() +driver_version, runtime_version = versions + +if (11, 0) <= driver_version < (12, 0): + IS_CUDA_11 = True +if (12, 0) <= driver_version < (13, 0): + IS_CUDA_12 = True + + +TEST_BODY = """ +@numba.cuda.jit +def test_kernel(x): + id = numba.cuda.grid(1) + if id < len(x): + x[id] += 1 + +s = cudf.Series([1, 2, 3]) +with _CUDFNumbaConfig(): + test_kernel.forall(len(s))(s) +""" + +CUDA_11_TEST = ( + """ +import numba.cuda +import cudf +from cudf.utils._numba import _CUDFNumbaConfig, patch_numba_linker_cuda_11 + + +patch_numba_linker_cuda_11() +""" + + TEST_BODY +) + + +CUDA_12_TEST = ( + """ +import numba.cuda +import cudf +from cudf.utils._numba import _CUDFNumbaConfig +from pynvjitlink.patch import ( + patch_numba_linker as patch_numba_linker_pynvjitlink, +) + +patch_numba_linker_pynvjitlink() +""" + + TEST_BODY +) + + +@pytest.mark.parametrize( + "test", + [ + pytest.param( + CUDA_11_TEST, + marks=pytest.mark.skipif( + not IS_CUDA_11, + reason="Minor Version Compatibility test for CUDA 11", + ), + ), + pytest.param( + CUDA_12_TEST, + marks=pytest.mark.skipif( + not IS_CUDA_12 or not HAVE_PYNVJITLINK, + reason="Minor Version Compatibility test for CUDA 12", + ), + ), + ], +) +def test_numba_mvc(test): + cp = subprocess.run( + [sys.executable, "-c", test], + capture_output=True, + cwd="/", + ) + + assert cp.returncode == 0 diff --git a/python/cudf/cudf/tests/test_no_cuinit.py b/python/cudf/cudf/tests/test_no_cuinit.py index b142b0dab33..45d812fe9a2 100644 --- a/python/cudf/cudf/tests/test_no_cuinit.py +++ b/python/cudf/cudf/tests/test_no_cuinit.py @@ -66,6 +66,7 @@ def test_cudf_import_no_cuinit(cuda_gdb): env=env, capture_output=True, text=True, + cwd="/", ) cuInit_called = output.stdout.find("in cuInit ()") diff --git a/python/cudf/cudf/tests/test_numba_import.py b/python/cudf/cudf/tests/test_numba_import.py deleted file mode 100644 index 238a32a94fa..00000000000 --- a/python/cudf/cudf/tests/test_numba_import.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. 
-import subprocess -import sys - -import pytest - -IS_CUDA_11 = False -try: - from ptxcompiler.patch import NO_DRIVER, safe_get_versions - - versions = safe_get_versions() - if versions != NO_DRIVER: - driver_version, runtime_version = versions - if driver_version < (12, 0): - IS_CUDA_11 = True -except ModuleNotFoundError: - pass - -TEST_NUMBA_MVC_ENABLED = """ -import numba.cuda -import cudf -from cudf.utils._numba import _CUDFNumbaConfig, _patch_numba_mvc - - -_patch_numba_mvc() - -@numba.cuda.jit -def test_kernel(x): - id = numba.cuda.grid(1) - if id < len(x): - x[id] += 1 - -s = cudf.Series([1, 2, 3]) -with _CUDFNumbaConfig(): - test_kernel.forall(len(s))(s) -""" - - -@pytest.mark.skipif( - not IS_CUDA_11, reason="Minor Version Compatibility test for CUDA 11" -) -def test_numba_mvc_enabled_cuda_11(): - cp = subprocess.run( - [sys.executable, "-c", TEST_NUMBA_MVC_ENABLED], - capture_output=True, - cwd="/", - ) - assert cp.returncode == 0 diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 07aa5430f4f..7407da9c4ac 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -10,8 +10,6 @@ import numpy as np import pandas as pd import pyarrow as pa -import pyarrow.orc -import pyorc import pytest import cudf @@ -150,9 +148,11 @@ def test_orc_reader_trailing_nulls(datadir): ["TestOrcFile.testDate1900.orc", "TestOrcFile.testDate2038.orc"], ) def test_orc_reader_datetimestamp(datadir, inputfile, use_index): + from pyarrow import orc + path = datadir / inputfile try: - orcfile = pa.orc.ORCFile(path) + orcfile = orc.ORCFile(path) except pa.ArrowIOError as e: pytest.skip(".orc file is not found: %s" % e) @@ -295,28 +295,29 @@ def test_orc_read_rows(datadir, skiprows, num_rows): def test_orc_read_skiprows(): buff = BytesIO() - data = [ - True, - False, - True, - False, - None, - True, - True, - True, - False, - None, - False, - False, - True, - True, - True, - True, - ] - writer = pyorc.Writer(buff, pyorc.Struct(a=pyorc.Boolean())) - writer.writerows([(d,) for d in data]) - writer.close() - + df = pd.DataFrame( + { + "a": [ + True, + False, + True, + False, + None, + True, + True, + True, + False, + None, + False, + False, + True, + True, + True, + True, + ] + } + ) + df.to_orc(buff) # testing 10 skiprows due to a boolean specific bug fix that didn't # repro for other sizes of data skiprows = 10 @@ -605,6 +606,8 @@ def normalized_equals(value1, value2): @pytest.mark.parametrize("stats_freq", ["STRIPE", "ROWGROUP"]) @pytest.mark.parametrize("nrows", [1, 100, 6000000]) def test_orc_write_statistics(tmpdir, datadir, nrows, stats_freq): + from pyarrow import orc + supported_stat_types = supported_numpy_dtypes + ["str"] # Can't write random bool columns until issue #6763 is fixed if nrows == 6000000: @@ -623,7 +626,7 @@ def test_orc_write_statistics(tmpdir, datadir, nrows, stats_freq): gdf.to_orc(fname.strpath, statistics=stats_freq) # Read back written ORC's statistics - orc_file = pa.orc.ORCFile(fname) + orc_file = orc.ORCFile(fname) ( file_stats, stripes_stats, @@ -677,6 +680,8 @@ def test_orc_write_statistics(tmpdir, datadir, nrows, stats_freq): @pytest.mark.parametrize("stats_freq", ["STRIPE", "ROWGROUP"]) @pytest.mark.parametrize("nrows", [2, 100, 6000000]) def test_orc_chunked_write_statistics(tmpdir, datadir, nrows, stats_freq): + from pyarrow import orc + np.random.seed(0) supported_stat_types = supported_numpy_dtypes + ["str"] # Can't write random bool columns until issue #6763 is fixed @@ -729,7 +734,7 @@ def 
test_orc_chunked_write_statistics(tmpdir, datadir, nrows, stats_freq): expect = cudf.DataFrame(pd.concat([pdf1, pdf2]).reset_index(drop=True)) # Read back written ORC's statistics - orc_file = pa.orc.ORCFile(gdf_fname) + orc_file = orc.ORCFile(gdf_fname) ( file_stats, stripes_stats, @@ -782,6 +787,8 @@ def test_orc_chunked_write_statistics(tmpdir, datadir, nrows, stats_freq): @pytest.mark.parametrize("nrows", [1, 100, 6000000]) def test_orc_write_bool_statistics(tmpdir, datadir, nrows): + from pyarrow import orc + # Make a dataframe gdf = cudf.DataFrame({"col_bool": gen_rand_series("bool", nrows)}) fname = tmpdir.join("gdf.orc") @@ -790,7 +797,7 @@ def test_orc_write_bool_statistics(tmpdir, datadir, nrows): gdf.to_orc(fname.strpath) # Read back written ORC's statistics - orc_file = pa.orc.ORCFile(fname) + orc_file = orc.ORCFile(fname) ( file_stats, stripes_stats, @@ -978,44 +985,12 @@ def test_orc_string_stream_offset_issue(): assert_eq(df, cudf.read_orc(buffer)) -# Data is generated using pyorc module def generate_list_struct_buff(size=100_000): rd = random.Random(1) np.random.seed(seed=1) buff = BytesIO() - schema = { - "lvl3_list": pyorc.Array(pyorc.Array(pyorc.Array(pyorc.BigInt()))), - "lvl1_list": pyorc.Array(pyorc.BigInt()), - "lvl1_struct": pyorc.Struct( - **{"a": pyorc.BigInt(), "b": pyorc.BigInt()} - ), - "lvl2_struct": pyorc.Struct( - **{ - "a": pyorc.BigInt(), - "lvl1_struct": pyorc.Struct( - **{"c": pyorc.BigInt(), "d": pyorc.BigInt()} - ), - } - ), - "list_nests_struct": pyorc.Array( - pyorc.Array( - pyorc.Struct(**{"a": pyorc.BigInt(), "b": pyorc.BigInt()}) - ) - ), - "struct_nests_list": pyorc.Struct( - **{ - "struct": pyorc.Struct( - **{"a": pyorc.BigInt(), "b": pyorc.BigInt()} - ), - "list": pyorc.Array(pyorc.BigInt()), - } - ), - } - - schema = pyorc.Struct(**schema) - lvl3_list = [ rd.choice( [ @@ -1024,50 +999,57 @@ def generate_list_struct_buff(size=100_000): [ [ rd.choice([None, np.random.randint(1, 3)]) - for z in range(np.random.randint(1, 3)) + for _ in range(np.random.randint(1, 3)) ] - for z in range(np.random.randint(0, 3)) + for _ in range(np.random.randint(0, 3)) ] - for y in range(np.random.randint(0, 3)) + for _ in range(np.random.randint(0, 3)) ], ] ) - for x in range(size) + for _ in range(size) ] lvl1_list = [ [ rd.choice([None, np.random.randint(0, 3)]) - for y in range(np.random.randint(1, 4)) + for _ in range(np.random.randint(1, 4)) ] - for x in range(size) + for _ in range(size) ] lvl1_struct = [ - rd.choice([None, (np.random.randint(0, 3), np.random.randint(0, 3))]) - for x in range(size) + rd.choice( + [ + None, + {"a": np.random.randint(0, 3), "b": np.random.randint(0, 3)}, + ] + ) + for _ in range(size) ] lvl2_struct = [ rd.choice( [ None, - ( - rd.choice([None, np.random.randint(0, 3)]), - ( - rd.choice([None, np.random.randint(0, 3)]), - np.random.randint(0, 3), - ), - ), + {"a": rd.choice([None, np.random.randint(0, 3)])}, + { + "lvl1_struct": { + "c": rd.choice([None, np.random.randint(0, 3)]), + "d": np.random.randint(0, 3), + }, + }, ] ) - for x in range(size) + for _ in range(size) ] list_nests_struct = [ [ - [rd.choice(lvl1_struct), rd.choice(lvl1_struct)] - for y in range(np.random.randint(1, 4)) + {"a": rd.choice(lvl1_struct), "b": rd.choice(lvl1_struct)} + for _ in range(np.random.randint(1, 4)) ] - for x in range(size) + for _ in range(size) + ] + struct_nests_list = [ + {"struct": lvl1_struct[x], "list": lvl1_list[x]} for x in range(size) ] - struct_nests_list = [(lvl1_struct[x], lvl1_list[x]) for x in range(size)] df = 
pd.DataFrame( { @@ -1080,15 +1062,7 @@ def generate_list_struct_buff(size=100_000): } ) - writer = pyorc.Writer(buff, schema, stripe_size=1024) - tuples = list( - map( - lambda x: (None,) if x[0] is pd.NA else x, - list(df.itertuples(index=False, name=None)), - ) - ) - writer.writerows(tuples) - writer.close() + df.to_orc(buff, engine="pyarrow", engine_kwargs={"stripe_size": 1024}) return buff @@ -1109,6 +1083,8 @@ def list_struct_buff(): @pytest.mark.parametrize("num_rows", [0, 15, 1005, 10561, 100_000]) @pytest.mark.parametrize("use_index", [True, False]) def test_lists_struct_nests(columns, num_rows, use_index, list_struct_buff): + from pyarrow import orc + gdf = cudf.read_orc( list_struct_buff, columns=columns, @@ -1116,7 +1092,7 @@ def test_lists_struct_nests(columns, num_rows, use_index, list_struct_buff): use_index=use_index, ) - pyarrow_tbl = pyarrow.orc.ORCFile(list_struct_buff).read() + pyarrow_tbl = orc.ORCFile(list_struct_buff).read() pyarrow_tbl = ( pyarrow_tbl[:num_rows] @@ -1155,111 +1131,96 @@ def test_pyspark_struct(datadir): def gen_map_buff(size=10000): from string import ascii_letters as al + from pyarrow import orc + rd = random.Random(1) np.random.seed(seed=1) buff = BytesIO() - schema = { - "lvl1_map": pyorc.Map(key=pyorc.String(), value=pyorc.BigInt()), - "lvl2_map": pyorc.Map( - key=pyorc.String(), value=pyorc.Array(pyorc.BigInt()) - ), - "lvl2_struct_map": pyorc.Map( - key=pyorc.String(), - value=pyorc.Struct(**{"a": pyorc.BigInt(), "b": pyorc.BigInt()}), - ), - } - - schema = pyorc.Struct(**schema) - - lvl1_map = [ - rd.choice( - [ - None, - [ - ( - rd.choice(al), - rd.choice([None, np.random.randint(1, 1500)]), - ) - for y in range(2) - ], - ] - ) - for x in range(size) - ] - lvl2_map = [ - rd.choice( - [ - None, + lvl1_map = pa.array( + [ + rd.choice( [ - ( - rd.choice(al), - rd.choice( - [ - None, - [ - rd.choice( - [None, np.random.randint(1, 1500)] - ) - for z in range(5) - ], - ] + None, + { + rd.choice(al): rd.choice( + [None, np.random.randint(1, 1500)] ), - ) - for y in range(2) - ], - ] - ) - for x in range(size) - ] - lvl2_struct_map = [ - rd.choice( - [ - None, + }, + ] + ) + for _ in range(size) + ], + type=pa.map_(pa.string(), pa.int64()), + ) + lvl2_map = pa.array( + [ + rd.choice( [ - ( - rd.choice(al), - rd.choice( - [ - None, - ( - rd.choice( - [None, np.random.randint(1, 1500)] - ), - rd.choice( - [None, np.random.randint(1, 1500)] - ), - ), - ] - ), - ) - for y in range(2) - ], - ] - ) - for x in range(size) - ] - - pdf = pd.DataFrame( - { - "lvl1_map": lvl1_map, - "lvl2_map": lvl2_map, - "lvl2_struct_map": lvl2_struct_map, - } + None, + *( + { + rd.choice(al): rd.choice( + [ + None, + [ + rd.choice( + [None, np.random.randint(1, 1500)] + ) + for _ in range(5) + ], + ] + ) + } + for _ in range(2) + ), + ] + ) + for _ in range(size) + ], + type=pa.map_(pa.string(), pa.list_(pa.int64())), ) - writer = pyorc.Writer( - buff, schema, stripe_size=1024, compression=pyorc.CompressionKind.NONE + lvl2_struct_map = pa.array( + [ + rd.choice( + [ + None, + *( + { + rd.choice(al): rd.choice( + [ + None, + { + "a": rd.choice( + [None, np.random.randint(1, 1500)] + ), + "b": rd.choice( + [None, np.random.randint(1, 1500)] + ), + }, + ] + ) + } + for _ in range(2) + ), + ] + ) + for _ in range(size) + ], + type=pa.map_( + pa.string(), pa.struct({"a": pa.int64(), "b": pa.int64()}) + ), ) - tuples = list( - map( - lambda x: (None,) if x[0] is pd.NA else x, - list(pdf.itertuples(index=False, name=None)), - ) + + pa_table = pa.Table.from_arrays( + [lvl1_map, 
lvl2_map, lvl2_struct_map], + ["lvl1_map", "lvl2_map", "lvl2_struct_map"], ) - writer.writerows(tuples) - writer.close() + orc.write_table( + pa_table, buff, stripe_size=1024, compression="UNCOMPRESSED" + ) return buff @@ -1274,7 +1235,9 @@ def gen_map_buff(size=10000): @pytest.mark.parametrize("num_rows", [0, 15, 1005, 10561, 100000]) @pytest.mark.parametrize("use_index", [True, False]) def test_map_type_read(columns, num_rows, use_index): - tbl = pa.orc.ORCFile(map_buff).read() + from pyarrow import orc + + tbl = orc.read_table(map_buff) lvl1_map = ( tbl["lvl1_map"] @@ -1460,18 +1423,22 @@ def test_writer_timestamp_stream_size(datadir, tmpdir): ], ) def test_no_row_group_index_orc_read(datadir, fname): + from pyarrow import orc + fpath = datadir / fname - expect = pa.orc.ORCFile(fpath).read() + expect = orc.ORCFile(fpath).read() got = cudf.read_orc(fpath) assert expect.equals(got.to_arrow()) def test_names_in_struct_dtype_nesting(datadir): + from pyarrow import orc + fname = datadir / "TestOrcFile.NestedStructDataFrame.orc" - expect = pa.orc.ORCFile(fname).read() + expect = orc.ORCFile(fname).read() got = cudf.read_orc(fname) # test dataframes @@ -1483,12 +1450,14 @@ def test_names_in_struct_dtype_nesting(datadir): def test_writer_lists_structs(list_struct_buff): + from pyarrow import orc + df_in = cudf.read_orc(list_struct_buff) buff = BytesIO() df_in.to_orc(buff) - pyarrow_tbl = pyarrow.orc.ORCFile(buff).read() + pyarrow_tbl = orc.ORCFile(buff).read() assert pyarrow_tbl.equals(df_in.to_arrow()) @@ -1527,12 +1496,10 @@ def test_statistics_sum_overflow(): minint64 = np.iinfo(np.int64).min buff = BytesIO() - with pyorc.Writer( - buff, - pyorc.Struct(a=pyorc.BigInt(), b=pyorc.BigInt(), c=pyorc.BigInt()), - ) as writer: - writer.write((maxint64, minint64, minint64)) - writer.write((1, -1, 1)) + df = pd.DataFrame( + {"a": [maxint64, 1], "b": [minint64, -1], "c": [minint64, 1]} + ) + df.to_orc(buff) file_stats, stripe_stats = cudf.io.orc.read_orc_statistics([buff]) assert file_stats[0]["a"].get("sum") is None @@ -1545,22 +1512,24 @@ def test_statistics_sum_overflow(): def test_empty_statistics(): + from pyarrow import orc + buff = BytesIO() - orc_schema = pyorc.Struct( - a=pyorc.BigInt(), - b=pyorc.Double(), - c=pyorc.String(), - d=pyorc.Decimal(11, 2), - e=pyorc.Date(), - f=pyorc.Timestamp(), - g=pyorc.Boolean(), - h=pyorc.Binary(), - i=pyorc.BigInt(), - # One column with non null value, else cudf/pyorc readers crash + pa_table = pa.Table.from_arrays( + [ + pa.array([None], type=pa.int64()), + pa.array([None], type=pa.float64()), + pa.array([None], type=pa.string()), + pa.array([None], type=pa.decimal128(11, 2)), + pa.array([None], type=pa.timestamp("ns")), + pa.array([None], type=pa.date64()), + pa.array([None], type=pa.bool_()), + pa.array([None], type=pa.binary()), + pa.array([1], type=pa.int64()), + ], + ["a", "b", "c", "d", "e", "f", "g", "h", "i"], ) - data = tuple([None] * (len(orc_schema.fields) - 1) + [1]) - with pyorc.Writer(buff, orc_schema) as writer: - writer.write(data) + orc.write_table(pa_table, buff) got = cudf.io.orc.read_orc_statistics([buff]) @@ -1615,6 +1584,8 @@ def test_select_nested(list_struct_buff, equivalent_columns): def test_orc_writer_rle_stream_size(datadir, tmpdir): + from pyarrow import orc + original = datadir / "TestOrcFile.int16.rle.size.orc" reencoded = tmpdir.join("int16_map.orc") @@ -1622,7 +1593,7 @@ def test_orc_writer_rle_stream_size(datadir, tmpdir): df.to_orc(reencoded) # Segfaults when RLE stream sizes don't account for varint length - pa_out = 
pa.orc.ORCFile(reencoded).read() + pa_out = orc.ORCFile(reencoded).read() assert df.to_arrow().equals(pa_out) @@ -1642,11 +1613,13 @@ def test_empty_columns(): def test_orc_reader_zstd_compression(list_struct_buff): + from pyarrow import orc + expected = cudf.read_orc(list_struct_buff) # save with ZSTD compression buffer = BytesIO() - pyarrow_tbl = pyarrow.orc.ORCFile(list_struct_buff).read() - writer = pyarrow.orc.ORCWriter(buffer, compression="zstd") + pyarrow_tbl = orc.ORCFile(list_struct_buff).read() + writer = orc.ORCWriter(buffer, compression="zstd") writer.write(pyarrow_tbl) writer.close() try: @@ -1845,10 +1818,7 @@ def negative_timestamp_df(): @pytest.mark.parametrize("engine", ["cudf", "pyarrow"]) def test_orc_reader_negative_timestamp(negative_timestamp_df, engine): buffer = BytesIO() - pyorc_table = pa.Table.from_pandas( - negative_timestamp_df.to_pandas(), preserve_index=False - ) - pyarrow.orc.write_table(pyorc_table, buffer) + negative_timestamp_df.to_orc(buffer) # We warn the user that this function will fall back to the CPU for reading # when the engine is pyarrow. @@ -1859,11 +1829,13 @@ def test_orc_reader_negative_timestamp(negative_timestamp_df, engine): def test_orc_writer_negative_timestamp(negative_timestamp_df): + from pyarrow import orc + buffer = BytesIO() negative_timestamp_df.to_orc(buffer) assert_eq(negative_timestamp_df, pd.read_orc(buffer)) - assert_eq(negative_timestamp_df, pyarrow.orc.ORCFile(buffer).read()) + assert_eq(negative_timestamp_df, orc.ORCFile(buffer).read()) def test_orc_reader_apache_negative_timestamp(datadir): diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 9349e8c216f..af4d0294293 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -1280,49 +1280,120 @@ def test_parquet_reader_v2(tmpdir, simple_pdf): simple_pdf.to_parquet(pdf_fname, data_page_version="2.0") assert_eq(cudf.read_parquet(pdf_fname), simple_pdf) + cudf.from_pandas(simple_pdf).to_parquet(pdf_fname, header_version="2.0") + assert_eq(cudf.read_parquet(pdf_fname), simple_pdf) + + +def test_parquet_delta_byte_array(datadir): + fname = datadir / "delta_byte_arr.parquet" + assert_eq(cudf.read_parquet(fname), pd.read_parquet(fname)) + + +def delta_num_rows(): + return [1, 2, 23, 32, 33, 34, 64, 65, 66, 128, 129, 130, 20000, 50000] + @pytest.mark.parametrize("nrows", [1, 100000]) @pytest.mark.parametrize("add_nulls", [True, False]) -def test_delta_binary(nrows, add_nulls, tmpdir): +@pytest.mark.parametrize( + "dtype", + [ + "int8", + "int16", + "int32", + "int64", + ], +) +def test_delta_binary(nrows, add_nulls, dtype, tmpdir): null_frequency = 0.25 if add_nulls else 0 # Create a pandas dataframe with random data of mixed types arrow_table = dg.rand_dataframe( dtypes_meta=[ { - "dtype": "int8", - "null_frequency": null_frequency, - "cardinality": nrows, - }, - { - "dtype": "int16", + "dtype": dtype, "null_frequency": null_frequency, "cardinality": nrows, }, + ], + rows=nrows, + seed=0, + use_threads=False, + ) + # Roundabout conversion to pandas to preserve nulls/data types + cudf_table = cudf.DataFrame.from_arrow(arrow_table) + test_pdf = cudf_table.to_pandas(nullable=True) + pdf_fname = tmpdir.join("pdfv2.parquet") + test_pdf.to_parquet( + pdf_fname, + version="2.6", + column_encoding="DELTA_BINARY_PACKED", + data_page_version="2.0", + data_page_size=64 * 1024, + engine="pyarrow", + use_dictionary=False, + ) + cdf = cudf.read_parquet(pdf_fname) + pcdf = cudf.from_pandas(test_pdf) + 
assert_eq(cdf, pcdf) + + # Write back out with cudf and make sure pyarrow can read it + cudf_fname = tmpdir.join("cudfv2.parquet") + pcdf.to_parquet( + cudf_fname, + compression=None, + header_version="2.0", + use_dictionary=False, + ) + + # FIXME(ets): should probably not use more bits than the data type + try: + cdf2 = cudf.from_pandas(pd.read_parquet(cudf_fname)) + except OSError as e: + if dtype == "int32" and nrows == 100000: + pytest.mark.xfail( + reason="arrow does not support 33-bit delta encoding" + ) + else: + raise e + else: + assert_eq(cdf2, cdf) + + +@pytest.mark.parametrize("nrows", delta_num_rows()) +@pytest.mark.parametrize("add_nulls", [True, False]) +@pytest.mark.parametrize("str_encoding", ["DELTA_BYTE_ARRAY"]) +def test_delta_byte_array_roundtrip(nrows, add_nulls, str_encoding, tmpdir): + null_frequency = 0.25 if add_nulls else 0 + + # Create a pandas dataframe with random data of mixed lengths + test_pdf = dg.rand_dataframe( + dtypes_meta=[ { - "dtype": "int32", + "dtype": "str", "null_frequency": null_frequency, "cardinality": nrows, + "max_string_length": 10, }, { - "dtype": "int64", + "dtype": "str", "null_frequency": null_frequency, "cardinality": nrows, + "max_string_length": 100, }, ], rows=nrows, seed=0, use_threads=False, - ) - # Roundabout conversion to pandas to preserve nulls/data types - cudf_table = cudf.DataFrame.from_arrow(arrow_table) - test_pdf = cudf_table.to_pandas(nullable=True) - pdf_fname = tmpdir.join("pdfv2.parquet") + ).to_pandas() + + pdf_fname = tmpdir.join("pdfdeltaba.parquet") test_pdf.to_parquet( pdf_fname, version="2.6", - column_encoding="DELTA_BINARY_PACKED", + column_encoding=str_encoding, data_page_version="2.0", + data_page_size=64 * 1024, engine="pyarrow", use_dictionary=False, ) @@ -1331,6 +1402,58 @@ def test_delta_binary(nrows, add_nulls, tmpdir): assert_eq(cdf, pcdf) +@pytest.mark.parametrize("nrows", delta_num_rows()) +@pytest.mark.parametrize("add_nulls", [True, False]) +@pytest.mark.parametrize("str_encoding", ["DELTA_BYTE_ARRAY"]) +def test_delta_struct_list(tmpdir, nrows, add_nulls, str_encoding): + # Struct> + lists_per_row = 3 + list_size = 4 + num_rows = nrows + include_validity = add_nulls + + def list_gen_wrapped(x, y): + return list_row_gen( + int_gen, x * list_size * lists_per_row, list_size, lists_per_row + ) + + def string_list_gen_wrapped(x, y): + return list_row_gen( + string_gen, + x * list_size * lists_per_row, + list_size, + lists_per_row, + include_validity, + ) + + data = struct_gen( + [int_gen, string_gen, list_gen_wrapped, string_list_gen_wrapped], + 0, + num_rows, + include_validity, + ) + test_pdf = pa.Table.from_pydict({"sol": data}).to_pandas() + pdf_fname = tmpdir.join("pdfdeltaba.parquet") + test_pdf.to_parquet( + pdf_fname, + version="2.6", + column_encoding={ + "sol.col0": "DELTA_BINARY_PACKED", + "sol.col1": str_encoding, + "sol.col2.list.element.list.element": "DELTA_BINARY_PACKED", + "sol.col3.list.element.list.element": str_encoding, + }, + data_page_version="2.0", + data_page_size=64 * 1024, + engine="pyarrow", + use_dictionary=False, + ) + # sanity check to verify file is written properly + assert_eq(test_pdf, pd.read_parquet(pdf_fname)) + cdf = cudf.read_parquet(pdf_fname) + assert_eq(cdf, cudf.from_pandas(test_pdf)) + + @pytest.mark.parametrize( "data", [ @@ -1464,7 +1587,6 @@ def test_parquet_writer_int96_timestamps(tmpdir, pdf, gdf): def test_multifile_parquet_folder(tmpdir): - test_pdf1 = make_pdf(nrows=10, nvalids=10 // 2) test_pdf2 = make_pdf(nrows=20) expect = pd.concat([test_pdf1, 
test_pdf2]) @@ -2825,6 +2947,14 @@ def test_parquet_reader_unsupported_page_encoding(datadir): cudf.read_parquet(fname) +def test_parquet_reader_detect_bad_dictionary(datadir): + fname = datadir / "bad_dict.parquet" + + # expect a failure when reading the whole file + with pytest.raises(RuntimeError): + cudf.read_parquet(fname) + + @pytest.mark.parametrize("data", [{"a": [1, 2, 3, 4]}, {"b": [1, None, 2, 3]}]) @pytest.mark.parametrize("force_nullable_schema", [True, False]) def test_parquet_writer_schema_nullability(data, force_nullable_schema): diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py index d54a2eabf22..b92f84b677c 100644 --- a/python/cudf/cudf/tests/test_s3.py +++ b/python/cudf/cudf/tests/test_s3.py @@ -533,3 +533,18 @@ def test_write_chunked_parquet(s3_base, s3so): actual.sort_values(["b"]).reset_index(drop=True), cudf.concat([df1, df2]).sort_values(["b"]).reset_index(drop=True), ) + + +def test_no_s3fs_on_cudf_import(): + import subprocess + import sys + + output = subprocess.check_output( + [ + sys.executable, + "-c", + "import cudf; import sys; print('pyarrow._s3fs' in sys.modules)", + ], + cwd="/", + ) + assert output.strip() == b"False" diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index e565df8f3da..2dccd583b23 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -7,6 +7,7 @@ import pytest import cudf +from cudf.core.byte_pair_encoding import BytePairEncoder from cudf.core.tokenize_vocabulary import TokenizeVocabulary from cudf.testing._utils import assert_eq @@ -1024,3 +1025,43 @@ def test_jaccard_index_random_strings(): actual = str1.str.jaccard_index(str2, jaccard_width) assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "separator, input, results", + [ + (" ", "thetestsentence", "the test sent ence"), + ("_", "sentenceistest", "sent_ence_is_test"), + ("$", "istestsentencehere", "is$test$sent$ence$he$r$e"), + ], +) +def test_byte_pair_encoding(separator, input, results): + pairs_table = cudf.Series( + [ + "t he", + "h e", + "e n", + "i t", + "i s", + "e s", + "en t", + "c e", + "es t", + "en ce", + "t h", + "h i", + "th is", + "t est", + "s i", + "s ent", + ] + ) + encoder = BytePairEncoder(pairs_table) + + strings = cudf.Series([input, None, "", input]) + + expected = cudf.Series([results, None, "", results]) + + actual = encoder(strings, separator) + assert type(expected) == type(actual) + assert_eq(expected, actual) diff --git a/python/cudf/cudf/utils/_numba.py b/python/cudf/cudf/utils/_numba.py index 09afb5680bd..bc0d6f37d89 100644 --- a/python/cudf/cudf/utils/_numba.py +++ b/python/cudf/cudf/utils/_numba.py @@ -7,6 +7,19 @@ from numba import config as numba_config +try: + from pynvjitlink.patch import ( + patch_numba_linker as patch_numba_linker_pynvjitlink, + ) +except ImportError: + + def patch_numba_linker_pynvjitlink(): + warnings.warn( + "CUDA Toolkit is newer than CUDA driver. " + "Numba features will not work in this configuration. 
" + ) + + CC_60_PTX_FILE = os.path.join( os.path.dirname(__file__), "../core/udf/shim_60.ptx" ) @@ -65,7 +78,7 @@ def _get_ptx_file(path, prefix): return regular_result[1] -def _patch_numba_mvc(): +def patch_numba_linker_cuda_11(): # Enable the config option for minor version compatibility numba_config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY = 1 @@ -106,29 +119,19 @@ def _setup_numba(): versions = safe_get_versions() if versions != NO_DRIVER: driver_version, runtime_version = versions - if driver_version >= (12, 0) and runtime_version > driver_version: - warnings.warn( - f"Using CUDA toolkit version {runtime_version} with CUDA " - f"driver version {driver_version} requires minor version " - "compatibility, which is not yet supported for CUDA " - "driver versions 12.0 and above. It is likely that many " - "cuDF operations will not work in this state. Please " - f"install CUDA toolkit version {driver_version} to " - "continue using cuDF." - ) - else: - # Support MVC for all CUDA versions in the 11.x range - ptx_toolkit_version = _get_cuda_version_from_ptx_file( - CC_60_PTX_FILE - ) - # Numba thinks cubinlinker is only needed if the driver is older - # than the CUDA runtime, but when PTX files are present, it might - # also need to patch because those PTX files may be compiled by - # a CUDA version that is newer than the driver as well - if (driver_version < ptx_toolkit_version) or ( - driver_version < runtime_version - ): - _patch_numba_mvc() + ptx_toolkit_version = _get_cuda_version_from_ptx_file(CC_60_PTX_FILE) + + # MVC is required whenever any PTX is newer than the driver + # This could be the shipped PTX file or the PTX emitted by + # the version of NVVM on the user system, the latter aligning + # with the runtime version + if (driver_version < ptx_toolkit_version) or ( + driver_version < runtime_version + ): + if driver_version < (12, 0): + patch_numba_linker_cuda_11() + else: + patch_numba_linker_pynvjitlink() def _get_cuda_version_from_ptx_file(path): @@ -171,6 +174,8 @@ def _get_cuda_version_from_ptx_file(path): "7.8": (11, 8), "8.0": (12, 0), "8.1": (12, 1), + "8.2": (12, 2), + "8.3": (12, 3), } cuda_ver = ver_map.get(version) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 91925bf3c0c..6641bd8290a 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -13,7 +13,6 @@ import pandas as pd from fsspec.core import get_fs_token_paths from pyarrow import PythonFile as ArrowPythonFile -from pyarrow.fs import FSSpecHandler, PyFileSystem from pyarrow.lib import NativeFile from cudf.utils.docutils import docfmt_partial @@ -288,6 +287,14 @@ include the file path metadata (relative to `root_path`). To request metadata binary blob when using with ``partition_cols``, Pass ``return_metadata=True`` instead of specifying ``metadata_file_path`` +use_dictionary : bool, default True + When ``False``, prevents the use of dictionary encoding for Parquet page + data. When ``True``, dictionary encoding is preferred when not disabled due + to dictionary size constraints. +header_version : {{'1.0', '2.0'}}, default "1.0" + Controls whether to use version 1.0 or version 2.0 page headers when + encoding. Version 1.0 is more portable, but version 2.0 enables the + use of newer encoding schemes. force_nullable_schema : bool, default False. If True, writes all columns as `null` in schema. 
If False, columns are written as `null` if they contain null values, @@ -1630,6 +1637,15 @@ def _open_remote_files( for path, rgs in zip(paths, row_groups) ] + # Avoid top-level pyarrow.fs import. + # Importing pyarrow.fs initializes a S3 SDK with a finalizer + # that runs atexit. In some circumstances it appears this + # runs a call into a logging system that is already shutdown. + # To avoid this, we only import this subsystem if it is + # really needed. + # See https://github.com/aws/aws-sdk-cpp/issues/2681 + from pyarrow.fs import FSSpecHandler, PyFileSystem + # Default open - Use pyarrow filesystem API pa_fs = PyFileSystem(FSSpecHandler(fs)) return [ diff --git a/python/cudf/cudf/utils/nvtx_annotation.py b/python/cudf/cudf/utils/nvtx_annotation.py new file mode 100644 index 00000000000..a4404e51232 --- /dev/null +++ b/python/cudf/cudf/utils/nvtx_annotation.py @@ -0,0 +1,30 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +import hashlib +from functools import partial + +from nvtx import annotate + +_NVTX_COLORS = ["green", "blue", "purple", "rapids"] + + +def _get_color_for_nvtx(name): + m = hashlib.sha256() + m.update(name.encode()) + hash_value = int(m.hexdigest(), 16) + idx = hash_value % len(_NVTX_COLORS) + return _NVTX_COLORS[idx] + + +def _cudf_nvtx_annotate(func, domain="cudf_python"): + """Decorator for applying nvtx annotations to methods in cudf.""" + return annotate( + message=func.__qualname__, + color=_get_color_for_nvtx(func.__qualname__), + domain=domain, + )(func) + + +_dask_cudf_nvtx_annotate = partial( + _cudf_nvtx_annotate, domain="dask_cudf_python" +) diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index ffc3c29c996..ec5693e14d2 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -2,16 +2,13 @@ import decimal import functools -import hashlib import os import traceback import warnings -from functools import partial from typing import FrozenSet, Set, Union import numpy as np import pandas as pd -from nvtx import annotate import rmm @@ -120,8 +117,6 @@ def _array_ufunc(obj, ufunc, method, inputs, kwargs): "__ge__", } -_NVTX_COLORS = ["green", "blue", "purple", "rapids"] - # The test root is set by pytest to support situations where tests are run from # a source tree on a built version of cudf. NO_EXTERNAL_ONLY_APIS = os.getenv("NO_EXTERNAL_ONLY_APIS") @@ -343,28 +338,6 @@ def is_na_like(obj): return obj is None or obj is cudf.NA or obj is cudf.NaT -def _get_color_for_nvtx(name): - m = hashlib.sha256() - m.update(name.encode()) - hash_value = int(m.hexdigest(), 16) - idx = hash_value % len(_NVTX_COLORS) - return _NVTX_COLORS[idx] - - -def _cudf_nvtx_annotate(func, domain="cudf_python"): - """Decorator for applying nvtx annotations to methods in cudf.""" - return annotate( - message=func.__qualname__, - color=_get_color_for_nvtx(func.__qualname__), - domain=domain, - )(func) - - -_dask_cudf_nvtx_annotate = partial( - _cudf_nvtx_annotate, domain="dask_cudf_python" -) - - def _warn_no_dask_cudf(fn): @functools.wraps(fn) def wrapper(self): diff --git a/python/cudf/cudf_pandas_tests/data/profile_basic.py b/python/cudf/cudf_pandas_tests/data/profile_basic.py new file mode 100644 index 00000000000..f7b4ba89ce7 --- /dev/null +++ b/python/cudf/cudf_pandas_tests/data/profile_basic.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. 
+ +import pandas as pd + +df = pd.DataFrame( + { + "size": [10, 11, 12, 10, 11, 12, 10, 6, 11, 10], + "total_bill": [100, 200, 100, 200, 100, 100, 200, 50, 10, 560], + } +) +df["size"].value_counts() +df.groupby("size").total_bill.mean() +df.apply(list, axis=1) diff --git a/python/cudf/cudf_pandas_tests/test_profiler.py b/python/cudf/cudf_pandas_tests/test_profiler.py index a947d67b724..4921446ab6b 100644 --- a/python/cudf/cudf_pandas_tests/test_profiler.py +++ b/python/cudf/cudf_pandas_tests/test_profiler.py @@ -2,6 +2,9 @@ # All rights reserved. # SPDX-License-Identifier: Apache-2.0 +import os +import subprocess + from cudf.pandas import LOADED, Profiler if not LOADED: @@ -68,3 +71,41 @@ def test_profiler_fast_slow_name_mismatch(): with Profiler(): df = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]}) df.iloc[0, 1] = "foo" + + +def test_profiler_commandline(): + data_directory = os.path.dirname(os.path.abspath(__file__)) + # Create a copy of the current environment variables + env = os.environ.copy() + # Setting the 'COLUMNS' environment variable to a large number + # because the terminal output shouldn't be compressed for + # text validations below. + env["COLUMNS"] = "10000" + + sp_completed = subprocess.run( + [ + "python", + "-m", + "cudf.pandas", + "--profile", + data_directory + "/data/profile_basic.py", + ], + capture_output=True, + text=True, + env=env, + ) + assert sp_completed.returncode == 0 + output = sp_completed.stdout + + for string in [ + "Total time", + "Stats", + "Function", + "GPU ncalls", + "GPU cumtime", + "GPU percall", + "CPU ncalls", + "CPU cumtime", + "CPU percall", + ]: + assert string in output diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index a1fec83c1b9..b38970271d7 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -4,12 +4,12 @@ build-backend = "setuptools.build_meta" requires = [ "cmake>=3.26.4", - "cython>=3.0.0", + "cython>=3.0.3", "ninja", "numpy>=1.21,<1.25", "protoc-wheel", - "pyarrow==12.0.1.*", - "rmm==23.10.*", + "pyarrow==14.0.1.*", + "rmm==23.12.*", "scikit-build>=0.13.1", "setuptools", "wheel", @@ -17,7 +17,7 @@ requires = [ [project] name = "cudf" -version = "23.10.00" +dynamic = ["version"] description = "cuDF - GPU Dataframe" readme = { file = "README.md", content-type = "text/markdown" } authors = [ @@ -38,9 +38,9 @@ dependencies = [ "pandas>=1.3,<1.6.0dev0", "protobuf>=4.21,<5", "ptxcompiler", - "pyarrow==12.*", + "pyarrow>=14.0.1,<15.0.0a0", "rich", - "rmm==23.10.*", + "rmm==23.12.*", "typing_extensions>=4.0.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
classifiers = [ @@ -60,7 +60,6 @@ test = [ "hypothesis", "mimesis>=4.1.0", "msgpack", - "pyorc", "pytest", "pytest-benchmark", "pytest-cases", @@ -127,6 +126,9 @@ Documentation = "https://docs.rapids.ai/api/cudf/stable/" [tool.setuptools] license-files = ["LICENSE"] +[tool.setuptools.dynamic] +version = {file = "cudf/VERSION"} + [tool.isort] line_length = 79 multi_line_output = 3 diff --git a/python/cudf/setup.py b/python/cudf/setup.py index 96b91b4ccc0..984cd63a7c9 100644 --- a/python/cudf/setup.py +++ b/python/cudf/setup.py @@ -6,6 +6,8 @@ packages = find_packages(include=["cudf*", "udf_cpp*"]) setup( packages=packages, - package_data={key: ["*.pxd", "*.hpp", "*.cuh"] for key in packages}, + package_data={ + key: ["VERSION", "*.pxd", "*.hpp", "*.cuh"] for key in packages + }, zip_safe=False, ) diff --git a/python/cudf_kafka/CMakeLists.txt b/python/cudf_kafka/CMakeLists.txt new file mode 100644 index 00000000000..d55c3fdc076 --- /dev/null +++ b/python/cudf_kafka/CMakeLists.txt @@ -0,0 +1,47 @@ +# ============================================================================= +# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR) + +set(cudf_kafka_version 23.12.00) + +include(../../fetch_rapids.cmake) + +project( + cudf-kafka-python + VERSION ${cudf_kafka_version} + LANGUAGES # TODO: Building Python extension modules via the python_extension_module requires the C + # language to be enabled here. The test project that is built in scikit-build to verify + # various linking options for the python library is hardcoded to build with C, so until + # that is fixed we need to keep C. + C CXX +) + +find_package(cudf_kafka ${cudf_kafka_version} REQUIRED) + +if(NOT cudf_kafka_FOUND) + message( + FATAL_ERROR + "cudf_kafka package not found. cudf_kafka C++ is required to build this Python package." 
+ ) +endif() + +include(rapids-cython) +rapids_cython_init() + +add_subdirectory(cudf_kafka/_lib) + +if(DEFINED cython_lib_dir) + rapids_cython_add_rpath_entries(TARGET cudf_kafka PATHS "${cython_lib_dir}") +endif() diff --git a/python/cudf_kafka/LICENSE b/python/cudf_kafka/LICENSE new file mode 120000 index 00000000000..30cff7403da --- /dev/null +++ b/python/cudf_kafka/LICENSE @@ -0,0 +1 @@ +../../LICENSE \ No newline at end of file diff --git a/python/cudf_kafka/README.md b/python/cudf_kafka/README.md new file mode 120000 index 00000000000..fe840054137 --- /dev/null +++ b/python/cudf_kafka/README.md @@ -0,0 +1 @@ +../../README.md \ No newline at end of file diff --git a/python/cudf_kafka/cudf_kafka/VERSION b/python/cudf_kafka/cudf_kafka/VERSION new file mode 120000 index 00000000000..d62dc733efd --- /dev/null +++ b/python/cudf_kafka/cudf_kafka/VERSION @@ -0,0 +1 @@ +../../../VERSION \ No newline at end of file diff --git a/python/cudf_kafka/cudf_kafka/__init__.py b/python/cudf_kafka/cudf_kafka/__init__.py index e69de29bb2d..43a91af9cf5 100644 --- a/python/cudf_kafka/cudf_kafka/__init__.py +++ b/python/cudf_kafka/cudf_kafka/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) 2020-2023, NVIDIA CORPORATION. + +from ._version import __git_commit__, __version__ diff --git a/python/cudf_kafka/cudf_kafka/_lib/CMakeLists.txt b/python/cudf_kafka/cudf_kafka/_lib/CMakeLists.txt new file mode 100644 index 00000000000..3262b7d5ebe --- /dev/null +++ b/python/cudf_kafka/cudf_kafka/_lib/CMakeLists.txt @@ -0,0 +1,62 @@ +# ============================================================================= +# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +set(cython_sources kafka.pyx) +set(linked_libraries cudf_kafka::cudf_kafka) + +rapids_cython_create_modules( + CXX ASSOCIATED_TARGETS cudf_kafka + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" +) + +# TODO: Finding NumPy currently requires finding Development due to a bug in CMake. This bug was +# fixed in https://gitlab.kitware.com/cmake/cmake/-/merge_requests/7410 and will be available in +# CMake 3.24, so we can remove the Development component once we upgrade to CMake 3.24. +# find_package(Python REQUIRED COMPONENTS Development NumPy) + +# Note: The bug noted above prevents us from finding NumPy successfully using FindPython.cmake +# inside the manylinux images used to build wheels because manylinux images do not contain +# libpython.so and therefore Development cannot be found. Until we upgrade to CMake 3.24, we should +# use FindNumpy.cmake instead (provided by scikit-build). When we switch to 3.24 we can try +# switching back, but it may not work if that implicitly still requires Python libraries. In that +# case we'll need to follow up with the CMake team to remove that dependency. 
The stopgap solution +# is to unpack the static lib tarballs in the wheel building jobs so that there are at least static +# libs to be found, but that should be a last resort since it implies a dependency that isn't really +# necessary. The relevant command is tar -xf /opt/_internal/static-libs-for-embedding-only.tar.xz -C +# /opt/_internal" +find_package(NumPy REQUIRED) + +find_package(Python 3.9 REQUIRED COMPONENTS Interpreter) + +execute_process( + COMMAND "${Python_EXECUTABLE}" -c "import pyarrow; print(pyarrow.get_include())" + OUTPUT_VARIABLE PYARROW_INCLUDE_DIR + ERROR_VARIABLE PYARROW_ERROR + RESULT_VARIABLE PYARROW_RESULT + OUTPUT_STRIP_TRAILING_WHITESPACE +) + +if(${PYARROW_RESULT}) + message(FATAL_ERROR "Error while trying to obtain pyarrow include directory:\n${PYARROW_ERROR}") +endif() + +# TODO: Due to cudf's scalar.pyx needing to cimport pylibcudf's scalar.pyx (because there are parts +# of cudf Cython that need to directly access the c_obj underlying the pylibcudf Scalar) the +# requirement for arrow headers infects all of cudf. That in turn requires including numpy headers. +# These requirements will go away once all scalar-related Cython code is removed from cudf. +foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) + target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}") + target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}") +endforeach() diff --git a/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd b/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd index ca729c62512..068837d04ee 100644 --- a/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd +++ b/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd @@ -11,12 +11,12 @@ from cudf._lib.cpp.io.datasource cimport datasource from cudf._lib.io.datasource cimport Datasource -cdef extern from "kafka_callback.hpp" \ +cdef extern from "cudf_kafka/kafka_callback.hpp" \ namespace "cudf::io::external::kafka" nogil: ctypedef object (*python_callable_type)() -cdef extern from "kafka_consumer.hpp" \ +cdef extern from "cudf_kafka/kafka_consumer.hpp" \ namespace "cudf::io::external::kafka" nogil: cpdef cppclass kafka_consumer: diff --git a/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx b/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx index 4d732478723..2fbaacff7c6 100644 --- a/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx +++ b/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx @@ -3,12 +3,11 @@ from libc.stdint cimport int32_t, int64_t from libcpp cimport bool, nullptr from libcpp.map cimport map -from libcpp.memory cimport unique_ptr +from libcpp.memory cimport make_unique, unique_ptr from libcpp.string cimport string from libcpp.utility cimport move from cudf._lib.cpp.io.datasource cimport datasource -from cudf._lib.cpp.libcpp.memory cimport make_unique from cudf_kafka._lib.kafka cimport kafka_consumer diff --git a/python/cudf_kafka/cudf_kafka/_version.py b/python/cudf_kafka/cudf_kafka/_version.py new file mode 100644 index 00000000000..5adab566da0 --- /dev/null +++ b/python/cudf_kafka/cudf_kafka/_version.py @@ -0,0 +1,23 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import importlib.resources + +__version__ = ( + importlib.resources.files("cudf_kafka") + .joinpath("VERSION") + .read_text() + .strip() +) +__git_commit__ = "" diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index 386cdc32ab1..15431161d75 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -3,16 +3,17 @@ [build-system] requires = [ - "cython>=3.0.0", + "cython>=3.0.3", "numpy>=1.21,<1.25", - "pyarrow==12.0.1.*", + "pyarrow==14.0.1.*", + "scikit-build>=0.13.1", "setuptools", "wheel", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project] name = "cudf_kafka" -version = "23.10.00" +dynamic = ["version"] description = "cuDF Kafka Datasource" readme = { file = "README.md", content-type = "text/markdown" } authors = [ @@ -21,7 +22,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ - "cudf==23.10.*", + "cudf==23.12.*", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project.optional-dependencies] @@ -38,6 +39,9 @@ Documentation = "https://docs.rapids.ai/api/cudf/stable/" [tool.setuptools] license-files = ["LICENSE"] +[tool.setuptools.dynamic] +version = {file = "cudf_kafka/VERSION"} + [tool.isort] line_length = 79 multi_line_output = 3 diff --git a/python/cudf_kafka/setup.py b/python/cudf_kafka/setup.py index d955d95858a..6a99e9ed968 100644 --- a/python/cudf_kafka/setup.py +++ b/python/cudf_kafka/setup.py @@ -1,96 +1,13 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. -import os -import shutil -import sysconfig -from distutils.sysconfig import get_python_lib - -import numpy as np -import pyarrow as pa -from Cython.Build import cythonize -from setuptools import find_packages, setup -from setuptools.extension import Extension - -cython_files = ["cudf_kafka/_lib/*.pyx"] - -CUDA_HOME = os.environ.get("CUDA_HOME", False) -if not CUDA_HOME: - path_to_cuda_gdb = shutil.which("cuda-gdb") - if path_to_cuda_gdb is None: - raise OSError( - "Could not locate CUDA. " - "Please set the environment variable " - "CUDA_HOME to the path to the CUDA installation " - "and try again." 
- ) - CUDA_HOME = os.path.dirname(os.path.dirname(path_to_cuda_gdb)) - -if not os.path.isdir(CUDA_HOME): - raise OSError(f"Invalid CUDA_HOME: directory does not exist: {CUDA_HOME}") - -cuda_include_dir = os.path.join(CUDA_HOME, "include") - -CUDF_ROOT = os.environ.get( - "CUDF_ROOT", - os.path.abspath( - os.path.join( - os.path.dirname(os.path.abspath(__file__)), "../../cpp/build/" - ) - ), -) -CUDF_KAFKA_ROOT = os.environ.get( - "CUDF_KAFKA_ROOT", "../../cpp/libcudf_kafka/build" -) - -try: - nthreads = int(os.environ.get("PARALLEL_LEVEL", "0") or "0") -except Exception: - nthreads = 0 - -extensions = [ - Extension( - "*", - sources=cython_files, - include_dirs=[ - os.path.abspath(os.path.join(CUDF_ROOT, "../include/cudf")), - os.path.abspath(os.path.join(CUDF_ROOT, "../include")), - os.path.abspath( - os.path.join(CUDF_ROOT, "../libcudf_kafka/include/cudf_kafka") - ), - os.path.join(CUDF_ROOT, "include"), - os.path.join(CUDF_ROOT, "_deps/libcudacxx-src/include"), - os.path.join( - os.path.dirname(sysconfig.get_path("include")), - "rapids/libcudacxx", - ), - os.path.dirname(sysconfig.get_path("include")), - np.get_include(), - pa.get_include(), - cuda_include_dir, - ], - library_dirs=( - [ - get_python_lib(), - os.path.join(os.sys.prefix, "lib"), - CUDF_KAFKA_ROOT, - ] - ), - libraries=["cudf", "cudf_kafka"], - language="c++", - extra_compile_args=["-std=c++17", "-DFMT_HEADER_ONLY=1"], - ) -] +# Copyright (c) 2018-2023, NVIDIA CORPORATION. +from setuptools import find_packages +from skbuild import setup packages = find_packages(include=["cudf_kafka*"]) + setup( - # Include the separately-compiled shared library - ext_modules=cythonize( - extensions, - nthreads=nthreads, - compiler_directives=dict( - profile=False, language_level=3, embedsignature=True - ), - ), packages=packages, - package_data={key: ["*.pxd"] for key in packages}, + package_data={ + key: ["VERSION", "*.pxd", "*.hpp", "*.cuh"] for key in packages + }, zip_safe=False, ) diff --git a/python/custreamz/custreamz/VERSION b/python/custreamz/custreamz/VERSION new file mode 120000 index 00000000000..d62dc733efd --- /dev/null +++ b/python/custreamz/custreamz/VERSION @@ -0,0 +1 @@ +../../../VERSION \ No newline at end of file diff --git a/python/custreamz/custreamz/__init__.py b/python/custreamz/custreamz/__init__.py index 52be76aab1f..3f11da14684 100644 --- a/python/custreamz/custreamz/__init__.py +++ b/python/custreamz/custreamz/__init__.py @@ -1,3 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. +from ._version import __git_commit__, __version__ from .kafka import Consumer diff --git a/python/custreamz/custreamz/_version.py b/python/custreamz/custreamz/_version.py new file mode 100644 index 00000000000..0f545f95f2b --- /dev/null +++ b/python/custreamz/custreamz/_version.py @@ -0,0 +1,23 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import importlib.resources + +__version__ = ( + importlib.resources.files("custreamz") + .joinpath("VERSION") + .read_text() + .strip() +) +__git_commit__ = "" diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml index 47ade91b5eb..2d0059d5aa9 100644 --- a/python/custreamz/pyproject.toml +++ b/python/custreamz/pyproject.toml @@ -9,7 +9,7 @@ requires = [ [project] name = "custreamz" -version = "23.10.00" +dynamic = ["version"] description = "cuStreamz - GPU Accelerated Streaming" readme = { file = "README.md", content-type = "text/markdown" } authors = [ @@ -19,8 +19,8 @@ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ "confluent-kafka>=1.9.0,<1.10.0a0", - "cudf==23.10.*", - "cudf_kafka==23.10.*", + "cudf==23.12.*", + "cudf_kafka==23.12.*", "streamz", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ @@ -48,6 +48,9 @@ Homepage = "https://github.com/rapidsai/cudf" license-files = ["LICENSE"] zip-safe = false +[tool.setuptools.dynamic] +version = {file = "custreamz/VERSION"} + [tools.setuptools.packages.find] include = [ "custreamz", diff --git a/python/custreamz/setup.py b/python/custreamz/setup.py index 2fa45ac8087..04943bf88e2 100644 --- a/python/custreamz/setup.py +++ b/python/custreamz/setup.py @@ -2,4 +2,6 @@ from setuptools import setup -setup() +setup( + package_data={"custreamz": ["VERSION"]}, +) diff --git a/python/dask_cudf/dask_cudf/VERSION b/python/dask_cudf/dask_cudf/VERSION new file mode 120000 index 00000000000..d62dc733efd --- /dev/null +++ b/python/dask_cudf/dask_cudf/VERSION @@ -0,0 +1 @@ +../../../VERSION \ No newline at end of file diff --git a/python/dask_cudf/dask_cudf/__init__.py b/python/dask_cudf/dask_cudf/__init__.py index 6952c3d5882..c152a9e6a81 100644 --- a/python/dask_cudf/dask_cudf/__init__.py +++ b/python/dask_cudf/dask_cudf/__init__.py @@ -5,6 +5,7 @@ import cudf from . import backends +from ._version import __git_commit__, __version__ from .core import DataFrame, Series, concat, from_cudf, from_dask_dataframe from .groupby import groupby_agg from .io import read_csv, read_json, read_orc, read_text, to_orc @@ -14,8 +15,6 @@ except ImportError: pass -__version__ = "23.10.00" - __all__ = [ "DataFrame", "Series", diff --git a/python/dask_cudf/dask_cudf/_version.py b/python/dask_cudf/dask_cudf/_version.py new file mode 100644 index 00000000000..0dd62854a4e --- /dev/null +++ b/python/dask_cudf/dask_cudf/_version.py @@ -0,0 +1,23 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import importlib.resources + +__version__ = ( + importlib.resources.files("dask_cudf") + .joinpath("VERSION") + .read_text() + .strip() +) +__git_commit__ = "" diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index 2be256f85e8..387643587d1 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -12,6 +12,7 @@ import dask.dataframe as dd from dask import config +from dask.array.dispatch import percentile_lookup from dask.dataframe.backends import ( DataFrameBackendEntrypoint, PandasBackendEntrypoint, @@ -42,7 +43,7 @@ import cudf from cudf.api.types import is_string_dtype -from cudf.utils.utils import _dask_cudf_nvtx_annotate +from cudf.utils.nvtx_annotation import _dask_cudf_nvtx_annotate from .core import DataFrame, Index, Series @@ -320,56 +321,45 @@ def get_grouper_cudf(obj): return cudf.core.groupby.Grouper -try: - try: - from dask.array.dispatch import percentile_lookup - except ImportError: - from dask.dataframe.dispatch import ( - percentile_dispatch as percentile_lookup, - ) - - @percentile_lookup.register((cudf.Series, cp.ndarray, cudf.BaseIndex)) - @_dask_cudf_nvtx_annotate - def percentile_cudf(a, q, interpolation="linear"): - # Cudf dispatch to the equivalent of `np.percentile`: - # https://numpy.org/doc/stable/reference/generated/numpy.percentile.html - a = cudf.Series(a) - # a is series. - n = len(a) - if not len(a): - return None, n - if isinstance(q, Iterator): - q = list(q) - - if cudf.api.types.is_categorical_dtype(a.dtype): - result = cp.percentile(a.cat.codes, q, interpolation=interpolation) - - return ( - pd.Categorical.from_codes( - result, a.dtype.categories, a.dtype.ordered - ), - n, - ) - if np.issubdtype(a.dtype, np.datetime64): - result = a.quantile( - [i / 100.0 for i in q], interpolation=interpolation - ) +@percentile_lookup.register((cudf.Series, cp.ndarray, cudf.BaseIndex)) +@_dask_cudf_nvtx_annotate +def percentile_cudf(a, q, interpolation="linear"): + # Cudf dispatch to the equivalent of `np.percentile`: + # https://numpy.org/doc/stable/reference/generated/numpy.percentile.html + a = cudf.Series(a) + # a is series. 
+ n = len(a) + if not len(a): + return None, n + if isinstance(q, Iterator): + q = list(q) + + if cudf.api.types.is_categorical_dtype(a.dtype): + result = cp.percentile(a.cat.codes, q, interpolation=interpolation) - if q[0] == 0: - # https://github.com/dask/dask/issues/6864 - result[0] = min(result[0], a.min()) - return result.to_pandas(), n - if not np.issubdtype(a.dtype, np.number): - interpolation = "nearest" return ( - a.quantile( - [i / 100.0 for i in q], interpolation=interpolation - ).to_pandas(), + pd.Categorical.from_codes( + result, a.dtype.categories, a.dtype.ordered + ), n, ) + if np.issubdtype(a.dtype, np.datetime64): + result = a.quantile( + [i / 100.0 for i in q], interpolation=interpolation + ) -except ImportError: - pass + if q[0] == 0: + # https://github.com/dask/dask/issues/6864 + result[0] = min(result[0], a.min()) + return result.to_pandas(), n + if not np.issubdtype(a.dtype, np.number): + interpolation = "nearest" + return ( + a.quantile( + [i / 100.0 for i in q], interpolation=interpolation + ).to_pandas(), + n, + ) @pyarrow_schema_dispatch.register((cudf.DataFrame,)) @@ -486,6 +476,31 @@ def sizeof_cudf_series_index(obj): return obj.memory_usage() +# TODO: Remove try/except when cudf is pinned to dask>=2023.10.0 +try: + from dask.dataframe.dispatch import partd_encode_dispatch + + @partd_encode_dispatch.register(cudf.DataFrame) + def _simple_cudf_encode(_): + # Basic pickle-based encoding for a partd k-v store + import pickle + from functools import partial + + import partd + + def join(dfs): + if not dfs: + return cudf.DataFrame() + else: + return cudf.concat(dfs) + + dumps = partial(pickle.dumps, protocol=pickle.HIGHEST_PROTOCOL) + return partial(partd.Encode, dumps, pickle.loads, join) + +except ImportError: + pass + + def _default_backend(func, *args, **kwargs): # Utility to call a dask.dataframe function with # the default ("pandas") backend diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index 5b37e6e825c..17650c9b70d 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -22,7 +22,7 @@ import cudf from cudf import _lib as libcudf -from cudf.utils.utils import _dask_cudf_nvtx_annotate +from cudf.utils.nvtx_annotation import _dask_cudf_nvtx_annotate from dask_cudf import sorting from dask_cudf.accessors import ListMethods, StructMethods diff --git a/python/dask_cudf/dask_cudf/groupby.py b/python/dask_cudf/dask_cudf/groupby.py index f4bbcaf4dd1..b1fdf443a17 100644 --- a/python/dask_cudf/dask_cudf/groupby.py +++ b/python/dask_cudf/dask_cudf/groupby.py @@ -15,7 +15,7 @@ from dask.utils import funcname import cudf -from cudf.utils.utils import _dask_cudf_nvtx_annotate +from cudf.utils.nvtx_annotation import _dask_cudf_nvtx_annotate # aggregations that are dask-cudf optimized OPTIMIZED_AGGS = ( diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index 85ec36cf2c5..7b4e20012f7 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -148,7 +148,6 @@ def test_roundtrip_from_pandas(tmpdir): def test_strings(tmpdir): - fn = str(tmpdir) dfp = pd.DataFrame( {"a": ["aa", "bbb", "cccc"], "b": ["hello", "dog", "man"]} @@ -161,7 +160,6 @@ def test_strings(tmpdir): def test_dask_timeseries_from_pandas(tmpdir): - fn = str(tmpdir.join("test.parquet")) ddf2 = dask.datasets.timeseries(freq="D") pdf = ddf2.compute() @@ -173,7 +171,6 @@ def 
test_dask_timeseries_from_pandas(tmpdir): @pytest.mark.parametrize("index", [False, None]) @pytest.mark.parametrize("divisions", [False, True]) def test_dask_timeseries_from_dask(tmpdir, index, divisions): - fn = str(tmpdir) ddf2 = dask.datasets.timeseries(freq="D") ddf2.to_parquet(fn, engine="pyarrow", write_index=index) @@ -188,7 +185,6 @@ def test_dask_timeseries_from_dask(tmpdir, index, divisions): @pytest.mark.parametrize("index", [False, None]) @pytest.mark.parametrize("divisions", [False, True]) def test_dask_timeseries_from_daskcudf(tmpdir, index, divisions): - fn = str(tmpdir) ddf2 = dask_cudf.from_cudf( cudf.datasets.timeseries(freq="D"), npartitions=4 @@ -205,7 +201,6 @@ def test_dask_timeseries_from_daskcudf(tmpdir, index, divisions): @pytest.mark.parametrize("index", [False, True]) def test_empty(tmpdir, index): - fn = str(tmpdir) dfp = pd.DataFrame({"a": [11.0, 12.0, 12.0], "b": [4, 5, 6]})[:0] if index: @@ -218,7 +213,6 @@ def test_empty(tmpdir, index): def test_filters(tmpdir): - tmp_path = str(tmpdir) df = pd.DataFrame({"x": range(10), "y": list("aabbccddee")}) ddf = dd.from_pandas(df, npartitions=5) @@ -251,7 +245,6 @@ def test_filters(tmpdir): @pytest.mark.parametrize("numeric", [True, False]) @pytest.mark.parametrize("null", [np.nan, None]) def test_isna_filters(tmpdir, null, numeric): - tmp_path = str(tmpdir) df = pd.DataFrame( { @@ -284,7 +277,6 @@ def test_isna_filters(tmpdir, null, numeric): def test_filters_at_row_group_level(tmpdir): - tmp_path = str(tmpdir) df = pd.DataFrame({"x": range(10), "y": list("aabbccddee")}) ddf = dd.from_pandas(df, npartitions=5) @@ -405,7 +397,6 @@ def test_split_row_groups(tmpdir, row_groups, index): @need_create_meta @pytest.mark.parametrize("partition_on", [None, "a"]) def test_create_metadata_file(tmpdir, partition_on): - tmpdir = str(tmpdir) # Write ddf without a _metadata file @@ -445,7 +436,6 @@ def test_create_metadata_file(tmpdir, partition_on): @need_create_meta def test_create_metadata_file_inconsistent_schema(tmpdir): - # NOTE: This test demonstrates that the CudfEngine # can be used to generate a global `_metadata` file # even if there are inconsistent schemas in the dataset. 
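The import changes above (core.py, groupby.py) and in sorting.py below follow from relocating the NVTX helpers out of cudf.utils.utils into the new cudf.utils.nvtx_annotation module added earlier in this diff. A minimal usage sketch under the new import path; the _double_values helper is a hypothetical example, not code from this patch, and it assumes a working GPU-enabled cudf installation:

# Hypothetical sketch (not part of this patch): decorating a helper with the
# relocated NVTX annotation utility from cudf.utils.nvtx_annotation.
import cudf
from cudf.utils.nvtx_annotation import _dask_cudf_nvtx_annotate

@_dask_cudf_nvtx_annotate
def _double_values(df):
    # Runs inside an NVTX range in the "dask_cudf_python" domain, named
    # after the function's __qualname__ and colored via the sha256-based
    # _get_color_for_nvtx helper shown earlier in this diff.
    return df * 2

print(_double_values(cudf.DataFrame({"a": [1, 2, 3]})))

Profilers such as Nsight Systems group these ranges by domain, which is why the cudf ("cudf_python") and dask_cudf ("dask_cudf_python") decorators remain separate entry points.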
diff --git a/python/dask_cudf/dask_cudf/sorting.py b/python/dask_cudf/dask_cudf/sorting.py index d6c9c1be73c..27ba82c390c 100644 --- a/python/dask_cudf/dask_cudf/sorting.py +++ b/python/dask_cudf/dask_cudf/sorting.py @@ -16,7 +16,7 @@ import cudf as gd from cudf.api.types import is_categorical_dtype -from cudf.utils.utils import _dask_cudf_nvtx_annotate +from cudf.utils.nvtx_annotation import _dask_cudf_nvtx_annotate _SHUFFLE_SUPPORT = ("tasks", "p2p") # "disk" not supported diff --git a/python/dask_cudf/dask_cudf/tests/test_sort.py b/python/dask_cudf/dask_cudf/tests/test_sort.py index 94609b180d6..e58255cda06 100644 --- a/python/dask_cudf/dask_cudf/tests/test_sort.py +++ b/python/dask_cudf/dask_cudf/tests/test_sort.py @@ -114,3 +114,14 @@ def test_sort_values_empty_string(by): if "a" in by: expect = df.sort_values(by) assert dd.assert_eq(got, expect, check_index=False) + + +def test_disk_shuffle(): + try: + from dask.dataframe.dispatch import partd_encode_dispatch # noqa: F401 + except ImportError: + pytest.skip("need a version of dask that has partd_encode_dispatch") + df = cudf.DataFrame({"a": [1, 2, 3] * 20, "b": [4, 5, 6, 7] * 15}) + ddf = dd.from_pandas(df, npartitions=4) + got = dd.DataFrame.shuffle(ddf, "a", shuffle="disk") + dd.assert_eq(got, df) diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index 41b57b71749..0306da3de46 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -9,7 +9,7 @@ requires = [ [project] name = "dask_cudf" -version = "23.10.00" +dynamic = ["version", "entry-points"] description = "Utilities for Dask and cuDF interactions" readme = { file = "README.md", content-type = "text/markdown" } authors = [ @@ -18,13 +18,12 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ - "cudf==23.10.*", + "cudf==23.12.*", "cupy-cuda11x>=12.0.0", - "dask==2023.9.2", - "distributed==2023.9.2", "fsspec>=0.6.0", "numpy>=1.21,<1.25", "pandas>=1.3,<1.6.0dev0", + "rapids-dask-dependency==23.12.*", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ "Intended Audience :: Developers", @@ -35,11 +34,10 @@ classifiers = [ "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", ] -dynamic = ["entry-points"] [project.optional-dependencies] test = [ - "dask-cuda==23.10.*", + "dask-cuda==23.12.*", "numba>=0.57,<0.58", "pytest", "pytest-cov", @@ -52,6 +50,9 @@ Homepage = "https://github.com/rapidsai/cudf" [tool.setuptools] license-files = ["LICENSE"] +[tool.setuptools.dynamic] +version = {file = "dask_cudf/VERSION"} + [tool.isort] line_length = 79 multi_line_output = 3 diff --git a/python/dask_cudf/setup.py b/python/dask_cudf/setup.py index 3fa0f257834..c6ce219d32f 100644 --- a/python/dask_cudf/setup.py +++ b/python/dask_cudf/setup.py @@ -2,9 +2,12 @@ from setuptools import find_packages, setup +packages = find_packages(exclude=["tests", "tests.*"]) + setup( include_package_data=True, - packages=find_packages(exclude=["tests", "tests.*"]), + packages=packages, + package_data={key: ["VERSION"] for key in packages}, entry_points={ "dask.dataframe.backends": [ "cudf = dask_cudf.backends:CudfBackendEntrypoint",