diff --git a/.github/labeler.yml b/.github/labeler.yml index 90cdda4d3ca..8506d38a048 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -12,7 +12,7 @@ cudf.polars: - 'python/cudf_polars/**' pylibcudf: - - 'python/cudf/pylibcudf/**' + - 'python/pylibcudf/**' libcudf: - 'cpp/**' diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index d6d3e3fdd33..43fc567cb67 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -57,19 +57,19 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: arch: "amd64" branch: ${{ inputs.branch }} build_type: ${{ inputs.build_type || 'branch' }} - container_image: "rapidsai/ci-conda:latest" + container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" date: ${{ inputs.date }} node_type: "gpu-v100-latest-1" run_script: "ci/build_docs.sh" sha: ${{ inputs.sha }} wheel-build-libcudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: # build for every combination of arch and CUDA version, but only for the latest Python matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) @@ -81,7 +81,7 @@ jobs: wheel-publish-libcudf: needs: wheel-build-libcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -92,7 +92,7 @@ jobs: wheel-build-pylibcudf: needs: [wheel-publish-libcudf] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -102,7 +102,7 @@ jobs: wheel-publish-pylibcudf: needs: wheel-build-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -113,7 +113,7 @@ jobs: 
wheel-build-cudf: needs: wheel-publish-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -123,7 +123,7 @@ jobs: wheel-publish-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -134,7 +134,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-publish-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -146,7 +146,7 @@ jobs: wheel-publish-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -157,7 +157,7 @@ jobs: wheel-build-cudf-polars: needs: wheel-publish-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -169,7 +169,7 @@ jobs: wheel-publish-cudf-polars: needs: wheel-build-cudf-polars secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pandas-tests.yaml b/.github/workflows/pandas-tests.yaml index d670132cca9..10c803f7921 100644 --- a/.github/workflows/pandas-tests.yaml +++ b/.github/workflows/pandas-tests.yaml @@ -17,7 +17,7 @@ jobs: pandas-tests: # run the Pandas unit tests secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index a4a8f036174..b4c449ce5d8 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -30,6 +30,7 @@ jobs: - wheel-tests-cudf - wheel-build-cudf-polars - wheel-tests-cudf-polars + - cudf-polars-polars-tests - wheel-build-dask-cudf - wheel-tests-dask-cudf - devcontainer @@ -37,7 +38,7 @@ jobs: - pandas-tests - pandas-tests-diff secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.10 if: always() with: needs: ${{ toJSON(needs) }} @@ -52,7 +53,7 @@ jobs: steps: - name: Get PR info id: get-pr-info - uses: rapidsai/shared-actions/get-pr-info@main + uses: nv-gha-runners/get-pr-info@main - name: Checkout code repo uses: actions/checkout@v4 with: @@ -104,39 +105,39 @@ jobs: - '!notebooks/**' checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.10 with: enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10 with: build_type: pull-request conda-cpp-checks: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.10 with: build_type: pull-request enable_check_symbols: true conda-cpp-tests: needs: [conda-cpp-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10 if: needs.changed-files.outputs.test_cpp == 'true' with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10 with: build_type: pull-request conda-python-cudf-tests: needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 if: needs.changed-files.outputs.test_python == 'true' with: build_type: pull-request @@ -145,7 +146,7 @@ jobs: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 if: needs.changed-files.outputs.test_python == 'true' with: build_type: pull-request @@ -153,18 +154,18 @@ jobs: conda-java-tests: needs: [conda-cpp-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 if: needs.changed-files.outputs.test_java == 'true' 
with: build_type: pull-request node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:latest" + container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" run_script: "ci/test_java.sh" static-configure: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: pull-request # Use the wheel container so we can skip conda solves and since our @@ -174,28 +175,28 @@ jobs: conda-notebook-tests: needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 if: needs.changed-files.outputs.test_notebooks == 'true' with: build_type: pull-request node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:latest" + container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" run_script: "ci/test_notebooks.sh" docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: pull-request node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:latest" + container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" run_script: "ci/build_docs.sh" wheel-build-libcudf: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: # build for every combination of arch and CUDA version, but only for the latest Python matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) @@ -204,21 +205,21 @@ jobs: wheel-build-pylibcudf: needs: [checks, wheel-build-libcudf] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: build_type: pull-request script: "ci/build_wheel_pylibcudf.sh" wheel-build-cudf: needs: wheel-build-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: build_type: pull-request script: "ci/build_wheel_cudf.sh" wheel-tests-cudf: needs: [wheel-build-cudf, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 if: needs.changed-files.outputs.test_python == 'true' with: build_type: pull-request @@ -226,7 +227,7 @@ jobs: wheel-build-cudf-polars: needs: wheel-build-pylibcudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -235,7 +236,7 @@ jobs: wheel-tests-cudf-polars: needs: [wheel-build-cudf-polars, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 if: needs.changed-files.outputs.test_python == 'true' with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". @@ -244,10 +245,21 @@ jobs: # This always runs, but only fails if this PR touches code in # pylibcudf or cudf_polars script: "ci/test_wheel_cudf_polars.sh" + cudf-polars-polars-tests: + needs: wheel-build-cudf-polars + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 + with: + # This selects "ARCH=amd64 + the latest supported Python + CUDA". + matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) + build_type: pull-request + # This always runs, but only fails if this PR touches code in + # pylibcudf or cudf_polars + script: "ci/test_cudf_polars_polars_tests.sh" wheel-build-dask-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -256,7 +268,7 @@ jobs: wheel-tests-dask-cudf: needs: [wheel-build-dask-cudf, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 if: needs.changed-files.outputs.test_python == 'true' with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". @@ -265,7 +277,7 @@ jobs: script: ci/test_wheel_dask_cudf.sh devcontainer: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.10 with: arch: '["amd64"]' cuda: '["12.5"]' @@ -276,7 +288,7 @@ jobs: unit-tests-cudf-pandas: needs: [wheel-build-cudf, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 if: needs.changed-files.outputs.test_python == 'true' with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". @@ -287,7 +299,7 @@ jobs: # run the Pandas unit tests using PR branch needs: [wheel-build-cudf, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 if: needs.changed-files.outputs.test_python == 'true' with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
@@ -299,7 +311,7 @@ jobs: pandas-tests-diff: # diff the results of running the Pandas unit tests and publish a job summary needs: pandas-tests - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: node_type: cpu4 build_type: pull-request diff --git a/.github/workflows/pr_issue_status_automation.yml b/.github/workflows/pr_issue_status_automation.yml index fe77ad4b6b2..45e5191eb54 100644 --- a/.github/workflows/pr_issue_status_automation.yml +++ b/.github/workflows/pr_issue_status_automation.yml @@ -23,7 +23,7 @@ on: jobs: get-project-id: - uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@branch-24.10 if: github.event.pull_request.state == 'open' secrets: inherit permissions: @@ -34,7 +34,7 @@ jobs: update-status: # This job sets the PR and its linked issues to "In Progress" status - uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-24.10 if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: get-project-id with: @@ -50,7 +50,7 @@ jobs: update-sprint: # This job sets the PR and its linked issues to the current "Weekly Sprint" - uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@branch-24.10 if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: get-project-id with: diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 4af6a0d690d..eb15c2931d2 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -25,7 +25,7 @@ jobs: enable_check_symbols: true conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -33,7 +33,7 @@ jobs: sha: ${{ inputs.sha }} conda-cpp-memcheck-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -41,11 +41,11 @@ jobs: sha: ${{ inputs.sha }} node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:latest" + container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" run_script: "ci/test_cpp_memcheck.sh" static-configure: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: pull-request # Use the wheel container so we can skip conda solves and since our @@ -54,7 +54,7 @@ jobs: run_script: "ci/configure_cpp_static.sh" 
conda-python-cudf-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -64,7 +64,7 @@ jobs: conda-python-other-tests: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -73,7 +73,7 @@ jobs: script: "ci/test_python_other.sh" conda-java-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -81,11 +81,11 @@ jobs: sha: ${{ inputs.sha }} node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:latest" + container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" run_script: "ci/test_java.sh" conda-notebook-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -93,11 +93,11 @@ jobs: sha: ${{ inputs.sha }} node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:latest" + container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" run_script: "ci/test_notebooks.sh" wheel-tests-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -106,7 +106,7 @@ jobs: script: ci/test_wheel_cudf.sh wheel-tests-dask-cudf: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -117,7 +117,7 @@ jobs: script: ci/test_wheel_dask_cudf.sh unit-tests-cudf-pandas: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} @@ -126,7 +126,7 @@ jobs: script: ci/cudf_pandas_scripts/run_tests.sh third-party-integration-tests-cudf-pandas: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 with: build_type: nightly branch: ${{ inputs.branch }} diff --git a/README.md b/README.md index f62f7885d63..8f8c2adac2f 100644 --- a/README.md +++ b/README.md @@ -79,7 +79,7 @@ pip install --extra-index-url=https://pypi.nvidia.com cudf-cu12 ### Conda -cuDF can be installed with conda (via [miniconda](https://docs.conda.io/projects/miniconda/en/latest/) or the full [Anaconda distribution](https://www.anaconda.com/download) from the `rapidsai` channel: +cuDF can be installed with conda (via [miniforge](https://github.com/conda-forge/miniforge)) from the `rapidsai` channel: ```bash conda install -c rapidsai -c conda-forge -c nvidia \ diff --git a/build.sh b/build.sh index 211e1db9fbf..56359eae235 100755 --- a/build.sh +++ b/build.sh @@ -17,13 +17,14 @@ ARGS=$* # script, and that this script resides in the repo dir! REPODIR=$(cd $(dirname $0); pwd) -VALIDARGS="clean libcudf pylibcudf cudf cudfjar dask_cudf benchmarks tests libcudf_kafka cudf_kafka custreamz -v -g -n --pydevelop -l --allgpuarch --disable_nvtx --opensource_nvcomp --show_depr_warn --ptds -h --build_metrics --incl_cache_stats --disable_large_strings" -HELP="$0 [clean] [libcudf] [pylibcudf] [cudf] [cudfjar] [dask_cudf] [benchmarks] [tests] [libcudf_kafka] [cudf_kafka] [custreamz] [-v] [-g] [-n] [-h] [--cmake-args=\\\"\\\"] +VALIDARGS="clean libcudf pylibcudf cudf cudf_polars cudfjar dask_cudf benchmarks tests libcudf_kafka cudf_kafka custreamz -v -g -n --pydevelop -l --allgpuarch --disable_nvtx --opensource_nvcomp --show_depr_warn --ptds -h --build_metrics --incl_cache_stats --disable_large_strings" +HELP="$0 [clean] [libcudf] [pylibcudf] [cudf] [cudf_polars] [cudfjar] [dask_cudf] [benchmarks] [tests] [libcudf_kafka] [cudf_kafka] [custreamz] [-v] [-g] [-n] [-h] [--cmake-args=\\\"\\\"] clean - remove all existing build artifacts and configuration (start over) libcudf - build the cudf C++ code only pylibcudf - build the pylibcudf Python package cudf - build the cudf Python package + cudf_polars - build the cudf_polars Python package cudfjar - build cudf JAR with static libcudf using devtoolset toolchain dask_cudf - build the dask_cudf Python package benchmarks - build benchmarks @@ -239,11 +240,6 @@ if hasArg --pydevelop; then PYTHON_ARGS_FOR_INSTALL="${PYTHON_ARGS_FOR_INSTALL} -e" fi -# Append `-DFIND_CUDF_CPP=ON` to EXTRA_CMAKE_ARGS unless a user specified the option. -if [[ "${EXTRA_CMAKE_ARGS}" != *"DFIND_CUDF_CPP"* ]]; then - EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DFIND_CUDF_CPP=ON" -fi - if hasArg --disable_large_strings; then BUILD_DISABLE_LARGE_STRINGS="ON" fi @@ -358,6 +354,12 @@ if buildAll || hasArg cudf; then python ${PYTHON_ARGS_FOR_INSTALL} . 
fi +# Build and install the cudf_polars Python package +if buildAll || hasArg cudf_polars; then + + cd ${REPODIR}/python/cudf_polars + python ${PYTHON_ARGS_FOR_INSTALL} . +fi # Build and install the dask_cudf Python package if buildAll || hasArg dask_cudf; then diff --git a/ci/cudf_pandas_scripts/pandas-tests/job-summary.py b/ci/cudf_pandas_scripts/pandas-tests/job-summary.py index 93a815838b7..7a12db927e5 100644 --- a/ci/cudf_pandas_scripts/pandas-tests/job-summary.py +++ b/ci/cudf_pandas_scripts/pandas-tests/job-summary.py @@ -68,8 +68,18 @@ def emoji_failed(x): pr_df = pd.DataFrame.from_dict(pr_results, orient="index").sort_index() main_df = pd.DataFrame.from_dict(main_results, orient="index").sort_index() diff_df = pr_df - main_df +total_usage = pr_df['_slow_function_call'] + pr_df['_fast_function_call'] +pr_df['CPU Usage'] = ((pr_df['_slow_function_call']/total_usage)*100.0).round(1) +pr_df['GPU Usage'] = ((pr_df['_fast_function_call']/total_usage)*100.0).round(1) -pr_df = pr_df[["total", "passed", "failed", "skipped"]] +cpu_usage_mean = pr_df['CPU Usage'].mean().round(2) +gpu_usage_mean = pr_df['GPU Usage'].mean().round(2) + +# Add '%' suffix to 'CPU Usage' and 'GPU Usage' columns +pr_df['CPU Usage'] = pr_df['CPU Usage'].fillna(0).astype(str) + '%' +pr_df['GPU Usage'] = pr_df['GPU Usage'].fillna(0).astype(str) + '%' + +pr_df = pr_df[["total", "passed", "failed", "skipped", 'CPU Usage', 'GPU Usage']] diff_df = diff_df[["total", "passed", "failed", "skipped"]] diff_df.columns = diff_df.columns + "_diff" diff_df["passed_diff"] = diff_df["passed_diff"].map(emoji_passed) @@ -95,6 +105,8 @@ def emoji_failed(x): print(comment) print() +print(f"Average CPU and GPU usage for the tests: {cpu_usage_mean}% and {gpu_usage_mean}%") +print() print("Here are the results of running the Pandas tests against this PR:") print() print(df.to_markdown()) diff --git a/ci/cudf_pandas_scripts/run_tests.sh b/ci/cudf_pandas_scripts/run_tests.sh index c6228a4ef33..f6bdc6f9484 100755 --- a/ci/cudf_pandas_scripts/run_tests.sh +++ b/ci/cudf_pandas_scripts/run_tests.sh @@ -56,10 +56,10 @@ else echo "" > ./constraints.txt if [[ $RAPIDS_DEPENDENCIES == "oldest" ]]; then - # `test_python` constraints are for `[test]` not `[cudf-pandas-tests]` + # `test_python_cudf_pandas` constraints are for `[test]` not `[cudf-pandas-tests]` rapids-dependency-file-generator \ --output requirements \ - --file-key test_python \ + --file-key test_python_cudf_pandas \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \ | tee ./constraints.txt fi diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index be55b49870f..f73e88bc0c8 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -25,9 +25,9 @@ NEXT_PATCH=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[3]}') NEXT_SHORT_TAG=${NEXT_MAJOR}.${NEXT_MINOR} # Need to distutils-normalize the versions for some use cases -CURRENT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${CURRENT_SHORT_TAG}'))") -NEXT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_SHORT_TAG}'))") -PATCH_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_PATCH}'))") +CURRENT_SHORT_TAG_PEP440=$(python -c "from packaging.version import Version; print(Version('${CURRENT_SHORT_TAG}'))") +NEXT_SHORT_TAG_PEP440=$(python -c "from packaging.version 
import Version; print(Version('${NEXT_SHORT_TAG}'))") +PATCH_PEP440=$(python -c "from packaging.version import Version; print(Version('${NEXT_PATCH}'))") echo "Preparing release $CURRENT_TAG => $NEXT_FULL_TAG" @@ -45,6 +45,8 @@ sed_runner "s/branch-.*/branch-${NEXT_SHORT_TAG}/g" ci/test_wheel_dask_cudf.sh DEPENDENCIES=( cudf cudf_kafka + cugraph + cuml custreamz dask-cuda dask-cudf @@ -57,7 +59,7 @@ DEPENDENCIES=( rmm ) for DEP in "${DEPENDENCIES[@]}"; do - for FILE in dependencies.yaml conda/environments/*.yaml; do + for FILE in dependencies.yaml conda/environments/*.yaml python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml; do sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0/g" "${FILE}" done for FILE in python/*/pyproject.toml; do diff --git a/ci/run_cudf_polars_polars_tests.sh b/ci/run_cudf_polars_polars_tests.sh new file mode 100755 index 00000000000..95f78f17f2f --- /dev/null +++ b/ci/run_cudf_polars_polars_tests.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +set -euo pipefail + +# Support invoking run_cudf_polars_pytests.sh outside the script directory +# Assumption, polars has been cloned in the root of the repo. +cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../polars/ + +DESELECTED_TESTS=( + "tests/unit/test_polars_import.py::test_polars_import" # relies on a polars built in place + "tests/unit/streaming/test_streaming_sort.py::test_streaming_sort[True]" # relies on polars built in debug mode + "tests/unit/test_cpu_check.py::test_check_cpu_flags_skipped_no_flags" # Mock library error + "tests/docs/test_user_guide.py" # No dot binary in CI image +) + +DESELECTED_TESTS=$(printf -- " --deselect %s" "${DESELECTED_TESTS[@]}") +python -m pytest \ + --import-mode=importlib \ + --cache-clear \ + -m "" \ + -p cudf_polars.testing.plugin \ + -v \ + --tb=native \ + ${DESELECTED_TESTS} \ + "$@" \ + py-polars/tests diff --git a/ci/test_cudf_polars_polars_tests.sh b/ci/test_cudf_polars_polars_tests.sh new file mode 100755 index 00000000000..bfc8fd37565 --- /dev/null +++ b/ci/test_cudf_polars_polars_tests.sh @@ -0,0 +1,68 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +set -eou pipefail + +# We will only fail these tests if the PR touches code in pylibcudf +# or cudf_polars itself. +# Note, the three dots mean we are doing diff between the merge-base +# of upstream and HEAD. So this is asking, "does _this branch_ touch +# files in cudf_polars/pylibcudf", rather than "are there changes +# between upstream and this branch which touch cudf_polars/pylibcudf" +# TODO: is the target branch exposed anywhere in an environment variable? 
+if [ -n "$(git diff --name-only origin/branch-24.10...HEAD -- python/cudf_polars/ python/cudf/cudf/_lib/pylibcudf/)" ]; +then + HAS_CHANGES=1 + rapids-logger "PR has changes in cudf-polars/pylibcudf, test fails treated as failure" +else + HAS_CHANGES=0 + rapids-logger "PR does not have changes in cudf-polars/pylibcudf, test fails NOT treated as failure" +fi + +rapids-logger "Download wheels" + +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" +RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 ./dist + +# Download the pylibcudf built in the previous step +RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-pylibcudf-dep + +rapids-logger "Install pylibcudf" +python -m pip install ./local-pylibcudf-dep/pylibcudf*.whl + +rapids-logger "Install cudf_polars" +python -m pip install $(echo ./dist/cudf_polars*.whl) + +TAG=$(python -c 'import polars; print(f"py-{polars.__version__}")') +rapids-logger "Clone polars to ${TAG}" +git clone https://github.com/pola-rs/polars.git --branch ${TAG} --depth 1 + +# Install requirements for running polars tests +rapids-logger "Install polars test requirements" +python -m pip install -r polars/py-polars/requirements-dev.txt -r polars/py-polars/requirements-ci.txt + +function set_exitcode() +{ + EXITCODE=$? +} +EXITCODE=0 +trap set_exitcode ERR +set +e + +rapids-logger "Run polars tests" +./ci/run_cudf_polars_polars_tests.sh + +trap ERR +set -e + +if [ ${EXITCODE} != 0 ]; then + rapids-logger "Running polars test suite FAILED: exitcode ${EXITCODE}" +else + rapids-logger "Running polars test suite PASSED" +fi + +if [ ${HAS_CHANGES} == 1 ]; then + exit ${EXITCODE} +else + exit 0 +fi diff --git a/ci/test_python_common.sh b/ci/test_python_common.sh index d0675b0431a..dc70661a17a 100755 --- a/ci/test_python_common.sh +++ b/ci/test_python_common.sh @@ -10,10 +10,10 @@ set -euo pipefail rapids-logger "Generate Python testing dependencies" ENV_YAML_DIR="$(mktemp -d)" - +FILE_KEY=$1 rapids-dependency-file-generator \ --output conda \ - --file-key test_python \ + --file-key ${FILE_KEY} \ --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \ | tee "${ENV_YAML_DIR}/env.yaml" diff --git a/ci/test_python_cudf.sh b/ci/test_python_cudf.sh index ae34047e87f..2386414b32e 100755 --- a/ci/test_python_cudf.sh +++ b/ci/test_python_cudf.sh @@ -5,7 +5,7 @@ cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../; # Common setup steps shared by Python test jobs -source ./ci/test_python_common.sh +source ./ci/test_python_common.sh test_python_cudf rapids-logger "Check GPU usage" nvidia-smi diff --git a/ci/test_python_other.sh b/ci/test_python_other.sh index 06a24773cae..67c97ad29a5 100755 --- a/ci/test_python_other.sh +++ b/ci/test_python_other.sh @@ -5,7 +5,7 @@ cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../ # Common setup steps shared by Python test jobs -source ./ci/test_python_common.sh +source ./ci/test_python_common.sh test_python_other rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ diff --git a/ci/test_wheel_cudf.sh b/ci/test_wheel_cudf.sh index 28ded2f8e0f..a701bfe15e0 100755 --- a/ci/test_wheel_cudf.sh +++ b/ci/test_wheel_cudf.sh @@ -39,6 +39,7 @@ rapids-logger "pytest pylibcudf" pushd python/pylibcudf/pylibcudf/tests python -m pytest \ --cache-clear \ + --numprocesses=8 \ --dist=worksteal \ . 
popd diff --git a/ci/test_wheel_cudf_polars.sh b/ci/test_wheel_cudf_polars.sh index 9844090258a..3116bd820e9 100755 --- a/ci/test_wheel_cudf_polars.sh +++ b/ci/test_wheel_cudf_polars.sh @@ -13,10 +13,14 @@ set -eou pipefail if [ -n "$(git diff --name-only origin/branch-24.10...HEAD -- python/cudf_polars/ python/pylibcudf/)" ]; then HAS_CHANGES=1 + rapids-logger "PR has changes in cudf-polars/pylibcudf, test fails treated as failure" else HAS_CHANGES=0 + rapids-logger "PR does not have changes in cudf-polars/pylibcudf, test fails NOT treated as failure" fi +rapids-logger "Download wheels" + RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 python ./dist @@ -35,7 +39,7 @@ if [[ $RAPIDS_DEPENDENCIES == "oldest" ]]; then | tee ./constraints.txt fi -# echo to expand wildcard before adding `[extra]` requires for pip +# echo to expand wildcard before adding `[test]` requires for pip python -m pip install \ -v \ --constraint ./constraints.txt \ diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh index 0d39807d56c..361a42ccda9 100755 --- a/ci/test_wheel_dask_cudf.sh +++ b/ci/test_wheel_dask_cudf.sh @@ -41,6 +41,7 @@ pushd python/dask_cudf/dask_cudf DASK_DATAFRAME__QUERY_PLANNING=True python -m pytest \ --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf.xml" \ --numprocesses=8 \ + --dist=worksteal \ . popd @@ -50,5 +51,6 @@ pushd python/dask_cudf/dask_cudf DASK_DATAFRAME__QUERY_PLANNING=False python -m pytest \ --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf-legacy.xml" \ --numprocesses=8 \ + --dist=worksteal \ . popd diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index c96e8706d27..16b3d112992 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -31,7 +31,7 @@ dependencies: - doxygen=1.9.1 - fastavro>=0.22.9 - flatbuffers==24.3.25 -- fmt>=10.1.1,<11 +- fmt>=11.0.2,<12 - fsspec>=0.6.0 - gcc_linux-64=11.* - hypothesis @@ -84,7 +84,7 @@ dependencies: - s3fs>=2022.3.0 - scikit-build-core>=0.10.0 - scipy -- spdlog>=1.12.0,<1.13 +- spdlog>=1.14.1,<1.15 - sphinx - sphinx-autobuild - sphinx-copybutton diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index e54a44d9f6e..cce2e0eea84 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -32,7 +32,7 @@ dependencies: - doxygen=1.9.1 - fastavro>=0.22.9 - flatbuffers==24.3.25 -- fmt>=10.1.1,<11 +- fmt>=11.0.2,<12 - fsspec>=0.6.0 - gcc_linux-64=11.* - hypothesis @@ -82,7 +82,7 @@ dependencies: - s3fs>=2022.3.0 - scikit-build-core>=0.10.0 - scipy -- spdlog>=1.12.0,<1.13 +- spdlog>=1.14.1,<1.15 - sphinx - sphinx-autobuild - sphinx-copybutton diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml index 33fa4b4eccf..dc75eb4b252 100644 --- a/conda/recipes/libcudf/conda_build_config.yaml +++ b/conda/recipes/libcudf/conda_build_config.yaml @@ -26,13 +26,13 @@ librdkafka_version: - ">=2.5.0,<2.6.0a0" fmt_version: - - ">=10.1.1,<11" + - ">=11.0.2,<12" flatbuffers_version: - "=24.3.25" spdlog_version: - - ">=1.12.0,<1.13" + - ">=1.14.1,<1.15" nvcomp_version: - "=4.0.1" diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 7bc01e64441..84b462bb884 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ 
-378,6 +378,7 @@ add_library( src/io/csv/reader_impl.cu src/io/csv/writer_impl.cu src/io/functions.cpp + src/io/json/host_tree_algorithms.cu src/io/json/json_column.cu src/io/json/json_normalization.cu src/io/json/json_tree.cu @@ -797,7 +798,7 @@ add_dependencies(cudf jitify_preprocess_run) # Specify the target module library dependencies target_link_libraries( cudf - PUBLIC CCCL::CCCL rmm::rmm $ + PUBLIC CCCL::CCCL rmm::rmm $ spdlog::spdlog_header_only PRIVATE $ cuco::cuco ZLIB::ZLIB nvcomp::nvcomp kvikio::kvikio $ nanoarrow ) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 3bf9d02b384..4113e38dcf4 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -177,11 +177,11 @@ ConfigureBench(TRANSPOSE_BENCH transpose/transpose.cpp) # ################################################################################################## # * nds-h benchmark -------------------------------------------------------------------------------- -ConfigureNVBench(NDSH_Q1 ndsh/q01.cpp ndsh/utilities.cpp) -ConfigureNVBench(NDSH_Q5 ndsh/q05.cpp ndsh/utilities.cpp) -ConfigureNVBench(NDSH_Q6 ndsh/q06.cpp ndsh/utilities.cpp) -ConfigureNVBench(NDSH_Q9 ndsh/q09.cpp ndsh/utilities.cpp) -ConfigureNVBench(NDSH_Q10 ndsh/q10.cpp ndsh/utilities.cpp) +ConfigureNVBench(NDSH_Q01_NVBENCH ndsh/q01.cpp ndsh/utilities.cpp) +ConfigureNVBench(NDSH_Q05_NVBENCH ndsh/q05.cpp ndsh/utilities.cpp) +ConfigureNVBench(NDSH_Q06_NVBENCH ndsh/q06.cpp ndsh/utilities.cpp) +ConfigureNVBench(NDSH_Q09_NVBENCH ndsh/q09.cpp ndsh/utilities.cpp) +ConfigureNVBench(NDSH_Q10_NVBENCH ndsh/q10.cpp ndsh/utilities.cpp) # ################################################################################################## # * stream_compaction benchmark ------------------------------------------------------------------- @@ -230,6 +230,11 @@ ConfigureNVBench(STRUCT_CREATION_NVBENCH structs/create_structs.cpp) # -------------------------------------------------------------------------------- ConfigureBench(QUANTILES_BENCH quantiles/quantiles.cpp) +# ################################################################################################## +# * tdigest benchmark +# -------------------------------------------------------------------------------- +ConfigureNVBench(TDIGEST_NVBENCH quantiles/tdigest.cu) + # ################################################################################################## # * type_dispatcher benchmark --------------------------------------------------------------------- ConfigureBench(TYPE_DISPATCHER_BENCH type_dispatcher/type_dispatcher.cu) @@ -337,7 +342,7 @@ ConfigureBench(TEXT_BENCH text/ngrams.cpp text/subword.cpp) ConfigureNVBench( TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp - text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp + text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp text/word_minhash.cpp ) # ################################################################################################## diff --git a/cpp/benchmarks/common/ndsh_data_generator/table_helpers.cpp b/cpp/benchmarks/common/ndsh_data_generator/table_helpers.cpp index d4368906702..54d177df401 100644 --- a/cpp/benchmarks/common/ndsh_data_generator/table_helpers.cpp +++ b/cpp/benchmarks/common/ndsh_data_generator/table_helpers.cpp @@ -85,7 +85,7 @@ std::unique_ptr perform_left_join(cudf::table_view const& left_inpu auto const left_selected = left_input.select(left_on); auto const right_selected = 
right_input.select(right_on); auto const [left_join_indices, right_join_indices] = - cudf::left_join(left_selected, right_selected, cudf::null_equality::EQUAL, mr); + cudf::left_join(left_selected, right_selected, cudf::null_equality::EQUAL, stream, mr); auto const left_indices_span = cudf::device_span{*left_join_indices}; auto const right_indices_span = cudf::device_span{*right_join_indices}; diff --git a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp index 7563c823454..ce115fd7723 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp @@ -32,7 +32,8 @@ constexpr cudf::size_type num_cols = 64; void parquet_read_common(cudf::size_type num_rows_to_read, cudf::size_type num_cols_to_read, cuio_source_sink_pair& source_sink, - nvbench::state& state) + nvbench::state& state, + size_t table_data_size = data_size) { cudf::io::parquet_reader_options read_opts = cudf::io::parquet_reader_options::builder(source_sink.make_source_info()); @@ -52,7 +53,7 @@ void parquet_read_common(cudf::size_type num_rows_to_read, }); auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); - state.add_element_count(static_cast(data_size) / time, "bytes_per_second"); + state.add_element_count(static_cast(table_data_size) / time, "bytes_per_second"); state.add_buffer_size( mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size"); @@ -231,6 +232,70 @@ void BM_parquet_read_chunks(nvbench::state& state, nvbench::type_list +void BM_parquet_read_wide_tables(nvbench::state& state, + nvbench::type_list> type_list) +{ + auto const d_type = get_type_or_group(static_cast(DataType)); + + auto const n_col = static_cast(state.get_int64("num_cols")); + auto const data_size_bytes = static_cast(state.get_int64("data_size_mb") << 20); + auto const cardinality = static_cast(state.get_int64("cardinality")); + auto const run_length = static_cast(state.get_int64("run_length")); + auto const source_type = io_type::DEVICE_BUFFER; + cuio_source_sink_pair source_sink(source_type); + + auto const num_rows_written = [&]() { + auto const tbl = create_random_table( + cycle_dtypes(d_type, n_col), + table_size_bytes{data_size_bytes}, + data_profile_builder().cardinality(cardinality).avg_run_length(run_length)); + auto const view = tbl->view(); + + cudf::io::parquet_writer_options write_opts = + cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view) + .compression(cudf::io::compression_type::NONE); + cudf::io::write_parquet(write_opts); + return view.num_rows(); + }(); + + parquet_read_common(num_rows_written, n_col, source_sink, state, data_size_bytes); +} + +void BM_parquet_read_wide_tables_mixed(nvbench::state& state) +{ + auto const d_type = []() { + auto d_type1 = get_type_or_group(static_cast(data_type::INTEGRAL)); + auto d_type2 = get_type_or_group(static_cast(data_type::FLOAT)); + d_type1.reserve(d_type1.size() + d_type2.size()); + std::move(d_type2.begin(), d_type2.end(), std::back_inserter(d_type1)); + return d_type1; + }(); + + auto const n_col = static_cast(state.get_int64("num_cols")); + auto const data_size_bytes = static_cast(state.get_int64("data_size_mb") << 20); + auto const cardinality = static_cast(state.get_int64("cardinality")); + auto const run_length = static_cast(state.get_int64("run_length")); + auto const source_type = io_type::DEVICE_BUFFER; + 
cuio_source_sink_pair source_sink(source_type); + + auto const num_rows_written = [&]() { + auto const tbl = create_random_table( + cycle_dtypes(d_type, n_col), + table_size_bytes{data_size_bytes}, + data_profile_builder().cardinality(cardinality).avg_run_length(run_length)); + auto const view = tbl->view(); + + cudf::io::parquet_writer_options write_opts = + cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view) + .compression(cudf::io::compression_type::NONE); + cudf::io::write_parquet(write_opts); + return view.num_rows(); + }(); + + parquet_read_common(num_rows_written, n_col, source_sink, state, data_size_bytes); +} + using d_type_list = nvbench::enum_type_list; +NVBENCH_BENCH_TYPES(BM_parquet_read_wide_tables, NVBENCH_TYPE_AXES(d_type_list_wide_table)) + .set_name("parquet_read_wide_tables") + .set_min_samples(4) + .set_type_axes_names({"data_type"}) + .add_int64_axis("data_size_mb", {1024, 2048, 4096}) + .add_int64_axis("num_cols", {256, 512, 1024}) + .add_int64_axis("cardinality", {0, 1000}) + .add_int64_axis("run_length", {1, 32}); + +NVBENCH_BENCH(BM_parquet_read_wide_tables_mixed) + .set_name("parquet_read_wide_tables_mixed") + .set_min_samples(4) + .add_int64_axis("data_size_mb", {1024, 2048, 4096}) + .add_int64_axis("num_cols", {256, 512, 1024}) + .add_int64_axis("cardinality", {0, 1000}) + .add_int64_axis("run_length", {1, 32}); + // a benchmark for structs that only contain fixed-width types using d_type_list_struct_only = nvbench::enum_type_list; NVBENCH_BENCH_TYPES(BM_parquet_read_fixed_width_struct, NVBENCH_TYPE_AXES(d_type_list_struct_only)) diff --git a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp index 3abd4280081..7121cb9f034 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp @@ -50,7 +50,7 @@ std::string get_label(std::string const& test_name, nvbench::state const& state) } std::tuple, size_t, size_t> write_file_data( - nvbench::state& state, std::vector const& d_types) + nvbench::state& state, std::vector const& d_types, io_type io_source_type) { cudf::size_type const cardinality = state.get_int64("cardinality"); cudf::size_type const run_length = state.get_int64("run_length"); @@ -63,7 +63,7 @@ std::tuple, size_t, size_t> write_file_data( size_t total_file_size = 0; for (size_t i = 0; i < num_files; ++i) { - cuio_source_sink_pair source_sink{io_type::HOST_BUFFER}; + cuio_source_sink_pair source_sink{io_source_type}; auto const tbl = create_random_table( cycle_dtypes(d_types, num_cols), @@ -92,11 +92,13 @@ void BM_parquet_multithreaded_read_common(nvbench::state& state, { size_t const data_size = state.get_int64("total_data_size"); auto const num_threads = state.get_int64("num_threads"); + auto const source_type = retrieve_io_type_enum(state.get_string("io_type")); auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads); BS::thread_pool threads(num_threads); - auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types); + auto [source_sink_vector, total_file_size, num_files] = + write_file_data(state, d_types, source_type); std::vector source_info_vector; std::transform(source_sink_vector.begin(), source_sink_vector.end(), @@ -173,10 +175,12 @@ void BM_parquet_multithreaded_read_chunked_common(nvbench::state& state, auto const num_threads = state.get_int64("num_threads"); size_t const input_limit = state.get_int64("input_limit"); size_t 
const output_limit = state.get_int64("output_limit"); + auto const source_type = retrieve_io_type_enum(state.get_string("io_type")); auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads); BS::thread_pool threads(num_threads); - auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types); + auto [source_sink_vector, total_file_size, num_files] = + write_file_data(state, d_types, source_type); std::vector source_info_vector; std::transform(source_sink_vector.begin(), source_sink_vector.end(), @@ -264,7 +268,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_mixed) .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024}) .add_int64_axis("num_threads", {1, 2, 4, 8}) .add_int64_axis("num_cols", {4}) - .add_int64_axis("run_length", {8}); + .add_int64_axis("run_length", {8}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); NVBENCH_BENCH(BM_parquet_multithreaded_read_fixed_width) .set_name("parquet_multithreaded_read_decode_fixed_width") @@ -273,7 +278,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_fixed_width) .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024}) .add_int64_axis("num_threads", {1, 2, 4, 8}) .add_int64_axis("num_cols", {4}) - .add_int64_axis("run_length", {8}); + .add_int64_axis("run_length", {8}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); NVBENCH_BENCH(BM_parquet_multithreaded_read_string) .set_name("parquet_multithreaded_read_decode_string") @@ -282,7 +288,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_string) .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024}) .add_int64_axis("num_threads", {1, 2, 4, 8}) .add_int64_axis("num_cols", {4}) - .add_int64_axis("run_length", {8}); + .add_int64_axis("run_length", {8}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); NVBENCH_BENCH(BM_parquet_multithreaded_read_list) .set_name("parquet_multithreaded_read_decode_list") @@ -291,7 +298,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_list) .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024}) .add_int64_axis("num_threads", {1, 2, 4, 8}) .add_int64_axis("num_cols", {4}) - .add_int64_axis("run_length", {8}); + .add_int64_axis("run_length", {8}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); // mixed data types: fixed width, strings NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_mixed) @@ -303,7 +311,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_mixed) .add_int64_axis("num_cols", {4}) .add_int64_axis("run_length", {8}) .add_int64_axis("input_limit", {640 * 1024 * 1024}) - .add_int64_axis("output_limit", {640 * 1024 * 1024}); + .add_int64_axis("output_limit", {640 * 1024 * 1024}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_fixed_width) .set_name("parquet_multithreaded_read_decode_chunked_fixed_width") @@ -314,7 +323,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_fixed_width) .add_int64_axis("num_cols", {4}) .add_int64_axis("run_length", {8}) .add_int64_axis("input_limit", {640 * 1024 * 1024}) - .add_int64_axis("output_limit", {640 * 1024 * 1024}); + .add_int64_axis("output_limit", {640 * 1024 * 1024}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_string) .set_name("parquet_multithreaded_read_decode_chunked_string") @@ -325,7 +335,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_string) .add_int64_axis("num_cols", {4}) .add_int64_axis("run_length", {8}) .add_int64_axis("input_limit", {640 * 1024 
* 1024}) - .add_int64_axis("output_limit", {640 * 1024 * 1024}); + .add_int64_axis("output_limit", {640 * 1024 * 1024}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_list) .set_name("parquet_multithreaded_read_decode_chunked_list") @@ -336,4 +347,5 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_list) .add_int64_axis("num_cols", {4}) .add_int64_axis("run_length", {8}) .add_int64_axis("input_limit", {640 * 1024 * 1024}) - .add_int64_axis("output_limit", {640 * 1024 * 1024}); + .add_int64_axis("output_limit", {640 * 1024 * 1024}) + .add_string_axis("io_type", {"PINNED_BUFFER"}); diff --git a/cpp/benchmarks/ndsh/utilities.cpp b/cpp/benchmarks/ndsh/utilities.cpp index 2d514764fc2..62116ddf661 100644 --- a/cpp/benchmarks/ndsh/utilities.cpp +++ b/cpp/benchmarks/ndsh/utilities.cpp @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -146,11 +147,15 @@ std::unique_ptr join_and_gather(cudf::table_view const& left_input, cudf::null_equality compare_nulls) { CUDF_FUNC_RANGE(); - constexpr auto oob_policy = cudf::out_of_bounds_policy::DONT_CHECK; - auto const left_selected = left_input.select(left_on); - auto const right_selected = right_input.select(right_on); - auto const [left_join_indices, right_join_indices] = cudf::inner_join( - left_selected, right_selected, compare_nulls, cudf::get_current_device_resource_ref()); + constexpr auto oob_policy = cudf::out_of_bounds_policy::DONT_CHECK; + auto const left_selected = left_input.select(left_on); + auto const right_selected = right_input.select(right_on); + auto const [left_join_indices, right_join_indices] = + cudf::inner_join(left_selected, + right_selected, + compare_nulls, + cudf::get_default_stream(), + cudf::get_current_device_resource_ref()); auto const left_indices_span = cudf::device_span{*left_join_indices}; auto const right_indices_span = cudf::device_span{*right_join_indices}; diff --git a/cpp/benchmarks/quantiles/tdigest.cu b/cpp/benchmarks/quantiles/tdigest.cu new file mode 100644 index 00000000000..9d37dbc9a26 --- /dev/null +++ b/cpp/benchmarks/quantiles/tdigest.cu @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include + +#include + +#include +#include +#include + +#include + +void bm_tdigest_merge(nvbench::state& state) +{ + auto const num_tdigests = static_cast(state.get_int64("num_tdigests")); + auto const tdigest_size = static_cast(state.get_int64("tdigest_size")); + auto const tdigests_per_group = + static_cast(state.get_int64("tdigests_per_group")); + auto const max_centroids = static_cast(state.get_int64("max_centroids")); + auto const num_groups = num_tdigests / tdigests_per_group; + auto const total_centroids = num_tdigests * tdigest_size; + + auto stream = cudf::get_default_stream(); + auto mr = rmm::mr::get_current_device_resource(); + + constexpr int base_value = 5; + + // construct inner means/weights + auto val_iter = cudf::detail::make_counting_transform_iterator( + 0, cuda::proclaim_return_type([tdigest_size](cudf::size_type i) { + return static_cast(base_value + (i % tdigest_size)); + })); + auto one_iter = thrust::make_constant_iterator(1); + cudf::test::fixed_width_column_wrapper means(val_iter, val_iter + total_centroids); + cudf::test::fixed_width_column_wrapper weights(one_iter, one_iter + total_centroids); + std::vector> inner_struct_children; + inner_struct_children.push_back(means.release()); + inner_struct_children.push_back(weights.release()); + cudf::test::structs_column_wrapper inner_struct(std::move(inner_struct_children)); + + // construct the tdigest lists themselves + auto offset_iter = cudf::detail::make_counting_transform_iterator( + 0, cuda::proclaim_return_type([tdigest_size](cudf::size_type i) { + return i * tdigest_size; + })); + cudf::test::fixed_width_column_wrapper offsets(offset_iter, offset_iter + num_tdigests + 1); + auto list_col = cudf::make_lists_column( + num_tdigests, offsets.release(), inner_struct.release(), 0, {}, stream, mr); + + // min and max columns + auto min_iter = thrust::make_constant_iterator(base_value); + auto max_iter = thrust::make_constant_iterator(base_value + (tdigest_size - 1)); + cudf::test::fixed_width_column_wrapper mins(min_iter, min_iter + num_tdigests); + cudf::test::fixed_width_column_wrapper maxes(max_iter, max_iter + num_tdigests); + + // assemble the whole thing + std::vector> tdigest_children; + tdigest_children.push_back(std::move(list_col)); + tdigest_children.push_back(mins.release()); + tdigest_children.push_back(maxes.release()); + cudf::test::structs_column_wrapper tdigest(std::move(tdigest_children)); + + rmm::device_uvector group_offsets(num_groups + 1, stream, mr); + rmm::device_uvector group_labels(num_tdigests, stream, mr); + auto group_offset_iter = cudf::detail::make_counting_transform_iterator( + 0, + cuda::proclaim_return_type( + [tdigests_per_group] __device__(cudf::size_type i) { return i * tdigests_per_group; })); + thrust::copy(rmm::exec_policy_nosync(stream, mr), + group_offset_iter, + group_offset_iter + num_groups + 1, + group_offsets.begin()); + auto group_label_iter = cudf::detail::make_counting_transform_iterator( + 0, + cuda::proclaim_return_type( + [tdigests_per_group] __device__(cudf::size_type i) { return i / tdigests_per_group; })); + thrust::copy(rmm::exec_policy_nosync(stream, mr), + group_label_iter, + group_label_iter + num_tdigests, + group_labels.begin()); + + state.add_element_count(total_centroids); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + state.exec(nvbench::exec_tag::timer | nvbench::exec_tag::sync, + [&](nvbench::launch& launch, auto& timer) { + timer.start(); + auto result = 
cudf::tdigest::detail::group_merge_tdigest( + tdigest, group_offsets, group_labels, num_groups, max_centroids, stream, mr); + timer.stop(); + }); +} + +NVBENCH_BENCH(bm_tdigest_merge) + .set_name("TDigest many tiny groups") + .add_int64_axis("num_tdigests", {500'000}) + .add_int64_axis("tdigest_size", {1, 1000}) + .add_int64_axis("tdigests_per_group", {1}) + .add_int64_axis("max_centroids", {10000, 1000}); + +NVBENCH_BENCH(bm_tdigest_merge) + .set_name("TDigest many small groups") + .add_int64_axis("num_tdigests", {500'000}) + .add_int64_axis("tdigest_size", {1, 1000}) + .add_int64_axis("tdigests_per_group", {3}) + .add_int64_axis("max_centroids", {10000, 1000}); diff --git a/cpp/benchmarks/text/word_minhash.cpp b/cpp/benchmarks/text/word_minhash.cpp new file mode 100644 index 00000000000..adc3dddc59c --- /dev/null +++ b/cpp/benchmarks/text/word_minhash.cpp @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include + +#include + +#include + +#include + +static void bench_word_minhash(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const row_width = static_cast(state.get_int64("row_width")); + auto const seed_count = static_cast(state.get_int64("seed_count")); + auto const base64 = state.get_int64("hash_type") == 64; + + data_profile const strings_profile = + data_profile_builder().distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, 5); + auto strings_table = + create_random_table({cudf::type_id::STRING}, row_count{num_rows}, strings_profile); + + auto const num_offsets = (num_rows / row_width) + 1; + auto offsets = cudf::sequence(num_offsets, + cudf::numeric_scalar(0), + cudf::numeric_scalar(row_width)); + + auto source = cudf::make_lists_column(num_offsets - 1, + std::move(offsets), + std::move(strings_table->release().front()), + 0, + rmm::device_buffer{}); + + data_profile const seeds_profile = data_profile_builder().no_validity().distribution( + cudf::type_to_id(), distribution_id::NORMAL, 0, 256); + auto const seed_type = base64 ? cudf::type_id::UINT64 : cudf::type_id::UINT32; + auto const seeds_table = create_random_table({seed_type}, row_count{seed_count}, seeds_profile); + auto seeds = seeds_table->get_column(0); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + + cudf::strings_column_view input(cudf::lists_column_view(source->view()).child()); + auto chars_size = input.chars_size(cudf::get_default_stream()); + state.add_global_memory_reads(chars_size); + state.add_global_memory_writes(num_rows); // output are hashes + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = base64 ? 
nvtext::word_minhash64(source->view(), seeds.view()) + : nvtext::word_minhash(source->view(), seeds.view()); + }); +} + +NVBENCH_BENCH(bench_word_minhash) + .set_name("word_minhash") + .add_int64_axis("num_rows", {131072, 262144, 524288, 1048576, 2097152}) + .add_int64_axis("row_width", {10, 100, 1000}) + .add_int64_axis("seed_count", {2, 25}) + .add_int64_axis("hash_type", {32, 64}); diff --git a/cpp/cmake/thirdparty/get_spdlog.cmake b/cpp/cmake/thirdparty/get_spdlog.cmake index c0e07d02d94..90b0f4d8a8e 100644 --- a/cpp/cmake/thirdparty/get_spdlog.cmake +++ b/cpp/cmake/thirdparty/get_spdlog.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -16,21 +16,12 @@ function(find_and_configure_spdlog) include(${rapids-cmake-dir}/cpm/spdlog.cmake) - rapids_cpm_spdlog(FMT_OPTION "EXTERNAL_FMT_HO" INSTALL_EXPORT_SET cudf-exports) - rapids_export_package(BUILD spdlog cudf-exports) + rapids_cpm_spdlog( + FMT_OPTION "EXTERNAL_FMT_HO" + INSTALL_EXPORT_SET cudf-exports + BUILD_EXPORT_SET cudf-exports + ) - if(spdlog_ADDED) - rapids_export( - BUILD spdlog - EXPORT_SET spdlog - GLOBAL_TARGETS spdlog spdlog_header_only - NAMESPACE spdlog:: - ) - include("${rapids-cmake-dir}/export/find_package_root.cmake") - rapids_export_find_package_root( - BUILD spdlog [=[${CMAKE_CURRENT_LIST_DIR}]=] EXPORT_SET cudf-exports - ) - endif() endfunction() find_and_configure_spdlog() diff --git a/cpp/doxygen/regex.md b/cpp/doxygen/regex.md index 8d206f245dc..6d1c91a5752 100644 --- a/cpp/doxygen/regex.md +++ b/cpp/doxygen/regex.md @@ -17,6 +17,12 @@ The details are based on features documented at https://www.regular-expressions. **Note:** The alternation character is the pipe character `|` and not the character included in the tables on this page. There is an issue including the pipe character inside the table markdown that is rendered by doxygen. +By default, only the `\n` character is recognized as a line break. The [cudf::strings::regex_flags::EXT_NEWLINE](@ref cudf::strings::regex_flags) increases the set of line break characters to include: +- Paragraph separator (Unicode: `2029`, UTF-8: `E280A9`) +- Line separator (Unicode: `2028`, UTF-8: `E280A8`) +- Next line (Unicode: `0085`, UTF-8: `C285`) +- Carriage return (Unicode: `000D`, UTF-8: `0D`) + **Invalid regex patterns will result in undefined behavior**. This includes but is not limited to the following: - Unescaped special characters (listed in the third row of the Characters table below) when they are intended to match as literals. - Unmatched paired special characters like `()`, `[]`, and `{}`. 
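The regex.md and `cudf::strings::regex_flags` changes above document the new `EXT_NEWLINE` flag but the patch itself contains no caller-side usage. The following standalone sketch is not part of this diff; it is an illustrative example (the helper name and sample pattern are invented) of how the flag could be combined with `MULTILINE` through the existing `regex_program::create` and `contains_re` APIs, assuming the flag behaves as described in the regex.md addition.

```cpp
// Illustrative only: exercises the EXT_NEWLINE flag introduced in this change.
#include <cudf/column/column.hpp>
#include <cudf/strings/contains.hpp>
#include <cudf/strings/regex/flags.hpp>
#include <cudf/strings/regex/regex_program.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <memory>

// Returns a BOOL8 column indicating which rows contain a line ending in "abc".
std::unique_ptr<cudf::column> line_ends_with_abc(cudf::strings_column_view const& input)
{
  // MULTILINE lets '^'/'$' anchor at line breaks; EXT_NEWLINE (added by this patch)
  // extends the recognized line-break set beyond '\n' to NEL (U+0085), LS (U+2028),
  // PS (U+2029), and CR (U+000D). regex_flags is a plain enum, so combine via a cast.
  auto const flags = static_cast<cudf::strings::regex_flags>(
    cudf::strings::regex_flags::MULTILINE | cudf::strings::regex_flags::EXT_NEWLINE);
  auto const prog = cudf::strings::regex_program::create("abc$", flags);
  return cudf::strings::contains_re(input, *prog);
}
```

Given the documented semantics, a row such as "abc\xC2\x85def" (NEL-separated) would not match "abc$" under MULTILINE alone, since only `\n` is treated as a line break by default, but would match once EXT_NEWLINE is also set.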
diff --git a/cpp/examples/parquet_io/parquet_io.cpp b/cpp/examples/parquet_io/parquet_io.cpp index 442731694fa..9cda22d0695 100644 --- a/cpp/examples/parquet_io/parquet_io.cpp +++ b/cpp/examples/parquet_io/parquet_io.cpp @@ -18,6 +18,8 @@ #include "../utilities/timer.hpp" +#include + /** * @file parquet_io.cpp * @brief Demonstrates usage of the libcudf APIs to read and write @@ -159,8 +161,11 @@ int main(int argc, char const** argv) // Left anti-join the original and transcoded tables // identical tables should not throw an exception and // return an empty indices vector - auto const indices = cudf::left_anti_join( - input->view(), transcoded_input->view(), cudf::null_equality::EQUAL, resource.get()); + auto const indices = cudf::left_anti_join(input->view(), + transcoded_input->view(), + cudf::null_equality::EQUAL, + cudf::get_default_stream(), + resource.get()); // No exception thrown, check indices auto const valid = indices->size() == 0; diff --git a/cpp/include/cudf/datetime.hpp b/cpp/include/cudf/datetime.hpp index c7523c80b2b..7359a0d5fde 100644 --- a/cpp/include/cudf/datetime.hpp +++ b/cpp/include/cudf/datetime.hpp @@ -17,9 +17,12 @@ #pragma once #include +#include #include #include +#include + #include /** @@ -40,6 +43,7 @@ namespace datetime { * cudf::column. * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t years @@ -47,6 +51,7 @@ namespace datetime { */ std::unique_ptr extract_year( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -54,6 +59,7 @@ std::unique_ptr extract_year( * cudf::column. * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t months @@ -61,6 +67,7 @@ std::unique_ptr extract_year( */ std::unique_ptr extract_month( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -68,6 +75,7 @@ std::unique_ptr extract_month( * cudf::column. * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t days @@ -75,6 +83,7 @@ std::unique_ptr extract_month( */ std::unique_ptr extract_day( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -82,6 +91,7 @@ std::unique_ptr extract_day( * cudf::column. 
* * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t days @@ -89,6 +99,7 @@ std::unique_ptr extract_day( */ std::unique_ptr extract_weekday( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -96,6 +107,7 @@ std::unique_ptr extract_weekday( * cudf::column. * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t hours @@ -103,6 +115,7 @@ std::unique_ptr extract_weekday( */ std::unique_ptr extract_hour( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -110,6 +123,7 @@ std::unique_ptr extract_hour( * cudf::column. * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t minutes @@ -117,6 +131,7 @@ std::unique_ptr extract_hour( */ std::unique_ptr extract_minute( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -124,6 +139,7 @@ std::unique_ptr extract_minute( * cudf::column. * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t seconds @@ -131,6 +147,7 @@ std::unique_ptr extract_minute( */ std::unique_ptr extract_second( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -141,6 +158,7 @@ std::unique_ptr extract_second( * For example, the millisecond fraction of 1.234567890 seconds is 234. * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t milliseconds @@ -148,6 +166,7 @@ std::unique_ptr extract_second( */ std::unique_ptr extract_millisecond_fraction( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -158,6 +177,7 @@ std::unique_ptr extract_millisecond_fraction( * For example, the microsecond fraction of 1.234567890 seconds is 567. 
* * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t microseconds @@ -165,6 +185,7 @@ std::unique_ptr extract_millisecond_fraction( */ std::unique_ptr extract_microsecond_fraction( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -175,6 +196,7 @@ std::unique_ptr extract_microsecond_fraction( * For example, the nanosecond fraction of 1.234567890 seconds is 890. * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of the extracted int16_t nanoseconds @@ -182,6 +204,7 @@ std::unique_ptr extract_microsecond_fraction( */ std::unique_ptr extract_nanosecond_fraction( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @} */ // end of group @@ -196,6 +219,7 @@ std::unique_ptr extract_nanosecond_fraction( * cudf::column. * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column containing last day of the month as TIMESTAMP_DAYS @@ -203,6 +227,7 @@ std::unique_ptr extract_nanosecond_fraction( */ std::unique_ptr last_day_of_month( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -210,6 +235,7 @@ std::unique_ptr last_day_of_month( * returns an int16_t cudf::column. 
The value is between [1, {365-366}] * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of datatype INT16 containing the day number since the start of the year @@ -217,6 +243,7 @@ std::unique_ptr last_day_of_month( */ std::unique_ptr day_of_year( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -245,6 +272,7 @@ std::unique_ptr day_of_year( * * @param timestamps cudf::column_view of timestamp type * @param months cudf::column_view of integer type containing the number of months to add + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of timestamp type containing the computed timestamps @@ -252,6 +280,7 @@ std::unique_ptr day_of_year( std::unique_ptr add_calendrical_months( cudf::column_view const& timestamps, cudf::column_view const& months, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -280,6 +309,7 @@ std::unique_ptr add_calendrical_months( * * @param timestamps cudf::column_view of timestamp type * @param months cudf::scalar of integer type containing the number of months to add + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @return cudf::column of timestamp type containing the computed timestamps @@ -287,6 +317,7 @@ std::unique_ptr add_calendrical_months( std::unique_ptr add_calendrical_months( cudf::column_view const& timestamps, cudf::scalar const& months, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -297,6 +328,7 @@ std::unique_ptr add_calendrical_months( * `output[i] is null` if `column[i]` is null * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @returns cudf::column of datatype BOOL8 truth value of the corresponding date @@ -304,6 +336,7 @@ std::unique_ptr add_calendrical_months( */ std::unique_ptr is_leap_year( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -315,11 +348,13 @@ std::unique_ptr is_leap_year( * @throw cudf::logic_error if input column datatype is not a TIMESTAMP * * @param column cudf::column_view of the input datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * @return cudf::column of datatype INT16 of days in month of the corresponding date */ std::unique_ptr days_in_month( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -331,11 +366,13 @@ std::unique_ptr days_in_month( * 
@throw cudf::logic_error if input column datatype is not a TIMESTAMP * * @param column The input column containing datetime values + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * @return A column of INT16 type indicating which quarter the date is in */ std::unique_ptr extract_quarter( cudf::column_view const& column, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -357,6 +394,7 @@ enum class rounding_frequency : int32_t { * * @param column cudf::column_view of the input datetime values * @param freq rounding_frequency indicating the frequency to round up to + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @throw cudf::logic_error if input column datatype is not TIMESTAMP. @@ -365,6 +403,7 @@ enum class rounding_frequency : int32_t { std::unique_ptr ceil_datetimes( cudf::column_view const& column, rounding_frequency freq, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -372,6 +411,7 @@ std::unique_ptr ceil_datetimes( * * @param column cudf::column_view of the input datetime values * @param freq rounding_frequency indicating the frequency to round down to + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @throw cudf::logic_error if input column datatype is not TIMESTAMP. @@ -380,6 +420,7 @@ std::unique_ptr ceil_datetimes( std::unique_ptr floor_datetimes( cudf::column_view const& column, rounding_frequency freq, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -387,6 +428,7 @@ std::unique_ptr floor_datetimes( * * @param column cudf::column_view of the input datetime values * @param freq rounding_frequency indicating the frequency to round to + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column * * @throw cudf::logic_error if input column datatype is not TIMESTAMP. @@ -395,6 +437,7 @@ std::unique_ptr floor_datetimes( std::unique_ptr round_datetimes( cudf::column_view const& column, rounding_frequency freq, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @} */ // end of group diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index b257eef1e9e..4255faea702 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -1497,8 +1497,7 @@ AGG_KIND_MAPPING(aggregation::VARIANCE, var_aggregation); * * @tparam F Type of callable * @param k The `aggregation::Kind` value to dispatch - * aram f The callable that accepts an `aggregation::Kind` non-type template - * argument. + * @param f The callable that accepts an `aggregation::Kind` callable function object. * @param args Parameter pack forwarded to the `operator()` invocation * @return Forwards the return value of the callable. 
*/ @@ -1626,6 +1625,7 @@ struct dispatch_source { * parameter of the callable `F` * @param k The `aggregation::Kind` used to dispatch an `aggregation::Kind` * non-type template parameter for the second template parameter of the callable + * @param f The callable that accepts `data_type` and `aggregation::Kind` function object. * @param args Parameter pack forwarded to the `operator()` invocation * `F`. */ @@ -1644,8 +1644,8 @@ CUDF_HOST_DEVICE inline constexpr decltype(auto) dispatch_type_and_aggregation(d * @brief Returns the target `data_type` for the specified aggregation k * performed on elements of type source_type. * - * aram source_type The element type to be aggregated - * aram k The aggregation + * @param source_type The element type to be aggregated + * @param k The aggregation kind * @return data_type The target_type of k performed on source_type * elements */ diff --git a/cpp/include/cudf/detail/datetime.hpp b/cpp/include/cudf/detail/datetime.hpp index 31782cbaf8a..9db7e48498f 100644 --- a/cpp/include/cudf/detail/datetime.hpp +++ b/cpp/include/cudf/detail/datetime.hpp @@ -26,111 +26,108 @@ namespace CUDF_EXPORT cudf { namespace datetime { namespace detail { /** - * @copydoc cudf::extract_year(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::extract_year(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_year(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_month(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::extract_month(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_month(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_day(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::extract_day(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_day(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_weekday(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::extract_weekday(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_weekday(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_hour(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::extract_hour(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ std::unique_ptr extract_hour(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_minute(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::extract_minute(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_minute(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_second(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::extract_second(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_second(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_millisecond_fraction(cudf::column_view const&, + * @copydoc cudf::extract_millisecond_fraction(cudf::column_view const&, rmm::cuda_stream_view, * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_millisecond_fraction(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_microsecond_fraction(cudf::column_view const&, + * @copydoc cudf::extract_microsecond_fraction(cudf::column_view const&, rmm::cuda_stream_view, * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_microsecond_fraction(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::extract_nanosecond_fraction(cudf::column_view const&, + * @copydoc cudf::extract_nanosecond_fraction(cudf::column_view const&, rmm::cuda_stream_view, * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr extract_nanosecond_fraction(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::last_day_of_month(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::last_day_of_month(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr last_day_of_month(cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); /** - * @copydoc cudf::day_of_year(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::day_of_year(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr day_of_year(cudf::column_view const& column, rmm::cuda_stream_view stream, @@ -138,9 +135,8 @@ std::unique_ptr day_of_year(cudf::column_view const& column, /** * @copydoc cudf::add_calendrical_months(cudf::column_view const&, cudf::column_view const&, - * rmm::device_async_resource_ref) + * rmm::cuda_stream_view, rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ std::unique_ptr add_calendrical_months(cudf::column_view const& timestamps, cudf::column_view const& months, @@ -149,9 +145,8 @@ std::unique_ptr add_calendrical_months(cudf::column_view const& ti /** * @copydoc cudf::add_calendrical_months(cudf::column_view const&, cudf::scalar const&, - * rmm::device_async_resource_ref) + * rmm::cuda_stream_view, rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr add_calendrical_months(cudf::column_view const& timestamps, cudf::scalar const& months, @@ -159,9 +154,9 @@ std::unique_ptr add_calendrical_months(cudf::column_view const& ti rmm::device_async_resource_ref mr); /** - * @copydoc cudf::is_leap_year(cudf::column_view const&, rmm::device_async_resource_ref) + * @copydoc cudf::is_leap_year(cudf::column_view const&, rmm::cuda_stream_view, + * rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr is_leap_year(cudf::column_view const& column, rmm::cuda_stream_view stream, diff --git a/cpp/include/cudf/detail/tdigest/tdigest.hpp b/cpp/include/cudf/detail/tdigest/tdigest.hpp index 672b95e2d01..80a4460023f 100644 --- a/cpp/include/cudf/detail/tdigest/tdigest.hpp +++ b/cpp/include/cudf/detail/tdigest/tdigest.hpp @@ -143,29 +143,28 @@ std::unique_ptr make_tdigest_column(size_type num_rows, rmm::device_async_resource_ref mr); /** - * @brief Create a tdigest column of empty clusters. + * @brief Create an empty tdigest column. * - * The column created contains the specified number of rows of empty clusters. + * An empty tdigest column contains a single row of length 0 * * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * - * @returns A tdigest column of empty clusters. + * @returns An empty tdigest column. */ CUDF_EXPORT -std::unique_ptr make_tdigest_column_of_empty_clusters(size_type num_rows, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); +std::unique_ptr make_empty_tdigest_column(rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); /** - * @brief Create a scalar of an empty tdigest cluster. + * @brief Create an empty tdigest scalar. * - * The returned scalar is a struct_scalar that contains a single row of an empty cluster. + * An empty tdigest scalar is a struct_scalar that contains a single row of length 0 * * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * - * @returns A scalar of an empty tdigest cluster. + * @returns An empty tdigest scalar. */ std::unique_ptr make_empty_tdigest_scalar(rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); diff --git a/cpp/include/cudf/detail/timezone.hpp b/cpp/include/cudf/detail/timezone.hpp index 5738f9ec8e9..f51d1ba42b2 100644 --- a/cpp/include/cudf/detail/timezone.hpp +++ b/cpp/include/cudf/detail/timezone.hpp @@ -16,6 +16,7 @@ #pragma once #include +#include #include #include @@ -26,14 +27,13 @@ namespace detail { /** * @copydoc cudf::make_timezone_transition_table(std::optional, std::string_view, - * rmm::device_async_resource_ref) + * rmm::cuda_stream_view, rmm::device_async_resource_ref) * - * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ std::unique_ptr make_timezone_transition_table( std::optional tzif_dir, std::string_view timezone_name, - rmm::cuda_stream_view stream, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); } // namespace detail diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index 73ff17b2b93..940d03cdb41 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -69,11 +69,21 @@ void normalize_single_quotes(datasource::owning_buffer& inda * @brief Normalize unquoted whitespace (space and tab characters) using FST * * @param indata Input device buffer + * @param col_offsets Offsets to column contents in input buffer + * @param col_lengths Length of contents of each row in column * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation + * + * @returns Tuple of the normalized column, offsets to each row in column, and lengths of contents + * of each row */ -void normalize_whitespace(datasource::owning_buffer& indata, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); +std:: + tuple, rmm::device_uvector, rmm::device_uvector> + normalize_whitespace(device_span d_input, + device_span col_offsets, + device_span col_lengths, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + } // namespace io::json::detail } // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index ed7b2ac0850..ee03a382bec 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -39,8 +39,9 @@ namespace io { * @file */ -constexpr size_t default_row_group_size_bytes = 128 * 1024 * 1024; ///< 128MB per row group -constexpr size_type default_row_group_size_rows = 1000000; ///< 1 million rows per row group +constexpr size_t default_row_group_size_bytes = + std::numeric_limits::max(); ///< Infinite bytes per row group +constexpr size_type default_row_group_size_rows = 1'000'000; ///< 1 million rows per row group constexpr size_t default_max_page_size_bytes = 512 * 1024; ///< 512KB per page constexpr size_type default_max_page_size_rows = 20000; ///< 20k rows per page constexpr int32_t default_column_index_truncate_length = 64; ///< truncate to 64 bytes diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index cc8912cb022..a590eb27511 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -97,6 +97,7 @@ class distinct_hash_join; * @param[in] right_keys The right table * @param[in] compare_nulls controls whether null join-key values * should match or not. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -108,6 +109,7 @@ std::pair>, inner_join(cudf::table_view const& left_keys, cudf::table_view const& right_keys, null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -137,6 +139,7 @@ inner_join(cudf::table_view const& left_keys, * @param[in] right_keys The right table * @param[in] compare_nulls controls whether null join-key values * should match or not. 
+ * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -148,6 +151,7 @@ std::pair>, left_join(cudf::table_view const& left_keys, cudf::table_view const& right_keys, null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -176,6 +180,7 @@ left_join(cudf::table_view const& left_keys, * @param[in] right_keys The right table * @param[in] compare_nulls controls whether null join-key values * should match or not. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -187,6 +192,7 @@ std::pair>, full_join(cudf::table_view const& left_keys, cudf::table_view const& right_keys, null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -205,6 +211,7 @@ full_join(cudf::table_view const& left_keys, * @param left_keys The left table * @param right_keys The right table * @param compare_nulls Controls whether null join-key values should match or not + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A vector `left_indices` that can be used to construct @@ -215,6 +222,7 @@ std::unique_ptr> left_semi_join( cudf::table_view const& left_keys, cudf::table_view const& right_keys, null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -236,6 +244,7 @@ std::unique_ptr> left_semi_join( * @param[in] right_keys The right table * @param[in] compare_nulls controls whether null join-key values * should match or not. 
+ * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A column `left_indices` that can be used to construct @@ -246,6 +255,7 @@ std::unique_ptr> left_anti_join( cudf::table_view const& left_keys, cudf::table_view const& right_keys, null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -266,6 +276,7 @@ std::unique_ptr> left_anti_join( * * @param left The left table * @param right The right table + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory * * @return Result of cross joining `left` and `right` tables @@ -273,6 +284,7 @@ std::unique_ptr> left_anti_join( std::unique_ptr cross_join( cudf::table_view const& left, cudf::table_view const& right, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -567,6 +579,7 @@ class distinct_hash_join { * @param right The right table * @param binary_predicate The condition on which to join * @param output_size Optional value which allows users to specify the exact output size + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -578,6 +591,7 @@ conditional_inner_join(table_view const& left, table_view const& right, ast::expression const& binary_predicate, std::optional output_size = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -612,6 +626,7 @@ conditional_inner_join(table_view const& left, * @param right The right table * @param binary_predicate The condition on which to join * @param output_size Optional value which allows users to specify the exact output size + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -623,6 +638,7 @@ conditional_left_join(table_view const& left, table_view const& right, ast::expression const& binary_predicate, std::optional output_size = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -655,6 +671,7 @@ conditional_left_join(table_view const& left, * @param left The left table * @param right The right table * @param binary_predicate The condition on which to join + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -665,6 +682,7 @@ std::pair>, conditional_full_join(table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = 
cudf::get_current_device_resource_ref()); /** @@ -693,6 +711,7 @@ conditional_full_join(table_view const& left, * @param right The right table * @param binary_predicate The condition on which to join * @param output_size Optional value which allows users to specify the exact output size + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A vector `left_indices` that can be used to construct the result of @@ -704,6 +723,7 @@ std::unique_ptr> conditional_left_semi_join( table_view const& right, ast::expression const& binary_predicate, std::optional output_size = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -732,6 +752,7 @@ std::unique_ptr> conditional_left_semi_join( * @param right The right table * @param binary_predicate The condition on which to join * @param output_size Optional value which allows users to specify the exact output size + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A vector `left_indices` that can be used to construct the result of @@ -743,6 +764,7 @@ std::unique_ptr> conditional_left_anti_join( table_view const& right, ast::expression const& binary_predicate, std::optional output_size = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -786,6 +808,7 @@ std::unique_ptr> conditional_left_anti_join( * @param output_size_data An optional pair of values indicating the exact output size and the * number of matches for each row in the larger of the two input tables, left or right (may be * precomputed using the corresponding mixed_inner_join_size API). + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -801,6 +824,7 @@ mixed_inner_join( ast::expression const& binary_predicate, null_equality compare_nulls = null_equality::EQUAL, std::optional>> output_size_data = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -846,6 +870,7 @@ mixed_inner_join( * @param output_size_data An optional pair of values indicating the exact output size and the * number of matches for each row in the larger of the two input tables, left or right (may be * precomputed using the corresponding mixed_left_join_size API). 
+ * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -861,6 +886,7 @@ mixed_left_join( ast::expression const& binary_predicate, null_equality compare_nulls = null_equality::EQUAL, std::optional>> output_size_data = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -906,6 +932,7 @@ mixed_left_join( * @param output_size_data An optional pair of values indicating the exact output size and the * number of matches for each row in the larger of the two input tables, left or right (may be * precomputed using the corresponding mixed_full_join_size API). + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -921,6 +948,7 @@ mixed_full_join( ast::expression const& binary_predicate, null_equality compare_nulls = null_equality::EQUAL, std::optional>> output_size_data = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -956,6 +984,7 @@ mixed_full_join( * @param right_conditional The right table used for the conditional join * @param binary_predicate The condition on which to join * @param compare_nulls Whether or not null values join to each other or not + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -968,6 +997,7 @@ std::unique_ptr> mixed_left_semi_join( table_view const& right_conditional, ast::expression const& binary_predicate, null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -1004,6 +1034,7 @@ std::unique_ptr> mixed_left_semi_join( * @param right_conditional The right table used for the conditional join * @param binary_predicate The condition on which to join * @param compare_nulls Whether or not null values join to each other or not + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct @@ -1016,6 +1047,7 @@ std::unique_ptr> mixed_left_anti_join( table_view const& right_conditional, ast::expression const& binary_predicate, null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -1041,6 +1073,7 @@ std::unique_ptr> mixed_left_anti_join( * @param right_conditional The right table used for the conditional join * @param binary_predicate The condition on which to join * @param compare_nulls Whether or not null values join to each other or not + * @param stream CUDA stream used for device memory operations and kernel launches * @param 
mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair containing the size that would result from performing the @@ -1056,6 +1089,7 @@ std::pair>> mixed_in table_view const& right_conditional, ast::expression const& binary_predicate, null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -1081,6 +1115,7 @@ std::pair>> mixed_in * @param right_conditional The right table used for the conditional join * @param binary_predicate The condition on which to join * @param compare_nulls Whether or not null values join to each other or not + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return A pair containing the size that would result from performing the @@ -1096,6 +1131,7 @@ std::pair>> mixed_le table_view const& right_conditional, ast::expression const& binary_predicate, null_equality compare_nulls = null_equality::EQUAL, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -1111,6 +1147,7 @@ std::pair>> mixed_le * @param left The left table * @param right The right table * @param binary_predicate The condition on which to join + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return The size that would result from performing the requested join @@ -1119,6 +1156,7 @@ std::size_t conditional_inner_join_size( table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -1134,6 +1172,7 @@ std::size_t conditional_inner_join_size( * @param left The left table * @param right The right table * @param binary_predicate The condition on which to join + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return The size that would result from performing the requested join @@ -1142,6 +1181,7 @@ std::size_t conditional_left_join_size( table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -1157,6 +1197,7 @@ std::size_t conditional_left_join_size( * @param left The left table * @param right The right table * @param binary_predicate The condition on which to join + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return The size that would result from performing the requested join @@ -1165,6 +1206,7 @@ std::size_t conditional_left_semi_join_size( table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @@ -1180,6 +1222,7 @@ std::size_t conditional_left_semi_join_size( * 
@param left The left table * @param right The right table * @param binary_predicate The condition on which to join + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device memory * * @return The size that would result from performing the requested join @@ -1188,6 +1231,7 @@ std::size_t conditional_left_anti_join_size( table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @} */ // end of group } // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/strings/regex/flags.hpp b/cpp/include/cudf/strings/regex/flags.hpp index f7108129dee..4f3fc7086f2 100644 --- a/cpp/include/cudf/strings/regex/flags.hpp +++ b/cpp/include/cudf/strings/regex/flags.hpp @@ -35,10 +35,11 @@ namespace strings { * and to match the Python flag values. */ enum regex_flags : uint32_t { - DEFAULT = 0, ///< default - MULTILINE = 8, ///< the '^' and '$' honor new-line characters - DOTALL = 16, ///< the '.' matching includes new-line characters - ASCII = 256 ///< use only ASCII when matching built-in character classes + DEFAULT = 0, ///< default + MULTILINE = 8, ///< the '^' and '$' honor new-line characters + DOTALL = 16, ///< the '.' matching includes new-line characters + ASCII = 256, ///< use only ASCII when matching built-in character classes + EXT_NEWLINE = 512 ///< new-line matches extended characters }; /** @@ -74,6 +75,17 @@ constexpr bool is_ascii(regex_flags const f) return (f & regex_flags::ASCII) == regex_flags::ASCII; } +/** + * @brief Returns true if the given flags contain EXT_NEWLINE + * + * @param f Regex flags to check + * @return true if `f` includes EXT_NEWLINE + */ +constexpr bool is_ext_newline(regex_flags const f) +{ + return (f & regex_flags::EXT_NEWLINE) == regex_flags::EXT_NEWLINE; +} + /** * @brief Capture groups setting * diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index abb26d7ccb4..14695c3bb27 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -191,9 +191,14 @@ __device__ inline string_view::const_iterator& string_view::const_iterator::oper __device__ inline string_view::const_iterator& string_view::const_iterator::operator--() { - if (byte_pos > 0) - while (strings::detail::bytes_in_utf8_byte(static_cast(p[--byte_pos])) == 0) - ; + if (byte_pos > 0) { + if (byte_pos == char_pos) { + --byte_pos; + } else { + while (strings::detail::bytes_in_utf8_byte(static_cast(p[--byte_pos])) == 0) + ; + } + } --char_pos; return *this; } diff --git a/cpp/include/cudf/timezone.hpp b/cpp/include/cudf/timezone.hpp index aa903770e26..f6de1056c24 100644 --- a/cpp/include/cudf/timezone.hpp +++ b/cpp/include/cudf/timezone.hpp @@ -15,9 +15,12 @@ */ #pragma once +#include #include #include +#include + #include #include #include @@ -43,6 +46,7 @@ static constexpr uint32_t solar_cycle_entry_count = 2 * solar_cycle_years; * * @param tzif_dir The directory where the TZif files are located * @param timezone_name standard timezone name (for example, "America/Los_Angeles") + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table's device memory. 
* * @return The transition table for the given timezone @@ -50,6 +54,7 @@ static constexpr uint32_t solar_cycle_entry_count = 2 * solar_cycle_years; std::unique_ptr<table>
make_timezone_transition_table( std::optional tzif_dir, std::string_view timezone_name, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); } // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf_test/tdigest_utilities.cuh b/cpp/include/cudf_test/tdigest_utilities.cuh index be7d19b2227..1758790cd64 100644 --- a/cpp/include/cudf_test/tdigest_utilities.cuh +++ b/cpp/include/cudf_test/tdigest_utilities.cuh @@ -270,8 +270,8 @@ void tdigest_simple_all_nulls_aggregation(Func op) static_cast(values).type(), tdigest_gen{}, op, values, delta); // NOTE: an empty tdigest column still has 1 row. - auto expected = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters( - 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto expected = cudf::tdigest::detail::make_empty_tdigest_column( + cudf::get_default_stream(), cudf::get_current_device_resource_ref()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, *expected); } @@ -562,12 +562,12 @@ template void tdigest_merge_empty(MergeFunc merge_op) { // 3 empty tdigests all in the same group - auto a = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters( - 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); - auto b = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters( - 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); - auto c = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters( - 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto a = cudf::tdigest::detail::make_empty_tdigest_column( + cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto b = cudf::tdigest::detail::make_empty_tdigest_column( + cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto c = cudf::tdigest::detail::make_empty_tdigest_column( + cudf::get_default_stream(), cudf::get_current_device_resource_ref()); std::vector cols; cols.push_back(*a); cols.push_back(*b); @@ -577,8 +577,8 @@ void tdigest_merge_empty(MergeFunc merge_op) auto const delta = 1000; auto result = merge_op(*values, delta); - auto expected = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters( - 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto expected = cudf::tdigest::detail::make_empty_tdigest_column( + cudf::get_default_stream(), cudf::get_current_device_resource_ref()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *result); } diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp index c83a4260c19..7c909f1a948 100644 --- a/cpp/include/nvtext/minhash.hpp +++ b/cpp/include/nvtext/minhash.hpp @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -72,7 +73,7 @@ std::unique_ptr minhash( * * @throw std::invalid_argument if the width < 2 * @throw std::invalid_argument if seeds is empty - * @throw std::overflow_error if `seeds * input.size()` exceeds the column size limit + * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit * * @param input Strings column to compute minhash * @param seeds Seed values used for the hash algorithm @@ -133,7 +134,7 @@ std::unique_ptr minhash64( * * @throw std::invalid_argument if the width < 2 * @throw std::invalid_argument if seeds is empty - * @throw std::overflow_error if `seeds * input.size()` exceeds the column size limit + * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit * * 
@param input Strings column to compute minhash * @param seeds Seed values used for the hash algorithm @@ -150,5 +151,61 @@ std::unique_ptr minhash64( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); +/** + * @brief Returns the minhash values for each row of strings per seed + * + * Hash values are computed from each string in each row and the + * minimum hash value is returned for each row for each seed. + * Each row of the output list column contains the seed results for the corresponding + * input row. The order of the elements in each row matches the order of + * the seeds provided in the `seeds` parameter. + * + * This function uses MurmurHash3_x86_32 for the hash algorithm. + * + * Any null row entries result in corresponding null output rows. + * + * @throw std::invalid_argument if seeds is empty + * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit + * + * @param input Lists column of strings to compute minhash + * @param seeds Seed values used for the hash algorithm + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return List column of minhash values for each string per seed + */ +std::unique_ptr word_minhash( + cudf::lists_column_view const& input, + cudf::device_span seeds, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + +/** + * @brief Returns the minhash values for each row of strings per seed + * + * Hash values are computed from each string in each row and the + * minimum hash value is returned for each row for each seed. + * Each row of the output list column contains the seed results for the corresponding + * input row. The order of the elements in each row matches the order of + * the seeds provided in the `seeds` parameter. + * + * This function uses MurmurHash3_x64_128 for the hash algorithm, though + * only the first 64 bits of the hash are used in computing the output. + * + * Any null row entries result in corresponding null output rows.
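+ *
+ * A minimal illustrative call, assuming `input_column` is a LIST column of
+ * STRING rows and `seeds` is a cudf::device_span<uint64_t const> already
+ * resident in device memory (both names are illustrative placeholders):
+ * @code
+ * auto const input_lists = cudf::lists_column_view(input_column);
+ * auto const results     = nvtext::word_minhash64(input_lists, seeds);
+ * // `results` holds a LIST column with seeds.size() minhash values per input row
+ * @endcode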
+ * + * @throw std::invalid_argument if seeds is empty + * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit + * + * @param input Lists column of strings to compute minhash + * @param seeds Seed values used for the hash algorithm + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return List column of minhash values for each string per seed + */ +std::unique_ptr word_minhash64( + cudf::lists_column_view const& input, + cudf::device_span seeds, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** @} */ // end of group } // namespace CUDF_EXPORT nvtext diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu index fd9a6b8f5fe..ddb0dbcd96d 100644 --- a/cpp/src/datetime/datetime_ops.cu +++ b/cpp/src/datetime/datetime_ops.cu @@ -580,142 +580,167 @@ std::unique_ptr extract_quarter(column_view const& column, std::unique_ptr ceil_datetimes(column_view const& column, rounding_frequency freq, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::round_general( - detail::rounding_function::CEIL, freq, column, cudf::get_default_stream(), mr); + return detail::round_general(detail::rounding_function::CEIL, freq, column, stream, mr); } std::unique_ptr floor_datetimes(column_view const& column, rounding_frequency freq, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::round_general( - detail::rounding_function::FLOOR, freq, column, cudf::get_default_stream(), mr); + return detail::round_general(detail::rounding_function::FLOOR, freq, column, stream, mr); } std::unique_ptr round_datetimes(column_view const& column, rounding_frequency freq, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::round_general( - detail::rounding_function::ROUND, freq, column, cudf::get_default_stream(), mr); + return detail::round_general(detail::rounding_function::ROUND, freq, column, stream, mr); } -std::unique_ptr extract_year(column_view const& column, rmm::device_async_resource_ref mr) +std::unique_ptr extract_year(column_view const& column, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::extract_year(column, cudf::get_default_stream(), mr); + return detail::extract_year(column, stream, mr); } -std::unique_ptr extract_month(column_view const& column, rmm::device_async_resource_ref mr) +std::unique_ptr extract_month(column_view const& column, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::extract_month(column, cudf::get_default_stream(), mr); + return detail::extract_month(column, stream, mr); } -std::unique_ptr extract_day(column_view const& column, rmm::device_async_resource_ref mr) +std::unique_ptr extract_day(column_view const& column, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::extract_day(column, cudf::get_default_stream(), mr); + return detail::extract_day(column, stream, mr); } std::unique_ptr extract_weekday(column_view const& column, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::extract_weekday(column, cudf::get_default_stream(), mr); + 
return detail::extract_weekday(column, stream, mr); } -std::unique_ptr extract_hour(column_view const& column, rmm::device_async_resource_ref mr) +std::unique_ptr extract_hour(column_view const& column, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::extract_hour(column, cudf::get_default_stream(), mr); + return detail::extract_hour(column, stream, mr); } -std::unique_ptr extract_minute(column_view const& column, rmm::device_async_resource_ref mr) +std::unique_ptr extract_minute(column_view const& column, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::extract_minute(column, cudf::get_default_stream(), mr); + return detail::extract_minute(column, stream, mr); } -std::unique_ptr extract_second(column_view const& column, rmm::device_async_resource_ref mr) +std::unique_ptr extract_second(column_view const& column, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::extract_second(column, cudf::get_default_stream(), mr); + return detail::extract_second(column, stream, mr); } std::unique_ptr extract_millisecond_fraction(column_view const& column, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::extract_millisecond_fraction(column, cudf::get_default_stream(), mr); + return detail::extract_millisecond_fraction(column, stream, mr); } std::unique_ptr extract_microsecond_fraction(column_view const& column, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::extract_microsecond_fraction(column, cudf::get_default_stream(), mr); + return detail::extract_microsecond_fraction(column, stream, mr); } std::unique_ptr extract_nanosecond_fraction(column_view const& column, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::extract_nanosecond_fraction(column, cudf::get_default_stream(), mr); + return detail::extract_nanosecond_fraction(column, stream, mr); } std::unique_ptr last_day_of_month(column_view const& column, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::last_day_of_month(column, cudf::get_default_stream(), mr); + return detail::last_day_of_month(column, stream, mr); } -std::unique_ptr day_of_year(column_view const& column, rmm::device_async_resource_ref mr) +std::unique_ptr day_of_year(column_view const& column, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::day_of_year(column, cudf::get_default_stream(), mr); + return detail::day_of_year(column, stream, mr); } std::unique_ptr add_calendrical_months(cudf::column_view const& timestamp_column, cudf::column_view const& months_column, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::add_calendrical_months( - timestamp_column, months_column, cudf::get_default_stream(), mr); + return detail::add_calendrical_months(timestamp_column, months_column, stream, mr); } std::unique_ptr add_calendrical_months(cudf::column_view const& timestamp_column, cudf::scalar const& months, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::add_calendrical_months(timestamp_column, months, cudf::get_default_stream(), mr); + return detail::add_calendrical_months(timestamp_column, months, stream, mr); } -std::unique_ptr 
is_leap_year(column_view const& column, rmm::device_async_resource_ref mr) +std::unique_ptr is_leap_year(column_view const& column, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::is_leap_year(column, cudf::get_default_stream(), mr); + return detail::is_leap_year(column, stream, mr); } -std::unique_ptr days_in_month(column_view const& column, rmm::device_async_resource_ref mr) +std::unique_ptr days_in_month(column_view const& column, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::days_in_month(column, cudf::get_default_stream(), mr); + return detail::days_in_month(column, stream, mr); } std::unique_ptr extract_quarter(column_view const& column, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::extract_quarter(column, cudf::get_default_stream(), mr); + return detail::extract_quarter(column, stream, mr); } } // namespace datetime diff --git a/cpp/src/datetime/timezone.cpp b/cpp/src/datetime/timezone.cpp index 6498a5e6c55..cf239297255 100644 --- a/cpp/src/datetime/timezone.cpp +++ b/cpp/src/datetime/timezone.cpp @@ -380,11 +380,11 @@ static int64_t get_transition_time(dst_transition_s const& trans, int year) std::unique_ptr
make_timezone_transition_table(std::optional tzif_dir, std::string_view timezone_name, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::make_timezone_transition_table( - tzif_dir, timezone_name, cudf::get_default_stream(), mr); + return detail::make_timezone_transition_table(tzif_dir, timezone_name, stream, mr); } namespace detail { diff --git a/cpp/src/io/json/host_tree_algorithms.cu b/cpp/src/io/json/host_tree_algorithms.cu new file mode 100644 index 00000000000..70d61132b42 --- /dev/null +++ b/cpp/src/io/json/host_tree_algorithms.cu @@ -0,0 +1,808 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "io/utilities/parsing_utils.cuh" +#include "io/utilities/string_parsing.hpp" +#include "nested_json.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cudf::io::json::detail { + +/** + * @brief Get the column indices for the values column for array of arrays rows + * + * @param row_array_children_level The level of the row array's children + * @param d_tree The tree metadata + * @param col_ids The column ids + * @param num_columns The number of columns + * @param stream The stream to use + * @return The value columns' indices + */ +rmm::device_uvector get_values_column_indices(TreeDepthT const row_array_children_level, + tree_meta_t const& d_tree, + device_span col_ids, + size_type const num_columns, + rmm::cuda_stream_view stream) +{ + CUDF_FUNC_RANGE(); + auto [level2_nodes, level2_indices] = get_array_children_indices( + row_array_children_level, d_tree.node_levels, d_tree.parent_node_ids, stream); + auto col_id_location = thrust::make_permutation_iterator(col_ids.begin(), level2_nodes.begin()); + rmm::device_uvector values_column_indices(num_columns, stream); + thrust::scatter(rmm::exec_policy(stream), + level2_indices.begin(), + level2_indices.end(), + col_id_location, + values_column_indices.begin()); + return values_column_indices; +} + +/** + * @brief Copies strings specified by pair of begin, end offsets to host vector of strings. 
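+ *
+ * The copy is synchronous: `stream` is synchronized before the host strings
+ * are assembled, so the returned vector is valid as soon as the function returns.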
+ * + * @param input String device buffer + * @param node_range_begin Begin offset of the strings + * @param node_range_end End offset of the strings + * @param stream CUDA stream + * @return Vector of strings + */ +std::vector copy_strings_to_host_sync( + device_span input, + device_span node_range_begin, + device_span node_range_end, + rmm::cuda_stream_view stream) +{ + CUDF_FUNC_RANGE(); + auto const num_strings = node_range_begin.size(); + rmm::device_uvector string_offsets(num_strings, stream); + rmm::device_uvector string_lengths(num_strings, stream); + auto d_offset_pairs = thrust::make_zip_iterator(node_range_begin.begin(), node_range_end.begin()); + thrust::transform(rmm::exec_policy(stream), + d_offset_pairs, + d_offset_pairs + num_strings, + thrust::make_zip_iterator(string_offsets.begin(), string_lengths.begin()), + [] __device__(auto const& offsets) { + // Note: first character for non-field columns + return thrust::make_tuple( + static_cast(thrust::get<0>(offsets)), + static_cast(thrust::get<1>(offsets) - thrust::get<0>(offsets))); + }); + + cudf::io::parse_options_view options_view{}; + options_view.quotechar = '\0'; // no quotes + options_view.keepquotes = true; + auto d_offset_length_it = + thrust::make_zip_iterator(string_offsets.begin(), string_lengths.begin()); + auto d_column_names = parse_data(input.data(), + d_offset_length_it, + num_strings, + data_type{type_id::STRING}, + rmm::device_buffer{}, + 0, + options_view, + stream, + cudf::get_current_device_resource_ref()); + auto to_host = [stream](auto const& col) { + if (col.is_empty()) return std::vector{}; + auto const scv = cudf::strings_column_view(col); + auto const h_chars = cudf::detail::make_host_vector_async( + cudf::device_span(scv.chars_begin(stream), scv.chars_size(stream)), stream); + auto const h_offsets = cudf::detail::make_host_vector_async( + cudf::device_span(scv.offsets().data() + scv.offset(), + scv.size() + 1), + stream); + stream.synchronize(); + + // build std::string vector from chars and offsets + std::vector host_data; + host_data.reserve(col.size()); + std::transform( + std::begin(h_offsets), + std::end(h_offsets) - 1, + std::begin(h_offsets) + 1, + std::back_inserter(host_data), + [&](auto start, auto end) { return std::string(h_chars.data() + start, end - start); }); + return host_data; + }; + return to_host(d_column_names->view()); +} + +/** + * @brief Checks whether all strings in each string column in the tree are nulls. + * For non-string columns, the flag is set to true. If any row in a string column is not a null + * literal, the flag is set to false. + * + * @param input Input JSON string device data + * @param d_column_tree column tree representation of JSON string + * @param tree Node tree representation of the JSON string + * @param col_ids Column ids of the nodes in the tree + * @param options Parsing options specifying the parsing behaviour + * @param stream CUDA stream used for device memory operations and kernel launches + * @return Array of bytes where each byte indicates whether the corresponding column is an all-nulls string column.
+ */ +rmm::device_uvector is_all_nulls_each_column(device_span input, + tree_meta_t const& d_column_tree, + tree_meta_t const& tree, + device_span col_ids, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream) +{ + auto const num_nodes = col_ids.size(); + auto const num_cols = d_column_tree.node_categories.size(); + rmm::device_uvector is_all_nulls(num_cols, stream); + thrust::fill(rmm::exec_policy(stream), is_all_nulls.begin(), is_all_nulls.end(), true); + + auto parse_opt = parsing_options(options, stream); + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::counting_iterator(0), + num_nodes, + [options = parse_opt.view(), + data = input.data(), + column_categories = d_column_tree.node_categories.begin(), + col_ids = col_ids.begin(), + range_begin = tree.node_range_begin.begin(), + range_end = tree.node_range_end.begin(), + is_all_nulls = is_all_nulls.begin()] __device__(size_type i) { + auto const node_category = column_categories[col_ids[i]]; + if (node_category == NC_STR or node_category == NC_VAL) { + auto const is_null_literal = serialized_trie_contains( + options.trie_na, + {data + range_begin[i], static_cast(range_end[i] - range_begin[i])}); + if (!is_null_literal) is_all_nulls[col_ids[i]] = false; + } + }); + return is_all_nulls; +} + +NodeIndexT get_row_array_parent_col_id(device_span col_ids, + bool is_enabled_lines, + rmm::cuda_stream_view stream) +{ + NodeIndexT value = parent_node_sentinel; + if (!col_ids.empty()) { + auto const list_node_index = is_enabled_lines ? 0 : 1; + CUDF_CUDA_TRY(cudaMemcpyAsync(&value, + col_ids.data() + list_node_index, + sizeof(NodeIndexT), + cudaMemcpyDefault, + stream.value())); + stream.synchronize(); + } + return value; +} +/** + * @brief Holds member data pointers of `d_json_column` + * + */ +struct json_column_data { + using row_offset_t = json_column::row_offset_t; + row_offset_t* string_offsets; + row_offset_t* string_lengths; + row_offset_t* child_offsets; + bitmask_type* validity; +}; + +std::pair, + std::unordered_map>> +build_tree(device_json_column& root, + std::vector const& is_str_column_all_nulls, + tree_meta_t& d_column_tree, + device_span d_unique_col_ids, + device_span d_max_row_offsets, + std::vector const& column_names, + NodeIndexT row_array_parent_col_id, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); +void scatter_offsets( + tree_meta_t& tree, + device_span col_ids, + device_span row_offsets, + device_span node_ids, + device_span sorted_col_ids, // Reuse this for parent_col_ids + tree_meta_t& d_column_tree, + host_span ignore_vals, + std::unordered_map>& columns, + rmm::cuda_stream_view stream); + +/** + * @brief Constructs `d_json_column` from node tree representation + * Newly constructed columns are insert into `root`'s children. + * `root` must be a list type. 
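+ *
+ * The construction proceeds in three phases: the nodes are grouped by column id
+ * and reduced to a column tree (`reduce_to_column_tree`), the host-side column
+ * hierarchy is then assembled by `build_tree`, and finally string/list offsets
+ * and validity bits are scattered into the device columns by `scatter_offsets`.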
+ * + * @param input Input JSON string device data + * @param tree Node tree representation of the JSON string + * @param col_ids Column ids of the nodes in the tree + * @param row_offsets Row offsets of the nodes in the tree + * @param root Root node of the `d_json_column` tree + * @param is_array_of_arrays Whether the tree is an array of arrays + * @param options Parsing options specifying the parsing behaviour + * options affecting behaviour are + * is_enabled_lines: Whether the input is a line-delimited JSON + * is_enabled_mixed_types_as_string: Whether to enable reading mixed types as string + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the device memory + * of child_offets and validity members of `d_json_column` + */ +void make_device_json_column(device_span input, + tree_meta_t& tree, + device_span col_ids, + device_span row_offsets, + device_json_column& root, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + + bool const is_enabled_lines = options.is_enabled_lines(); + bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string(); + auto const num_nodes = col_ids.size(); + rmm::device_uvector sorted_col_ids(col_ids.size(), stream); // make a copy + thrust::copy(rmm::exec_policy(stream), col_ids.begin(), col_ids.end(), sorted_col_ids.begin()); + + // sort by {col_id} on {node_ids} stable + rmm::device_uvector node_ids(col_ids.size(), stream); + thrust::sequence(rmm::exec_policy(stream), node_ids.begin(), node_ids.end()); + thrust::stable_sort_by_key( + rmm::exec_policy(stream), sorted_col_ids.begin(), sorted_col_ids.end(), node_ids.begin()); + + NodeIndexT const row_array_parent_col_id = + get_row_array_parent_col_id(col_ids, is_enabled_lines, stream); + + // 1. gather column information. + auto [d_column_tree, d_unique_col_ids, d_max_row_offsets] = + reduce_to_column_tree(tree, + col_ids, + sorted_col_ids, + node_ids, + row_offsets, + is_array_of_arrays, + row_array_parent_col_id, + stream); + auto num_columns = d_unique_col_ids.size(); + std::vector column_names = copy_strings_to_host_sync( + input, d_column_tree.node_range_begin, d_column_tree.node_range_end, stream); + // array of arrays column names + if (is_array_of_arrays) { + auto const unique_col_ids = cudf::detail::make_host_vector_async(d_unique_col_ids, stream); + auto const column_parent_ids = + cudf::detail::make_host_vector_async(d_column_tree.parent_node_ids, stream); + TreeDepthT const row_array_children_level = is_enabled_lines ? 1 : 2; + auto values_column_indices = + get_values_column_indices(row_array_children_level, tree, col_ids, num_columns, stream); + auto h_values_column_indices = + cudf::detail::make_host_vector_sync(values_column_indices, stream); + std::transform(unique_col_ids.begin(), + unique_col_ids.end(), + column_names.begin(), + column_names.begin(), + [&h_values_column_indices, &column_parent_ids, row_array_parent_col_id]( + auto col_id, auto name) mutable { + return column_parent_ids[col_id] == row_array_parent_col_id + ? 
std::to_string(h_values_column_indices[col_id]) + : name; + }); + } + + auto const is_str_column_all_nulls = [&, &column_tree = d_column_tree]() { + if (is_enabled_mixed_types_as_string) { + return cudf::detail::make_std_vector_sync( + is_all_nulls_each_column(input, column_tree, tree, col_ids, options, stream), stream); + } + return std::vector(); + }(); + auto [ignore_vals, columns] = build_tree(root, + is_str_column_all_nulls, + d_column_tree, + d_unique_col_ids, + d_max_row_offsets, + column_names, + row_array_parent_col_id, + is_array_of_arrays, + options, + stream, + mr); + + scatter_offsets(tree, + col_ids, + row_offsets, + node_ids, + sorted_col_ids, + d_column_tree, + ignore_vals, + columns, + stream); +} + +std::pair, + std::unordered_map>> +build_tree(device_json_column& root, + std::vector const& is_str_column_all_nulls, + tree_meta_t& d_column_tree, + device_span d_unique_col_ids, + device_span d_max_row_offsets, + std::vector const& column_names, + NodeIndexT row_array_parent_col_id, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string(); + auto unique_col_ids = cudf::detail::make_host_vector_async(d_unique_col_ids, stream); + auto column_categories = + cudf::detail::make_host_vector_async(d_column_tree.node_categories, stream); + auto const column_parent_ids = + cudf::detail::make_host_vector_async(d_column_tree.parent_node_ids, stream); + auto column_range_beg = + cudf::detail::make_host_vector_async(d_column_tree.node_range_begin, stream); + auto const max_row_offsets = cudf::detail::make_host_vector_async(d_max_row_offsets, stream); + auto num_columns = d_unique_col_ids.size(); + + auto to_json_col_type = [](auto category) { + switch (category) { + case NC_STRUCT: return json_col_t::StructColumn; + case NC_LIST: return json_col_t::ListColumn; + case NC_STR: [[fallthrough]]; + case NC_VAL: return json_col_t::StringColumn; + default: return json_col_t::Unknown; + } + }; + auto init_to_zero = [stream](auto& v) { + thrust::uninitialized_fill(rmm::exec_policy_nosync(stream), v.begin(), v.end(), 0); + }; + + auto initialize_json_columns = [&](auto i, auto& col, auto column_category) { + if (column_category == NC_ERR || column_category == NC_FN) { + return; + } else if (column_category == NC_VAL || column_category == NC_STR) { + col.string_offsets.resize(max_row_offsets[i] + 1, stream); + col.string_lengths.resize(max_row_offsets[i] + 1, stream); + init_to_zero(col.string_offsets); + init_to_zero(col.string_lengths); + } else if (column_category == NC_LIST) { + col.child_offsets.resize(max_row_offsets[i] + 2, stream); + init_to_zero(col.child_offsets); + } + col.num_rows = max_row_offsets[i] + 1; + col.validity = + cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr); + col.type = to_json_col_type(column_category); + }; + + auto reinitialize_as_string = [&](auto i, auto& col) { + col.string_offsets.resize(max_row_offsets[i] + 1, stream); + col.string_lengths.resize(max_row_offsets[i] + 1, stream); + init_to_zero(col.string_offsets); + init_to_zero(col.string_lengths); + col.num_rows = max_row_offsets[i] + 1; + col.validity = + cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr); + col.type = json_col_t::StringColumn; + // destroy references of all child columns after this step, by calling remove_child_columns + }; + + path_from_tree 
tree_path{column_categories, + column_parent_ids, + column_names, + is_array_of_arrays, + row_array_parent_col_id}; + + // 2. generate nested columns tree and its device_memory + // reorder unique_col_ids w.r.t. column_range_begin for order of column to be in field order. + auto h_range_col_id_it = + thrust::make_zip_iterator(column_range_beg.begin(), unique_col_ids.begin()); + std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) { + return thrust::get<0>(a) < thrust::get<0>(b); + }); + + // use hash map because we may skip field name's col_ids + std::unordered_map> columns; + // map{parent_col_id, child_col_name}> = child_col_id, used for null value column tracking + std::map, NodeIndexT> mapped_columns; + // find column_ids which are values, but should be ignored in validity + auto ignore_vals = cudf::detail::make_host_vector(num_columns, stream); + std::vector is_mixed_type_column(num_columns, 0); + std::vector is_pruned(num_columns, 0); + // for columns that are not mixed type but have been forced as string + std::vector forced_as_string_column(num_columns); + columns.try_emplace(parent_node_sentinel, std::ref(root)); + + std::function remove_child_columns = + [&](NodeIndexT this_col_id, device_json_column& col) { + for (auto col_name : col.column_order) { + auto child_id = mapped_columns[{this_col_id, col_name}]; + is_mixed_type_column[child_id] = 1; + remove_child_columns(child_id, col.child_columns.at(col_name)); + mapped_columns.erase({this_col_id, col_name}); + columns.erase(child_id); + } + col.child_columns.clear(); // their references are deleted above. + col.column_order.clear(); + }; + + auto name_and_parent_index = [&is_array_of_arrays, + &row_array_parent_col_id, + &column_parent_ids, + &column_categories, + &column_names](auto this_col_id) { + std::string name = ""; + auto parent_col_id = column_parent_ids[this_col_id]; + if (parent_col_id == parent_node_sentinel || column_categories[parent_col_id] == NC_LIST) { + if (is_array_of_arrays && parent_col_id == row_array_parent_col_id) { + name = column_names[this_col_id]; + } else { + name = list_child_name; + } + } else if (column_categories[parent_col_id] == NC_FN) { + auto field_name_col_id = parent_col_id; + parent_col_id = column_parent_ids[parent_col_id]; + name = column_names[field_name_col_id]; + } else { + CUDF_FAIL("Unexpected parent column category"); + } + return std::pair{name, parent_col_id}; + }; + + // Prune columns that are not required to be parsed. + if (options.is_enabled_prune_columns()) { + for (auto const this_col_id : unique_col_ids) { + if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) { + continue; + } + // Struct, List, String, Value + auto [name, parent_col_id] = name_and_parent_index(this_col_id); + // get path of this column, and get its dtype if present in options + auto const nt = tree_path.get_path(this_col_id); + std::optional const user_dtype = get_path_data_type(nt, options); + if (!user_dtype.has_value() and parent_col_id != parent_node_sentinel) { + is_pruned[this_col_id] = 1; + continue; + } else { + // make sure all its parents are not pruned. + while (parent_col_id != parent_node_sentinel and is_pruned[parent_col_id] == 1) { + is_pruned[parent_col_id] = 0; + parent_col_id = column_parent_ids[parent_col_id]; + } + } + } + } + + // Build the column tree, also, handles mixed types. 
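+  // For each remaining column id: resolve its field name and parent column,
+  // skip ids whose parent is mixed-type or forced-as-string (or that were
+  // pruned above), resolve name collisions between nested and value columns
+  // (merging them as strings when mixed types are read as string), then
+  // create the device_json_column and record it in `columns` and
+  // `mapped_columns`.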
+ for (auto const this_col_id : unique_col_ids) { + if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) { + continue; + } + // Struct, List, String, Value + auto [name, parent_col_id] = name_and_parent_index(this_col_id); + + // if parent is mixed type column or this column is pruned or if parent + // has been forced as string, ignore this column. + if (parent_col_id != parent_node_sentinel && + (is_mixed_type_column[parent_col_id] || is_pruned[this_col_id]) || + forced_as_string_column[parent_col_id]) { + ignore_vals[this_col_id] = 1; + if (is_mixed_type_column[parent_col_id]) { is_mixed_type_column[this_col_id] = 1; } + if (forced_as_string_column[parent_col_id]) { forced_as_string_column[this_col_id] = true; } + continue; + } + + // If the child is already found, + // replace if this column is a nested column and the existing was a value column + // ignore this column if this column is a value column and the existing was a nested column + auto it = columns.find(parent_col_id); + CUDF_EXPECTS(it != columns.end(), "Parent column not found"); + auto& parent_col = it->second.get(); + bool replaced = false; + if (mapped_columns.count({parent_col_id, name}) > 0) { + auto const old_col_id = mapped_columns[{parent_col_id, name}]; + // If mixed type as string is enabled, make both of them strings and merge them. + // All child columns will be ignored when parsing. + if (is_enabled_mixed_types_as_string) { + bool const is_mixed_type = [&]() { + // If new or old is STR and they are all not null, make it mixed type, else ignore. + if (column_categories[this_col_id] == NC_VAL || + column_categories[this_col_id] == NC_STR) { + if (is_str_column_all_nulls[this_col_id]) return false; + } + if (column_categories[old_col_id] == NC_VAL || column_categories[old_col_id] == NC_STR) { + if (is_str_column_all_nulls[old_col_id]) return false; + } + return true; + }(); + if (is_mixed_type) { + is_mixed_type_column[this_col_id] = 1; + is_mixed_type_column[old_col_id] = 1; + // if old col type (not cat) is list or struct, replace with string. + auto& col = columns.at(old_col_id).get(); + if (col.type == json_col_t::ListColumn or col.type == json_col_t::StructColumn) { + reinitialize_as_string(old_col_id, col); + remove_child_columns(old_col_id, col); + // all its children (which are already inserted) are ignored later. 
+ } + col.forced_as_string_column = true; + columns.try_emplace(this_col_id, columns.at(old_col_id)); + continue; + } + } + + if (column_categories[this_col_id] == NC_VAL || column_categories[this_col_id] == NC_STR) { + ignore_vals[this_col_id] = 1; + continue; + } + if (column_categories[old_col_id] == NC_VAL || column_categories[old_col_id] == NC_STR) { + // remap + ignore_vals[old_col_id] = 1; + mapped_columns.erase({parent_col_id, name}); + columns.erase(old_col_id); + parent_col.child_columns.erase(name); + replaced = true; // to skip duplicate name in column_order + } else { + // If this is a nested column but we're trying to insert either (a) a list node into a + // struct column or (b) a struct node into a list column, we fail + CUDF_EXPECTS(not((column_categories[old_col_id] == NC_LIST and + column_categories[this_col_id] == NC_STRUCT) or + (column_categories[old_col_id] == NC_STRUCT and + column_categories[this_col_id] == NC_LIST)), + "A mix of lists and structs within the same column is not supported"); + } + } + + auto this_column_category = column_categories[this_col_id]; + // get path of this column, check if it is a struct/list forced as string, and enforce it + auto const nt = tree_path.get_path(this_col_id); + std::optional const user_dtype = get_path_data_type(nt, options); + if ((column_categories[this_col_id] == NC_STRUCT or + column_categories[this_col_id] == NC_LIST) and + user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) { + this_column_category = NC_STR; + } + + CUDF_EXPECTS(parent_col.child_columns.count(name) == 0, "duplicate column name: " + name); + // move into parent + device_json_column col(stream, mr); + initialize_json_columns(this_col_id, col, this_column_category); + if ((column_categories[this_col_id] == NC_STRUCT or + column_categories[this_col_id] == NC_LIST) and + user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) { + col.forced_as_string_column = true; + forced_as_string_column[this_col_id] = true; + } + + auto inserted = parent_col.child_columns.try_emplace(name, std::move(col)).second; + CUDF_EXPECTS(inserted, "child column insertion failed, duplicate column name in the parent"); + if (not replaced) parent_col.column_order.push_back(name); + columns.try_emplace(this_col_id, std::ref(parent_col.child_columns.at(name))); + mapped_columns.try_emplace(std::make_pair(parent_col_id, name), this_col_id); + } + + if (is_enabled_mixed_types_as_string) { + // ignore all children of mixed type columns + for (auto const this_col_id : unique_col_ids) { + auto parent_col_id = column_parent_ids[this_col_id]; + if (parent_col_id != parent_node_sentinel and is_mixed_type_column[parent_col_id] == 1) { + is_mixed_type_column[this_col_id] = 1; + ignore_vals[this_col_id] = 1; + columns.erase(this_col_id); + } + // Convert only mixed type columns as string (so to copy), but not its children + if (parent_col_id != parent_node_sentinel and is_mixed_type_column[parent_col_id] == 0 and + is_mixed_type_column[this_col_id] == 1) + column_categories[this_col_id] = NC_STR; + } + cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(), + column_categories.data(), + column_categories.size() * sizeof(column_categories[0]), + cudf::detail::host_memory_kind::PAGEABLE, + stream); + } + + // ignore all children of columns forced as string + for (auto const this_col_id : unique_col_ids) { + auto parent_col_id = column_parent_ids[this_col_id]; + if (parent_col_id != parent_node_sentinel and forced_as_string_column[parent_col_id]) { 
+ forced_as_string_column[this_col_id] = true; + ignore_vals[this_col_id] = 1; + } + // Convert only mixed type columns as string (so to copy), but not its children + if (parent_col_id != parent_node_sentinel and not forced_as_string_column[parent_col_id] and + forced_as_string_column[this_col_id]) + column_categories[this_col_id] = NC_STR; + } + cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(), + column_categories.data(), + column_categories.size() * sizeof(column_categories[0]), + cudf::detail::host_memory_kind::PAGEABLE, + stream); + + // restore unique_col_ids order + std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) { + return thrust::get<1>(a) < thrust::get<1>(b); + }); + return {ignore_vals, columns}; +} + +void scatter_offsets( + tree_meta_t& tree, + device_span col_ids, + device_span row_offsets, + device_span node_ids, + device_span sorted_col_ids, // Reuse this for parent_col_ids + tree_meta_t& d_column_tree, + host_span ignore_vals, + std::unordered_map>& columns, + rmm::cuda_stream_view stream) +{ + auto const num_nodes = col_ids.size(); + auto const num_columns = d_column_tree.node_categories.size(); + // move columns data to device. + auto columns_data = cudf::detail::make_host_vector(num_columns, stream); + for (auto& [col_id, col_ref] : columns) { + if (col_id == parent_node_sentinel) continue; + auto& col = col_ref.get(); + columns_data[col_id] = json_column_data{col.string_offsets.data(), + col.string_lengths.data(), + col.child_offsets.data(), + static_cast(col.validity.data())}; + } + + auto d_ignore_vals = cudf::detail::make_device_uvector_async( + ignore_vals, stream, cudf::get_current_device_resource_ref()); + auto d_columns_data = cudf::detail::make_device_uvector_async( + columns_data, stream, cudf::get_current_device_resource_ref()); + + // 3. scatter string offsets to respective columns, set validity bits + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::counting_iterator(0), + num_nodes, + [column_categories = d_column_tree.node_categories.begin(), + col_ids = col_ids.begin(), + row_offsets = row_offsets.begin(), + range_begin = tree.node_range_begin.begin(), + range_end = tree.node_range_end.begin(), + d_ignore_vals = d_ignore_vals.begin(), + d_columns_data = d_columns_data.begin()] __device__(size_type i) { + if (d_ignore_vals[col_ids[i]]) return; + auto const node_category = column_categories[col_ids[i]]; + switch (node_category) { + case NC_STRUCT: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break; + case NC_LIST: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break; + case NC_STR: [[fallthrough]]; + case NC_VAL: + if (d_ignore_vals[col_ids[i]]) break; + set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); + d_columns_data[col_ids[i]].string_offsets[row_offsets[i]] = range_begin[i]; + d_columns_data[col_ids[i]].string_lengths[row_offsets[i]] = range_end[i] - range_begin[i]; + break; + default: break; + } + }); + + // 4. scatter List offset + // copy_if only node's whose parent is list, (node_id, parent_col_id) + // stable_sort by parent_col_id of {node_id}. + // For all unique parent_node_id of (i==0, i-1!=i), write start offset. + // (i==last, i+1!=i), write end offset. 
+ // unique_copy_by_key {parent_node_id} {row_offset} to + // col[parent_col_id].child_offsets[row_offset[parent_node_id]] + + auto& parent_col_ids = sorted_col_ids; // reuse sorted_col_ids + auto parent_col_id = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + cuda::proclaim_return_type( + [col_ids = col_ids.begin(), + parent_node_ids = tree.parent_node_ids.begin()] __device__(size_type node_id) { + return parent_node_ids[node_id] == parent_node_sentinel ? parent_node_sentinel + : col_ids[parent_node_ids[node_id]]; + })); + auto const list_children_end = thrust::copy_if( + rmm::exec_policy(stream), + thrust::make_zip_iterator(thrust::make_counting_iterator(0), parent_col_id), + thrust::make_zip_iterator(thrust::make_counting_iterator(0), parent_col_id) + + num_nodes, + thrust::make_counting_iterator(0), + thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()), + [d_ignore_vals = d_ignore_vals.begin(), + parent_node_ids = tree.parent_node_ids.begin(), + column_categories = d_column_tree.node_categories.begin(), + col_ids = col_ids.begin()] __device__(size_type node_id) { + auto parent_node_id = parent_node_ids[node_id]; + return parent_node_id != parent_node_sentinel and + column_categories[col_ids[parent_node_id]] == NC_LIST and + (!d_ignore_vals[col_ids[parent_node_id]]); + }); + + auto const num_list_children = + list_children_end - thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()); + thrust::stable_sort_by_key(rmm::exec_policy(stream), + parent_col_ids.begin(), + parent_col_ids.begin() + num_list_children, + node_ids.begin()); + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + num_list_children, + [node_ids = node_ids.begin(), + parent_node_ids = tree.parent_node_ids.begin(), + parent_col_ids = parent_col_ids.begin(), + row_offsets = row_offsets.begin(), + d_columns_data = d_columns_data.begin(), + num_list_children] __device__(size_type i) { + auto const node_id = node_ids[i]; + auto const parent_node_id = parent_node_ids[node_id]; + // scatter to list_offset + if (i == 0 or parent_node_ids[node_ids[i - 1]] != parent_node_id) { + d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id]] = + row_offsets[node_id]; + } + // last value of list child_offset is its size. + if (i == num_list_children - 1 or parent_node_ids[node_ids[i + 1]] != parent_node_id) { + d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id] + 1] = + row_offsets[node_id] + 1; + } + }); + + // 5. scan on offsets. 
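+  // The scatters in steps 3 and 4 only wrote offsets at positions backed by a
+  // node; every other position is still the zero it was initialized to. An
+  // inclusive scan with thrust::maximum propagates the last written offset
+  // forward so that every row, including null rows, gets a valid offset.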
+ for (auto& [id, col_ref] : columns) { + auto& col = col_ref.get(); + if (col.type == json_col_t::StringColumn) { + thrust::inclusive_scan(rmm::exec_policy_nosync(stream), + col.string_offsets.begin(), + col.string_offsets.end(), + col.string_offsets.begin(), + thrust::maximum{}); + } else if (col.type == json_col_t::ListColumn) { + thrust::inclusive_scan(rmm::exec_policy_nosync(stream), + col.child_offsets.begin(), + col.child_offsets.end(), + col.child_offsets.begin(), + thrust::maximum{}); + } + } + stream.synchronize(); +} + +} // namespace cudf::io::json::detail diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 8890c786287..b08fd139113 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include @@ -35,23 +35,16 @@ #include #include -#include #include #include #include -#include #include #include #include #include -#include -#include #include #include -#include -#include - namespace cudf::io::json::detail { // DEBUG prints @@ -296,651 +289,6 @@ reduce_to_column_tree(tree_meta_t& tree, std::move(max_row_offsets)}; } -/** - * @brief Get the column indices for the values column for array of arrays rows - * - * @param row_array_children_level The level of the row array's children - * @param d_tree The tree metadata - * @param col_ids The column ids - * @param num_columns The number of columns - * @param stream The stream to use - * @return The value columns' indices - */ -rmm::device_uvector get_values_column_indices(TreeDepthT const row_array_children_level, - tree_meta_t const& d_tree, - device_span col_ids, - size_type const num_columns, - rmm::cuda_stream_view stream) -{ - CUDF_FUNC_RANGE(); - auto [level2_nodes, level2_indices] = get_array_children_indices( - row_array_children_level, d_tree.node_levels, d_tree.parent_node_ids, stream); - auto col_id_location = thrust::make_permutation_iterator(col_ids.begin(), level2_nodes.begin()); - rmm::device_uvector values_column_indices(num_columns, stream); - thrust::scatter(rmm::exec_policy(stream), - level2_indices.begin(), - level2_indices.end(), - col_id_location, - values_column_indices.begin()); - return values_column_indices; -} - -/** - * @brief Copies strings specified by pair of begin, end offsets to host vector of strings. 
- * - * @param input String device buffer - * @param node_range_begin Begin offset of the strings - * @param node_range_end End offset of the strings - * @param stream CUDA stream - * @return Vector of strings - */ -std::vector copy_strings_to_host_sync( - device_span input, - device_span node_range_begin, - device_span node_range_end, - rmm::cuda_stream_view stream) -{ - CUDF_FUNC_RANGE(); - auto const num_strings = node_range_begin.size(); - rmm::device_uvector string_offsets(num_strings, stream); - rmm::device_uvector string_lengths(num_strings, stream); - auto d_offset_pairs = thrust::make_zip_iterator(node_range_begin.begin(), node_range_end.begin()); - thrust::transform(rmm::exec_policy(stream), - d_offset_pairs, - d_offset_pairs + num_strings, - thrust::make_zip_iterator(string_offsets.begin(), string_lengths.begin()), - [] __device__(auto const& offsets) { - // Note: first character for non-field columns - return thrust::make_tuple( - static_cast(thrust::get<0>(offsets)), - static_cast(thrust::get<1>(offsets) - thrust::get<0>(offsets))); - }); - - cudf::io::parse_options_view options_view{}; - options_view.quotechar = '\0'; // no quotes - options_view.keepquotes = true; - auto d_offset_length_it = - thrust::make_zip_iterator(string_offsets.begin(), string_lengths.begin()); - auto d_column_names = parse_data(input.data(), - d_offset_length_it, - num_strings, - data_type{type_id::STRING}, - rmm::device_buffer{}, - 0, - options_view, - stream, - cudf::get_current_device_resource_ref()); - auto to_host = [stream](auto const& col) { - if (col.is_empty()) return std::vector{}; - auto const scv = cudf::strings_column_view(col); - auto const h_chars = cudf::detail::make_host_vector_async( - cudf::device_span(scv.chars_begin(stream), scv.chars_size(stream)), stream); - auto const h_offsets = cudf::detail::make_host_vector_async( - cudf::device_span(scv.offsets().data() + scv.offset(), - scv.size() + 1), - stream); - stream.synchronize(); - - // build std::string vector from chars and offsets - std::vector host_data; - host_data.reserve(col.size()); - std::transform( - std::begin(h_offsets), - std::end(h_offsets) - 1, - std::begin(h_offsets) + 1, - std::back_inserter(host_data), - [&](auto start, auto end) { return std::string(h_chars.data() + start, end - start); }); - return host_data; - }; - return to_host(d_column_names->view()); -} - -/** - * @brief Checks if all strings in each string column in the tree are nulls. - * For non-string columns, it's set as true. If any of rows in a string column is false, it's set as - * false. - * - * @param input Input JSON string device data - * @param d_column_tree column tree representation of JSON string - * @param tree Node tree representation of the JSON string - * @param col_ids Column ids of the nodes in the tree - * @param options Parsing options specifying the parsing behaviour - * @param stream CUDA stream used for device memory operations and kernel launches - * @return Array of bytes where each byte indicate if it is all nulls string column. 
- */ -rmm::device_uvector is_all_nulls_each_column(device_span input, - tree_meta_t const& d_column_tree, - tree_meta_t const& tree, - device_span col_ids, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream) -{ - auto const num_nodes = col_ids.size(); - auto const num_cols = d_column_tree.node_categories.size(); - rmm::device_uvector is_all_nulls(num_cols, stream); - thrust::fill(rmm::exec_policy(stream), is_all_nulls.begin(), is_all_nulls.end(), true); - - auto parse_opt = parsing_options(options, stream); - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::counting_iterator(0), - num_nodes, - [options = parse_opt.view(), - data = input.data(), - column_categories = d_column_tree.node_categories.begin(), - col_ids = col_ids.begin(), - range_begin = tree.node_range_begin.begin(), - range_end = tree.node_range_end.begin(), - is_all_nulls = is_all_nulls.begin()] __device__(size_type i) { - auto const node_category = column_categories[col_ids[i]]; - if (node_category == NC_STR or node_category == NC_VAL) { - auto const is_null_literal = serialized_trie_contains( - options.trie_na, - {data + range_begin[i], static_cast(range_end[i] - range_begin[i])}); - if (!is_null_literal) is_all_nulls[col_ids[i]] = false; - } - }); - return is_all_nulls; -} - -/** - * @brief Holds member data pointers of `d_json_column` - * - */ -struct json_column_data { - using row_offset_t = json_column::row_offset_t; - row_offset_t* string_offsets; - row_offset_t* string_lengths; - row_offset_t* child_offsets; - bitmask_type* validity; -}; - -/** - * @brief Constructs `d_json_column` from node tree representation - * Newly constructed columns are insert into `root`'s children. - * `root` must be a list type. - * - * @param input Input JSON string device data - * @param tree Node tree representation of the JSON string - * @param col_ids Column ids of the nodes in the tree - * @param row_offsets Row offsets of the nodes in the tree - * @param root Root node of the `d_json_column` tree - * @param is_array_of_arrays Whether the tree is an array of arrays - * @param options Parsing options specifying the parsing behaviour - * options affecting behaviour are - * is_enabled_lines: Whether the input is a line-delimited JSON - * is_enabled_mixed_types_as_string: Whether to enable reading mixed types as string - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the device memory - * of child_offets and validity members of `d_json_column` - */ -void make_device_json_column(device_span input, - tree_meta_t& tree, - device_span col_ids, - device_span row_offsets, - device_json_column& root, - bool is_array_of_arrays, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_FUNC_RANGE(); - - bool const is_enabled_lines = options.is_enabled_lines(); - bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string(); - auto const num_nodes = col_ids.size(); - rmm::device_uvector sorted_col_ids(col_ids.size(), stream); // make a copy - thrust::copy(rmm::exec_policy(stream), col_ids.begin(), col_ids.end(), sorted_col_ids.begin()); - - // sort by {col_id} on {node_ids} stable - rmm::device_uvector node_ids(col_ids.size(), stream); - thrust::sequence(rmm::exec_policy(stream), node_ids.begin(), node_ids.end()); - thrust::stable_sort_by_key( - rmm::exec_policy(stream), sorted_col_ids.begin(), sorted_col_ids.end(), 
node_ids.begin()); - - NodeIndexT const row_array_parent_col_id = [&]() { - NodeIndexT value = parent_node_sentinel; - if (!col_ids.empty()) { - auto const list_node_index = is_enabled_lines ? 0 : 1; - CUDF_CUDA_TRY(cudaMemcpyAsync(&value, - col_ids.data() + list_node_index, - sizeof(NodeIndexT), - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); - } - return value; - }(); - - // 1. gather column information. - auto [d_column_tree, d_unique_col_ids, d_max_row_offsets] = - reduce_to_column_tree(tree, - col_ids, - sorted_col_ids, - node_ids, - row_offsets, - is_array_of_arrays, - row_array_parent_col_id, - stream); - auto num_columns = d_unique_col_ids.size(); - auto unique_col_ids = cudf::detail::make_host_vector_async(d_unique_col_ids, stream); - auto column_categories = - cudf::detail::make_host_vector_async(d_column_tree.node_categories, stream); - auto const column_parent_ids = - cudf::detail::make_host_vector_async(d_column_tree.parent_node_ids, stream); - auto column_range_beg = - cudf::detail::make_host_vector_async(d_column_tree.node_range_begin, stream); - auto const max_row_offsets = cudf::detail::make_host_vector_async(d_max_row_offsets, stream); - std::vector column_names = copy_strings_to_host_sync( - input, d_column_tree.node_range_begin, d_column_tree.node_range_end, stream); - // array of arrays column names - if (is_array_of_arrays) { - TreeDepthT const row_array_children_level = is_enabled_lines ? 1 : 2; - auto values_column_indices = - get_values_column_indices(row_array_children_level, tree, col_ids, num_columns, stream); - auto h_values_column_indices = - cudf::detail::make_host_vector_sync(values_column_indices, stream); - std::transform(unique_col_ids.begin(), - unique_col_ids.end(), - column_names.begin(), - column_names.begin(), - [&h_values_column_indices, &column_parent_ids, row_array_parent_col_id]( - auto col_id, auto name) mutable { - return column_parent_ids[col_id] == row_array_parent_col_id - ? 
std::to_string(h_values_column_indices[col_id]) - : name; - }); - } - - auto to_json_col_type = [](auto category) { - switch (category) { - case NC_STRUCT: return json_col_t::StructColumn; - case NC_LIST: return json_col_t::ListColumn; - case NC_STR: [[fallthrough]]; - case NC_VAL: return json_col_t::StringColumn; - default: return json_col_t::Unknown; - } - }; - auto init_to_zero = [stream](auto& v) { - thrust::uninitialized_fill(rmm::exec_policy_nosync(stream), v.begin(), v.end(), 0); - }; - - auto initialize_json_columns = [&](auto i, auto& col, auto column_category) { - if (column_category == NC_ERR || column_category == NC_FN) { - return; - } else if (column_category == NC_VAL || column_category == NC_STR) { - col.string_offsets.resize(max_row_offsets[i] + 1, stream); - col.string_lengths.resize(max_row_offsets[i] + 1, stream); - init_to_zero(col.string_offsets); - init_to_zero(col.string_lengths); - } else if (column_category == NC_LIST) { - col.child_offsets.resize(max_row_offsets[i] + 2, stream); - init_to_zero(col.child_offsets); - } - col.num_rows = max_row_offsets[i] + 1; - col.validity = - cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr); - col.type = to_json_col_type(column_category); - }; - - auto reinitialize_as_string = [&](auto i, auto& col) { - col.string_offsets.resize(max_row_offsets[i] + 1, stream); - col.string_lengths.resize(max_row_offsets[i] + 1, stream); - init_to_zero(col.string_offsets); - init_to_zero(col.string_lengths); - col.num_rows = max_row_offsets[i] + 1; - col.validity = - cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr); - col.type = json_col_t::StringColumn; - // destroy references of all child columns after this step, by calling remove_child_columns - }; - - path_from_tree tree_path{column_categories, - column_parent_ids, - column_names, - is_array_of_arrays, - row_array_parent_col_id}; - - // 2. generate nested columns tree and its device_memory - // reorder unique_col_ids w.r.t. column_range_begin for order of column to be in field order. 
- auto h_range_col_id_it = - thrust::make_zip_iterator(column_range_beg.begin(), unique_col_ids.begin()); - std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) { - return thrust::get<0>(a) < thrust::get<0>(b); - }); - - auto const is_str_column_all_nulls = [&, &column_tree = d_column_tree]() { - if (is_enabled_mixed_types_as_string) { - return cudf::detail::make_host_vector_sync( - is_all_nulls_each_column(input, column_tree, tree, col_ids, options, stream), stream); - } - return cudf::detail::make_empty_host_vector(0, stream); - }(); - - // use hash map because we may skip field name's col_ids - std::unordered_map> columns; - // map{parent_col_id, child_col_name}> = child_col_id, used for null value column tracking - std::map, NodeIndexT> mapped_columns; - // find column_ids which are values, but should be ignored in validity - auto ignore_vals = cudf::detail::make_host_vector(num_columns, stream); - std::vector is_mixed_type_column(num_columns, 0); - std::vector is_pruned(num_columns, 0); - columns.try_emplace(parent_node_sentinel, std::ref(root)); - - std::function remove_child_columns = - [&](NodeIndexT this_col_id, device_json_column& col) { - for (auto col_name : col.column_order) { - auto child_id = mapped_columns[{this_col_id, col_name}]; - is_mixed_type_column[child_id] = 1; - remove_child_columns(child_id, col.child_columns.at(col_name)); - mapped_columns.erase({this_col_id, col_name}); - columns.erase(child_id); - } - col.child_columns.clear(); // their references are deleted above. - col.column_order.clear(); - }; - - auto name_and_parent_index = [&is_array_of_arrays, - &row_array_parent_col_id, - &column_parent_ids, - &column_categories, - &column_names](auto this_col_id) { - std::string name = ""; - auto parent_col_id = column_parent_ids[this_col_id]; - if (parent_col_id == parent_node_sentinel || column_categories[parent_col_id] == NC_LIST) { - if (is_array_of_arrays && parent_col_id == row_array_parent_col_id) { - name = column_names[this_col_id]; - } else { - name = list_child_name; - } - } else if (column_categories[parent_col_id] == NC_FN) { - auto field_name_col_id = parent_col_id; - parent_col_id = column_parent_ids[parent_col_id]; - name = column_names[field_name_col_id]; - } else { - CUDF_FAIL("Unexpected parent column category"); - } - return std::pair{name, parent_col_id}; - }; - - // Prune columns that are not required to be parsed. - if (options.is_enabled_prune_columns()) { - for (auto const this_col_id : unique_col_ids) { - if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) { - continue; - } - // Struct, List, String, Value - auto [name, parent_col_id] = name_and_parent_index(this_col_id); - // get path of this column, and get its dtype if present in options - auto const nt = tree_path.get_path(this_col_id); - std::optional const user_dtype = get_path_data_type(nt, options); - if (!user_dtype.has_value() and parent_col_id != parent_node_sentinel) { - is_pruned[this_col_id] = 1; - continue; - } else { - // make sure all its parents are not pruned. - while (parent_col_id != parent_node_sentinel and is_pruned[parent_col_id] == 1) { - is_pruned[parent_col_id] = 0; - parent_col_id = column_parent_ids[parent_col_id]; - } - } - } - } - - // Build the column tree, also, handles mixed types. 
- for (auto const this_col_id : unique_col_ids) { - if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) { - continue; - } - // Struct, List, String, Value - auto [name, parent_col_id] = name_and_parent_index(this_col_id); - - // if parent is mixed type column or this column is pruned, ignore this column. - if (parent_col_id != parent_node_sentinel && - (is_mixed_type_column[parent_col_id] || is_pruned[this_col_id])) { - ignore_vals[this_col_id] = 1; - if (is_mixed_type_column[parent_col_id]) { is_mixed_type_column[this_col_id] = 1; } - continue; - } - - // If the child is already found, - // replace if this column is a nested column and the existing was a value column - // ignore this column if this column is a value column and the existing was a nested column - auto it = columns.find(parent_col_id); - CUDF_EXPECTS(it != columns.end(), "Parent column not found"); - auto& parent_col = it->second.get(); - bool replaced = false; - if (mapped_columns.count({parent_col_id, name}) > 0) { - auto const old_col_id = mapped_columns[{parent_col_id, name}]; - // If mixed type as string is enabled, make both of them strings and merge them. - // All child columns will be ignored when parsing. - if (is_enabled_mixed_types_as_string) { - bool const is_mixed_type = [&]() { - // If new or old is STR and they are all not null, make it mixed type, else ignore. - if (column_categories[this_col_id] == NC_VAL || - column_categories[this_col_id] == NC_STR) { - if (is_str_column_all_nulls[this_col_id]) return false; - } - if (column_categories[old_col_id] == NC_VAL || column_categories[old_col_id] == NC_STR) { - if (is_str_column_all_nulls[old_col_id]) return false; - } - return true; - }(); - if (is_mixed_type) { - is_mixed_type_column[this_col_id] = 1; - is_mixed_type_column[old_col_id] = 1; - // if old col type (not cat) is list or struct, replace with string. - auto& col = columns.at(old_col_id).get(); - if (col.type == json_col_t::ListColumn or col.type == json_col_t::StructColumn) { - reinitialize_as_string(old_col_id, col); - remove_child_columns(old_col_id, col); - // all its children (which are already inserted) are ignored later. 
- } - col.forced_as_string_column = true; - columns.try_emplace(this_col_id, columns.at(old_col_id)); - continue; - } - } - - if (column_categories[this_col_id] == NC_VAL || column_categories[this_col_id] == NC_STR) { - ignore_vals[this_col_id] = 1; - continue; - } - if (column_categories[old_col_id] == NC_VAL || column_categories[old_col_id] == NC_STR) { - // remap - ignore_vals[old_col_id] = 1; - mapped_columns.erase({parent_col_id, name}); - columns.erase(old_col_id); - parent_col.child_columns.erase(name); - replaced = true; // to skip duplicate name in column_order - } else { - // If this is a nested column but we're trying to insert either (a) a list node into a - // struct column or (b) a struct node into a list column, we fail - CUDF_EXPECTS(not((column_categories[old_col_id] == NC_LIST and - column_categories[this_col_id] == NC_STRUCT) or - (column_categories[old_col_id] == NC_STRUCT and - column_categories[this_col_id] == NC_LIST)), - "A mix of lists and structs within the same column is not supported"); - } - } - - auto this_column_category = column_categories[this_col_id]; - if (is_enabled_mixed_types_as_string) { - // get path of this column, check if it is a struct/list forced as string, and enforce it - auto const nt = tree_path.get_path(this_col_id); - std::optional const user_dtype = get_path_data_type(nt, options); - if ((column_categories[this_col_id] == NC_STRUCT or - column_categories[this_col_id] == NC_LIST) and - user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) { - is_mixed_type_column[this_col_id] = 1; - this_column_category = NC_STR; - } - } - - CUDF_EXPECTS(parent_col.child_columns.count(name) == 0, "duplicate column name: " + name); - // move into parent - device_json_column col(stream, mr); - initialize_json_columns(this_col_id, col, this_column_category); - auto inserted = parent_col.child_columns.try_emplace(name, std::move(col)).second; - CUDF_EXPECTS(inserted, "child column insertion failed, duplicate column name in the parent"); - if (not replaced) parent_col.column_order.push_back(name); - columns.try_emplace(this_col_id, std::ref(parent_col.child_columns.at(name))); - mapped_columns.try_emplace(std::make_pair(parent_col_id, name), this_col_id); - } - - if (is_enabled_mixed_types_as_string) { - // ignore all children of mixed type columns - for (auto const this_col_id : unique_col_ids) { - auto parent_col_id = column_parent_ids[this_col_id]; - if (parent_col_id != parent_node_sentinel and is_mixed_type_column[parent_col_id] == 1) { - is_mixed_type_column[this_col_id] = 1; - ignore_vals[this_col_id] = 1; - columns.erase(this_col_id); - } - // Convert only mixed type columns as string (so to copy), but not its children - if (parent_col_id != parent_node_sentinel and is_mixed_type_column[parent_col_id] == 0 and - is_mixed_type_column[this_col_id] == 1) - column_categories[this_col_id] = NC_STR; - } - cudaMemcpyAsync(d_column_tree.node_categories.begin(), - column_categories.data(), - column_categories.size() * sizeof(column_categories[0]), - cudaMemcpyDefault, - stream.value()); - } - - // restore unique_col_ids order - std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) { - return thrust::get<1>(a) < thrust::get<1>(b); - }); - // move columns data to device. 
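// A minimal host-side sketch of the "sort, build, restore" pattern used above (hypothetical
// data, std::pair instead of the zipped device vectors): columns are first ordered by their
// byte range so the tree is built in field order, then sorted back by column id so the
// following steps see the original ordering again.
#include <algorithm>
#include <utility>
#include <vector>

int main()
{
  // first = column_range_begin, second = column id
  std::vector<std::pair<int, int>> cols{{40, 0}, {10, 1}, {25, 2}};
  std::sort(cols.begin(), cols.end());  // field order: by range begin
  // ... build the column tree while walking cols in this order ...
  std::sort(cols.begin(), cols.end(),
            [](auto const& a, auto const& b) { return a.second < b.second; });  // restore order
  return 0;
}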
- auto columns_data = cudf::detail::make_host_vector(num_columns, stream); - for (auto& [col_id, col_ref] : columns) { - if (col_id == parent_node_sentinel) continue; - auto& col = col_ref.get(); - columns_data[col_id] = json_column_data{col.string_offsets.data(), - col.string_lengths.data(), - col.child_offsets.data(), - static_cast(col.validity.data())}; - } - - auto d_ignore_vals = cudf::detail::make_device_uvector_async( - ignore_vals, stream, cudf::get_current_device_resource_ref()); - auto d_columns_data = cudf::detail::make_device_uvector_async( - columns_data, stream, cudf::get_current_device_resource_ref()); - - // 3. scatter string offsets to respective columns, set validity bits - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::counting_iterator(0), - num_nodes, - [column_categories = d_column_tree.node_categories.begin(), - col_ids = col_ids.begin(), - row_offsets = row_offsets.begin(), - range_begin = tree.node_range_begin.begin(), - range_end = tree.node_range_end.begin(), - d_ignore_vals = d_ignore_vals.begin(), - d_columns_data = d_columns_data.begin()] __device__(size_type i) { - if (d_ignore_vals[col_ids[i]]) return; - auto const node_category = column_categories[col_ids[i]]; - switch (node_category) { - case NC_STRUCT: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break; - case NC_LIST: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break; - case NC_STR: [[fallthrough]]; - case NC_VAL: - if (d_ignore_vals[col_ids[i]]) break; - set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); - d_columns_data[col_ids[i]].string_offsets[row_offsets[i]] = range_begin[i]; - d_columns_data[col_ids[i]].string_lengths[row_offsets[i]] = range_end[i] - range_begin[i]; - break; - default: break; - } - }); - - // 4. scatter List offset - // copy_if only node's whose parent is list, (node_id, parent_col_id) - // stable_sort by parent_col_id of {node_id}. - // For all unique parent_node_id of (i==0, i-1!=i), write start offset. - // (i==last, i+1!=i), write end offset. - // unique_copy_by_key {parent_node_id} {row_offset} to - // col[parent_col_id].child_offsets[row_offset[parent_node_id]] - - auto& parent_col_ids = sorted_col_ids; // reuse sorted_col_ids - auto parent_col_id = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - cuda::proclaim_return_type( - [col_ids = col_ids.begin(), - parent_node_ids = tree.parent_node_ids.begin()] __device__(size_type node_id) { - return parent_node_ids[node_id] == parent_node_sentinel ? 
parent_node_sentinel - : col_ids[parent_node_ids[node_id]]; - })); - auto const list_children_end = thrust::copy_if( - rmm::exec_policy(stream), - thrust::make_zip_iterator(thrust::make_counting_iterator(0), parent_col_id), - thrust::make_zip_iterator(thrust::make_counting_iterator(0), parent_col_id) + - num_nodes, - thrust::make_counting_iterator(0), - thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()), - [d_ignore_vals = d_ignore_vals.begin(), - parent_node_ids = tree.parent_node_ids.begin(), - column_categories = d_column_tree.node_categories.begin(), - col_ids = col_ids.begin()] __device__(size_type node_id) { - auto parent_node_id = parent_node_ids[node_id]; - return parent_node_id != parent_node_sentinel and - column_categories[col_ids[parent_node_id]] == NC_LIST and - (!d_ignore_vals[col_ids[parent_node_id]]); - }); - - auto const num_list_children = - list_children_end - thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()); - thrust::stable_sort_by_key(rmm::exec_policy(stream), - parent_col_ids.begin(), - parent_col_ids.begin() + num_list_children, - node_ids.begin()); - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - num_list_children, - [node_ids = node_ids.begin(), - parent_node_ids = tree.parent_node_ids.begin(), - parent_col_ids = parent_col_ids.begin(), - row_offsets = row_offsets.begin(), - d_columns_data = d_columns_data.begin(), - num_list_children] __device__(size_type i) { - auto const node_id = node_ids[i]; - auto const parent_node_id = parent_node_ids[node_id]; - // scatter to list_offset - if (i == 0 or parent_node_ids[node_ids[i - 1]] != parent_node_id) { - d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id]] = - row_offsets[node_id]; - } - // last value of list child_offset is its size. - if (i == num_list_children - 1 or parent_node_ids[node_ids[i + 1]] != parent_node_id) { - d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id] + 1] = - row_offsets[node_id] + 1; - } - }); - - // 5. scan on offsets. - for (auto& [id, col_ref] : columns) { - auto& col = col_ref.get(); - if (col.type == json_col_t::StringColumn) { - thrust::inclusive_scan(rmm::exec_policy_nosync(stream), - col.string_offsets.begin(), - col.string_offsets.end(), - col.string_offsets.begin(), - thrust::maximum{}); - } else if (col.type == json_col_t::ListColumn) { - thrust::inclusive_scan(rmm::exec_policy_nosync(stream), - col.child_offsets.begin(), - col.child_offsets.end(), - col.child_offsets.begin(), - thrust::maximum{}); - } - } - stream.synchronize(); -} - std::pair, std::vector> device_json_column_to_cudf_column( device_json_column& json_col, device_span d_input, @@ -982,39 +330,58 @@ std::pair, std::vector> device_json_co "string offset, string length mismatch"); rmm::device_uvector d_string_data(col_size, stream); // TODO how about directly storing pair in json_column? 
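// A minimal host-side sketch of the step-5 scan above (made-up values): scanning with a maximum
// operator lets rows that never had an offset scattered into them (nulls, empty lists) inherit
// the previous row's offset, producing a monotonically non-decreasing offsets array.
#include <algorithm>
#include <numeric>
#include <vector>

int main()
{
  std::vector<int> offsets{0, 5, 0, 0, 9, 0};
  std::inclusive_scan(offsets.begin(), offsets.end(), offsets.begin(),
                      [](int a, int b) { return std::max(a, b); });
  // offsets is now {0, 5, 5, 5, 9, 9}
  return offsets.back() == 9 ? 0 : 1;
}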
- auto offset_length_it = - thrust::make_zip_iterator(json_col.string_offsets.begin(), json_col.string_lengths.begin()); - data_type target_type{}; + auto [result_bitmask, null_count] = make_validity(json_col); - if (schema.has_value()) { + data_type target_type{}; + std::unique_ptr col{}; + if (options.normalize_whitespace && json_col.forced_as_string_column) { + CUDF_EXPECTS(prune_columns || options.mixed_types_as_string, + "Whitespace normalization of nested columns requested as string requires " + "either prune_columns or mixed_types_as_string to be enabled"); + auto [normalized_d_input, col_offsets, col_lengths] = + cudf::io::json::detail::normalize_whitespace( + d_input, json_col.string_offsets, json_col.string_lengths, stream, mr); + auto offset_length_it = thrust::make_zip_iterator(col_offsets.begin(), col_lengths.begin()); + target_type = data_type{type_id::STRING}; + // Convert strings to the inferred data type + col = parse_data(normalized_d_input.data(), + offset_length_it, + col_size, + target_type, + std::move(result_bitmask), + null_count, + options.view(), + stream, + mr); + } else { + auto offset_length_it = thrust::make_zip_iterator(json_col.string_offsets.begin(), + json_col.string_lengths.begin()); + if (schema.has_value()) { #ifdef NJP_DEBUG_PRINT - std::cout << "-> explicit type: " - << (schema.has_value() ? std::to_string(static_cast(schema->type.id())) - : "n/a"); + std::cout << "-> explicit type: " + << (schema.has_value() ? std::to_string(static_cast(schema->type.id())) + : "n/a"); #endif - target_type = schema.value().type; - } else if (json_col.forced_as_string_column) { - target_type = data_type{type_id::STRING}; - } - // Infer column type, if we don't have an explicit type for it - else { - target_type = cudf::io::detail::infer_data_type( - options.json_view(), d_input, offset_length_it, col_size, stream); + target_type = schema.value().type; + } + // Infer column type, if we don't have an explicit type for it + else { + target_type = cudf::io::detail::infer_data_type( + options.json_view(), d_input, offset_length_it, col_size, stream); + } + // Convert strings to the inferred data type + col = parse_data(d_input.data(), + offset_length_it, + col_size, + target_type, + std::move(result_bitmask), + null_count, + options.view(), + stream, + mr); } - auto [result_bitmask, null_count] = make_validity(json_col); - // Convert strings to the inferred data type - auto col = parse_data(d_input.data(), - offset_length_it, - col_size, - target_type, - std::move(result_bitmask), - null_count, - options.view(), - stream, - mr); - // Reset nullable if we do not have nulls // This is to match the existing JSON reader's behaviour: // - Non-string columns will always be returned as nullable @@ -1120,11 +487,15 @@ table_with_metadata device_parse_nested_json(device_span d_input, const auto [tokens_gpu, token_indices_gpu] = get_token_stream(d_input, options, stream, cudf::get_current_device_resource_ref()); // gpu tree generation - return get_tree_representation(tokens_gpu, - token_indices_gpu, - options.is_enabled_mixed_types_as_string(), - stream, - cudf::get_current_device_resource_ref()); + // Note that to normalize whitespaces in nested columns coerced to be string, we need the column + // to either be of mixed type or we need to request the column to be returned as string by + // pruning it with the STRING dtype + return get_tree_representation( + tokens_gpu, + token_indices_gpu, + options.is_enabled_mixed_types_as_string() || options.is_enabled_prune_columns(), + stream, 
+ cudf::get_current_device_resource_ref()); }(); // IILE used to free memory of token data. #ifdef NJP_DEBUG_PRINT auto h_input = cudf::detail::make_host_vector_async(d_input, stream); diff --git a/cpp/src/io/json/json_normalization.cu b/cpp/src/io/json/json_normalization.cu index 97d5884fef1..2d435dc8e1a 100644 --- a/cpp/src/io/json/json_normalization.cu +++ b/cpp/src/io/json/json_normalization.cu @@ -17,6 +17,7 @@ #include "io/fst/lookup_tables.cuh" #include +#include #include #include #include @@ -25,8 +26,17 @@ #include #include #include - +#include + +#include +#include +#include +#include +#include +#include #include +#include +#include #include #include @@ -215,14 +225,6 @@ std::array, NUM_SYMBOL_GROUPS - 1> const wna_sgs{ * | state is necessary to process escaped double-quote characters. Without this * | state, whitespaces following escaped double quotes inside strings may be removed. * - * NOTE: An important case NOT handled by this FST is that of whitespace following newline - * characters within a string. Consider the following example - * Input: {"a":"x\n y"} - * FST output: {"a":"x\ny"} - * Expected output: {"a":"x\n y"} - * Such strings are not part of the JSON standard (characters allowed within quotes should - * have ASCII at least 0x20 i.e. space character and above) but may be encountered while - * reading JSON files */ enum class dfa_states : StateT { TT_OOS = 0U, TT_DQS, TT_DEC, TT_NUM_STATES }; // Aliases for readability of the transition table @@ -255,17 +257,17 @@ struct TransduceToNormalizedWS { // Let the alphabet set be Sigma // --------------------------------------- // ---------- NON-SPECIAL CASES: ---------- - // Output symbol same as input symbol + // Input symbol translates to output symbol // state | read_symbol -> output_symbol - // DQS | Sigma -> Sigma - // OOS | Sigma\{,\t} -> Sigma\{,\t} - // DEC | Sigma -> Sigma + // DQS | Sigma -> + // OOS | Sigma\{,\t} -> + // DEC | Sigma -> // ---------- SPECIAL CASES: -------------- - // Input symbol translates to output symbol - // OOS | {} -> - // OOS | {\t} -> + // Output symbol same as input symbol + // OOS | {} -> {} + // OOS | {\t} -> {\t} - // Case when read symbol is a space or tab but is unquoted + // Case when read symbol is not an unquoted space or tab // This will be the same condition as in `operator()(state_id, match_id, read_symbol)` function // However, since there is no output in this case i.e. the count returned by // operator()(state_id, match_id, read_symbol) is zero, this function is never called. 
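// A minimal host-side model of the three-state whitespace machine described above (OOS =
// outside string, DQS = inside a double-quoted string, DEC = just saw a backslash inside a
// string). The real work is done by the GPU FST; this sketch only shows which character
// positions the new transducer reports, namely unquoted spaces and tabs.
#include <cstddef>
#include <string>
#include <vector>

std::vector<std::size_t> unquoted_whitespace_indices(std::string const& json)
{
  enum class state { OOS, DQS, DEC };
  std::vector<std::size_t> out;
  state s = state::OOS;
  for (std::size_t i = 0; i < json.size(); ++i) {
    char const c = json[i];
    switch (s) {
      case state::OOS:
        if (c == '"') { s = state::DQS; }
        else if (c == ' ' || c == '\t') { out.push_back(i); }  // unquoted whitespace
        break;
      case state::DQS:
        if (c == '\\') { s = state::DEC; }
        else if (c == '"') { s = state::OOS; }
        break;
      case state::DEC:
        s = state::DQS;  // the escaped character itself is kept verbatim
        break;
    }
  }
  return out;
}

int main()
{
  // reports the spaces after ':' and ',' but not the one inside "x y"
  auto const idx = unquoted_whitespace_indices(R"({"a": "x y",  "b": 1})");
  return idx.empty() ? 1 : 0;
}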
@@ -287,8 +289,8 @@ struct TransduceToNormalizedWS { SymbolT const read_symbol) const { // Case when read symbol is a space or tab but is unquoted - if (match_id == static_cast(dfa_symbol_group_id::WHITESPACE_SYMBOLS) && - state_id == static_cast(dfa_states::TT_OOS)) { + if (!(match_id == static_cast(dfa_symbol_group_id::WHITESPACE_SYMBOLS) && + state_id == static_cast(dfa_states::TT_OOS))) { return 0; } return 1; @@ -328,33 +330,126 @@ void normalize_single_quotes(datasource::owning_buffer& inda std::swap(indata, outdata); } -void normalize_whitespace(datasource::owning_buffer& indata, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +std:: + tuple, rmm::device_uvector, rmm::device_uvector> + normalize_whitespace(device_span d_input, + device_span col_offsets, + device_span col_lengths, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); - static constexpr std::int32_t min_out = 0; - static constexpr std::int32_t max_out = 2; + /* + * Algorithm: + 1. Create a single buffer by concatenating the rows of the string column. Create segment offsets + and lengths array for concatenated buffer + 2. Run a whitespace normalization FST that performs NOP for non-whitespace and quoted + whitespace characters, and outputs indices of unquoted whitespace characters + 3. Update segment lengths based on the number of output indices between segment offsets + 4. Remove characters at output indices from concatenated buffer. + 5. Return updated buffer, segment lengths and updated segment offsets + */ + auto inbuf_lengths = cudf::detail::make_device_uvector_async( + col_lengths, stream, cudf::get_current_device_resource_ref()); + size_t inbuf_lengths_size = inbuf_lengths.size(); + size_type inbuf_size = + thrust::reduce(rmm::exec_policy_nosync(stream), inbuf_lengths.begin(), inbuf_lengths.end()); + rmm::device_uvector inbuf(inbuf_size, stream); + rmm::device_uvector inbuf_offsets(inbuf_lengths_size, stream); + thrust::exclusive_scan(rmm::exec_policy_nosync(stream), + inbuf_lengths.begin(), + inbuf_lengths.end(), + inbuf_offsets.begin(), + 0); + + auto input_it = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + cuda::proclaim_return_type( + [d_input = d_input.begin(), col_offsets = col_offsets.begin()] __device__( + size_t i) -> char const* { return &d_input[col_offsets[i]]; })); + auto output_it = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + cuda::proclaim_return_type( + [inbuf = inbuf.begin(), inbuf_offsets = inbuf_offsets.cbegin()] __device__( + size_t i) -> char* { return &inbuf[inbuf_offsets[i]]; })); + + { + // cub device batched copy + size_t temp_storage_bytes = 0; + cub::DeviceCopy::Batched(nullptr, + temp_storage_bytes, + input_it, + output_it, + inbuf_lengths.begin(), + inbuf_lengths_size, + stream.value()); + rmm::device_buffer temp_storage(temp_storage_bytes, stream); + cub::DeviceCopy::Batched(temp_storage.data(), + temp_storage_bytes, + input_it, + output_it, + inbuf_lengths.begin(), + inbuf_lengths_size, + stream.value()); + } + + // whitespace normalization : get the indices of the unquoted whitespace characters auto parser = fst::detail::make_fst(fst::detail::make_symbol_group_lut(normalize_whitespace::wna_sgs), fst::detail::make_transition_table(normalize_whitespace::wna_state_tt), - fst::detail::make_translation_functor( + fst::detail::make_translation_functor( normalize_whitespace::TransduceToNormalizedWS{}), stream); - rmm::device_buffer outbuf(indata.size(), stream, mr); - 
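// A minimal host-side sketch of the bookkeeping behind steps 3 and 4 of the algorithm above
// (hypothetical data, std:: algorithms instead of thrust/cub on the device): each reported
// whitespace index is assigned to its owning row with an upper_bound on the segment offsets,
// that row's length is decremented, and the flagged characters are dropped from the buffer.
#include <algorithm>
#include <cstddef>
#include <iterator>
#include <string>
#include <vector>

int main()
{
  std::string buf = "{ \"a\":1}{\"b\": 2}";      // two concatenated rows
  std::vector<std::size_t> offsets{0, 8};        // row starts in buf
  std::vector<std::size_t> lengths{8, 8};        // row lengths
  std::vector<std::size_t> drop{1, 13};          // indices of unquoted whitespace

  std::vector<char> keep(buf.size(), 1);
  for (auto i : drop) {
    auto const it  = std::upper_bound(offsets.begin(), offsets.end(), i);
    auto const row = static_cast<std::size_t>(std::distance(offsets.begin(), it)) - 1;
    --lengths[row];  // this row just lost one character
    keep[i] = 0;     // mark the character for removal
  }
  std::string out;
  for (std::size_t i = 0; i < buf.size(); ++i) {
    if (keep[i]) { out.push_back(buf[i]); }
  }
  // out == "{\"a\":1}{\"b\":2}" and lengths == {7, 7}; new offsets follow from an exclusive scan
  return out.size() == buf.size() - drop.size() ? 0 : 1;
}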
rmm::device_scalar outbuf_size(stream, mr); - parser.Transduce(reinterpret_cast(indata.data()), - static_cast(indata.size()), - static_cast(outbuf.data()), + rmm::device_uvector outbuf_indices(inbuf.size(), stream, mr); + rmm::device_scalar outbuf_indices_size(stream, mr); + parser.Transduce(inbuf.data(), + static_cast(inbuf.size()), thrust::make_discard_iterator(), - outbuf_size.data(), + outbuf_indices.data(), + outbuf_indices_size.data(), normalize_whitespace::start_state, stream); - outbuf.resize(outbuf_size.value(stream), stream); - datasource::owning_buffer outdata(std::move(outbuf)); - std::swap(indata, outdata); + auto const num_deletions = outbuf_indices_size.value(stream); + outbuf_indices.resize(num_deletions, stream); + + // now these indices need to be removed + // TODO: is there a better way to do this? + thrust::for_each( + rmm::exec_policy_nosync(stream), + outbuf_indices.begin(), + outbuf_indices.end(), + [inbuf_offsets_begin = inbuf_offsets.begin(), + inbuf_offsets_end = inbuf_offsets.end(), + inbuf_lengths = inbuf_lengths.begin()] __device__(size_type idx) { + auto it = thrust::upper_bound(thrust::seq, inbuf_offsets_begin, inbuf_offsets_end, idx); + auto pos = thrust::distance(inbuf_offsets_begin, it) - 1; + cuda::atomic_ref ref{*(inbuf_lengths + pos)}; + ref.fetch_add(-1, cuda::std::memory_order_relaxed); + }); + + auto stencil = cudf::detail::make_zeroed_device_uvector_async( + static_cast(inbuf_size), stream, cudf::get_current_device_resource_ref()); + thrust::scatter(rmm::exec_policy_nosync(stream), + thrust::make_constant_iterator(true), + thrust::make_constant_iterator(true) + num_deletions, + outbuf_indices.begin(), + stencil.begin()); + thrust::remove_if(rmm::exec_policy_nosync(stream), + inbuf.begin(), + inbuf.end(), + stencil.begin(), + thrust::identity()); + inbuf.resize(inbuf_size - num_deletions, stream); + + thrust::exclusive_scan(rmm::exec_policy_nosync(stream), + inbuf_lengths.begin(), + inbuf_lengths.end(), + inbuf_offsets.begin(), + 0); + + stream.synchronize(); + return std::tuple{std::move(inbuf), std::move(inbuf_offsets), std::move(inbuf_lengths)}; } } // namespace detail diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 75639a0438f..83f71e657a7 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -299,22 +299,58 @@ get_array_children_indices(TreeDepthT row_array_children_level, device_span node_levels, device_span parent_node_ids, rmm::cuda_stream_view stream); + /** - * @brief Reduce node tree into column tree by aggregating each property of column. + * @brief Reduces node tree representation to column tree representation. 
* - * @param tree json node tree to reduce (modified in-place, but restored to original state) - * @param col_ids column ids of each node (modified in-place, but restored to original state) - * @param row_offsets row offsets of each node (modified in-place, but restored to original state) - * @param stream The CUDA stream to which kernels are dispatched - * @return A tuple containing the column tree, identifier for each column and the maximum row index - * in each column + * @param tree Node tree representation of JSON string + * @param original_col_ids Column ids of nodes + * @param sorted_col_ids Sorted column ids of nodes + * @param ordered_node_ids Node ids of nodes sorted by column ids + * @param row_offsets Row offsets of nodes + * @param is_array_of_arrays Whether the tree is an array of arrays + * @param row_array_parent_col_id Column id of row array, if is_array_of_arrays is true + * @param stream CUDA stream used for device memory operations and kernel launches + * @return A tuple of column tree representation of JSON string, column ids of columns, and + * max row offsets of columns */ std::tuple, rmm::device_uvector> reduce_to_column_tree(tree_meta_t& tree, - device_span col_ids, + device_span original_col_ids, + device_span sorted_col_ids, + device_span ordered_node_ids, device_span row_offsets, + bool is_array_of_arrays, + NodeIndexT const row_array_parent_col_id, rmm::cuda_stream_view stream); - +/** + * @brief Constructs `d_json_column` from node tree representation + * Newly constructed columns are insert into `root`'s children. + * `root` must be a list type. + * + * @param input Input JSON string device data + * @param tree Node tree representation of the JSON string + * @param col_ids Column ids of the nodes in the tree + * @param row_offsets Row offsets of the nodes in the tree + * @param root Root node of the `d_json_column` tree + * @param is_array_of_arrays Whether the tree is an array of arrays + * @param options Parsing options specifying the parsing behaviour + * options affecting behaviour are + * is_enabled_lines: Whether the input is a line-delimited JSON + * is_enabled_mixed_types_as_string: Whether to enable reading mixed types as string + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the device memory + * of child_offets and validity members of `d_json_column` + */ +void make_device_json_column(device_span input, + tree_meta_t& tree, + device_span col_ids, + device_span row_offsets, + device_json_column& root, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); /** * @brief Retrieves the parse_options to be used for type inference and type casting * diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 4e513d3495c..1c15e147b13 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -2079,10 +2079,12 @@ cudf::io::parse_options parsing_options(cudf::io::json_reader_options const& opt { auto parse_opts = cudf::io::parse_options{',', '\n', '\"', '.'}; - parse_opts.dayfirst = options.is_enabled_dayfirst(); - parse_opts.keepquotes = options.is_enabled_keep_quotes(); - parse_opts.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); - parse_opts.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); + parse_opts.dayfirst = options.is_enabled_dayfirst(); + parse_opts.keepquotes = 
options.is_enabled_keep_quotes(); + parse_opts.normalize_whitespace = options.is_enabled_normalize_whitespace(); + parse_opts.mixed_types_as_string = options.is_enabled_mixed_types_as_string(); + parse_opts.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); + parse_opts.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); std::vector na_values{"", "null"}; na_values.insert(na_values.end(), options.get_na_values().begin(), options.get_na_values().end()); parse_opts.trie_na = cudf::detail::create_serialized_trie(na_values, stream); diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index bd82b040359..99a5b17bce8 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -232,12 +232,6 @@ table_with_metadata read_batch(host_span> sources, normalize_single_quotes(bufview, stream, cudf::get_current_device_resource_ref()); } - // If input JSON buffer has unquoted spaces and tabs and option to normalize whitespaces is - // enabled, invoke pre-processing FST - if (reader_opts.is_enabled_normalize_whitespace()) { - normalize_whitespace(bufview, stream, cudf::get_current_device_resource_ref()); - } - auto buffer = cudf::device_span(reinterpret_cast(bufview.data()), bufview.size()); stream.synchronize(); diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index a3f91f6859b..9ed2929a70e 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -893,7 +893,7 @@ __device__ void gpuDecodeLevels(page_state_s* s, { bool has_repetition = s->col.max_level[level_type::REPETITION] > 0; - constexpr int batch_size = 32; + constexpr int batch_size = cudf::detail::warp_size; int cur_leaf_count = target_leaf_count; while (s->error == 0 && s->nz_count < target_leaf_count && s->input_value_count < s->num_input_values) { diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index 5d10472b0ae..7c985643887 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -203,10 +203,9 @@ struct SchemaElement { bool operator==(SchemaElement const& other) const { return type == other.type && converted_type == other.converted_type && - type_length == other.type_length && repetition_type == other.repetition_type && - name == other.name && num_children == other.num_children && - decimal_scale == other.decimal_scale && decimal_precision == other.decimal_precision && - field_id == other.field_id; + type_length == other.type_length && name == other.name && + num_children == other.num_children && decimal_scale == other.decimal_scale && + decimal_precision == other.decimal_precision && field_id == other.field_id; } // the parquet format is a little squishy when it comes to interpreting diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 125d35f6499..1390339c1ae 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -400,7 +400,8 @@ struct ColumnChunkDesc { int32_t src_col_schema_, column_chunk_info const* chunk_info_, float list_bytes_per_row_est_, - bool strings_to_categorical_) + bool strings_to_categorical_, + int32_t src_file_idx_) : compressed_data(compressed_data_), compressed_size(compressed_size_), num_values(num_values_), @@ -419,7 +420,8 @@ struct ColumnChunkDesc { src_col_schema(src_col_schema_), h_chunk_info(chunk_info_), list_bytes_per_row_est(list_bytes_per_row_est_), - is_strings_to_cat(strings_to_categorical_) + is_strings_to_cat(strings_to_categorical_), + 
src_file_idx(src_file_idx_) { } @@ -456,6 +458,7 @@ struct ColumnChunkDesc { bool is_strings_to_cat{}; // convert strings to hashes bool is_large_string_col{}; // `true` if string data uses 64-bit offsets + int32_t src_file_idx{}; // source file index }; /** diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index 245e1829c72..c588fedb85c 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -1511,10 +1511,13 @@ void reader::impl::create_global_chunk_info() std::transform( _input_columns.begin(), _input_columns.end(), column_mapping.begin(), [&](auto const& col) { // translate schema_idx into something we can use for the page indexes - if (auto it = std::find_if( - columns.begin(), - columns.end(), - [&col](auto const& col_chunk) { return col_chunk.schema_idx == col.schema_idx; }); + if (auto it = std::find_if(columns.begin(), + columns.end(), + [&](auto const& col_chunk) { + return col_chunk.schema_idx == + _metadata->map_schema_index(col.schema_idx, + rg.source_index); + }); it != columns.end()) { return std::distance(columns.begin(), it); } @@ -1535,7 +1538,8 @@ void reader::impl::create_global_chunk_info() auto col = _input_columns[i]; // look up metadata auto& col_meta = _metadata->get_column_metadata(rg.index, rg.source_index, col.schema_idx); - auto& schema = _metadata->get_schema(col.schema_idx); + auto& schema = _metadata->get_schema( + _metadata->map_schema_index(col.schema_idx, rg.source_index), rg.source_index); auto [clock_rate, logical_type] = conversion_info(to_type_id(schema, _strings_to_categorical, _options.timestamp_type.id()), @@ -1574,9 +1578,9 @@ void reader::impl::create_global_chunk_info() col.schema_idx, chunk_info, list_bytes_per_row_est, - schema.type == BYTE_ARRAY and _strings_to_categorical)); + schema.type == BYTE_ARRAY and _strings_to_categorical, + rg.source_index)); } - // Adjust for skip_rows when updating the remaining rows after the first group remaining_rows -= (skip_rows) ? std::min(rg.start_row + row_group.num_rows - skip_rows, remaining_rows) diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index 8b5678f202b..6d566b5815e 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -423,8 +423,13 @@ void aggregate_reader_metadata::column_info_for_row_group(row_group_info& rg_inf std::vector chunks(rg.columns.size()); for (size_t col_idx = 0; col_idx < rg.columns.size(); col_idx++) { - auto const& col_chunk = rg.columns[col_idx]; - auto& schema = get_schema(col_chunk.schema_idx); + auto const& col_chunk = rg.columns[col_idx]; + auto const is_schema_idx_mapped = + is_schema_index_mapped(col_chunk.schema_idx, rg_info.source_index); + auto const mapped_schema_idx = is_schema_idx_mapped + ? map_schema_index(col_chunk.schema_idx, rg_info.source_index) + : col_chunk.schema_idx; + auto& schema = get_schema(mapped_schema_idx, is_schema_idx_mapped ? rg_info.source_index : 0); auto const max_def_level = schema.max_definition_level; auto const max_rep_level = schema.max_repetition_level; @@ -559,22 +564,40 @@ aggregate_reader_metadata::aggregate_reader_metadata( num_rows(calc_num_rows()), num_row_groups(calc_num_row_groups()) { - // Validate that all sources have the same schema unless we are reading select columns - // from mismatched sources, in which case, we will only check the projected columns later. 
- if (per_file_metadata.size() > 1 and not has_cols_from_mismatched_srcs) { - auto const& first_meta = per_file_metadata.front(); + if (per_file_metadata.size() > 1) { + auto& first_meta = per_file_metadata.front(); auto const num_cols = first_meta.row_groups.size() > 0 ? first_meta.row_groups.front().columns.size() : 0; - auto const& schema = first_meta.schema; - - // Verify that the input files have matching numbers of columns and schema. - for (auto const& pfm : per_file_metadata) { - if (pfm.row_groups.size() > 0) { - CUDF_EXPECTS(num_cols == pfm.row_groups.front().columns.size(), - "All sources must have the same number of columns"); + auto& schema = first_meta.schema; + + // Validate that all sources have the same schema unless we are reading select columns + // from mismatched sources, in which case, we will only check the projected columns later. + if (not has_cols_from_mismatched_srcs) { + // Verify that the input files have matching numbers of columns and schema. + for (auto const& pfm : per_file_metadata) { + if (pfm.row_groups.size() > 0) { + CUDF_EXPECTS(num_cols == pfm.row_groups.front().columns.size(), + "All sources must have the same number of columns"); + } + CUDF_EXPECTS(schema == pfm.schema, "All sources must have the same schema"); } - CUDF_EXPECTS(schema == pfm.schema, "All sources must have the same schema"); } + + // Mark the column schema in the first (default) source as nullable if it is nullable in any of + // the input sources. This avoids recomputing this within build_column() and + // populate_metadata(). + std::for_each( + thrust::make_counting_iterator(static_cast(1)), + thrust::make_counting_iterator(schema.size()), + [&](auto const schema_idx) { + if (schema[schema_idx].repetition_type == REQUIRED and + std::any_of( + per_file_metadata.begin() + 1, per_file_metadata.end(), [&](auto const& pfm) { + return pfm.schema[schema_idx].repetition_type != REQUIRED; + })) { + schema[schema_idx].repetition_type = OPTIONAL; + } + }); } // Collect and apply arrow:schema from Parquet's key value metadata section @@ -884,15 +907,8 @@ ColumnChunkMetaData const& aggregate_reader_metadata::get_column_metadata(size_t size_type src_idx, int schema_idx) const { - // schema_idx_maps will only have > 0 size when we are reading matching column projection from - // mismatched Parquet sources. - if (src_idx and not schema_idx_maps.empty()) { - auto const& schema_idx_map = schema_idx_maps[src_idx - 1]; - CUDF_EXPECTS(schema_idx_map.find(schema_idx) != schema_idx_map.end(), - "Unmapped schema index encountered in the specified source tree", - std::range_error); - schema_idx = schema_idx_map.at(schema_idx); - } + // Map schema index to the provided source file index + schema_idx = map_schema_index(schema_idx, src_idx); auto col = std::find_if(per_file_metadata[src_idx].row_groups[row_group_index].columns.begin(), @@ -924,6 +940,46 @@ aggregate_reader_metadata::get_rowgroup_metadata() const return rg_metadata; } +bool aggregate_reader_metadata::is_schema_index_mapped(int schema_idx, int pfm_idx) const +{ + // Check if schema_idx or pfm_idx is invalid + CUDF_EXPECTS( + schema_idx >= 0 and pfm_idx >= 0 and pfm_idx < static_cast(per_file_metadata.size()), + "Parquet reader encountered an invalid schema_idx or pfm_idx", + std::out_of_range); + + // True if root index requested or zeroth file index or schema_idx maps doesn't exist. (i.e. + // schemas are identical). 
+ if (schema_idx == 0 or pfm_idx == 0 or schema_idx_maps.empty()) { return true; } + + // Check if mapped + auto const& schema_idx_map = schema_idx_maps[pfm_idx - 1]; + return schema_idx_map.find(schema_idx) != schema_idx_map.end(); +} + +int aggregate_reader_metadata::map_schema_index(int schema_idx, int pfm_idx) const +{ + // Check if schema_idx or pfm_idx is invalid + CUDF_EXPECTS( + schema_idx >= 0 and pfm_idx >= 0 and pfm_idx < static_cast(per_file_metadata.size()), + "Parquet reader encountered an invalid schema_idx or pfm_idx", + std::out_of_range); + + // Check if pfm_idx is zero or root index requested or schema_idx_maps doesn't exist (i.e. + // schemas are identical). + if (schema_idx == 0 or pfm_idx == 0 or schema_idx_maps.empty()) { return schema_idx; } + + // schema_idx_maps will only have > 0 size when we are reading matching column projection from + // mismatched Parquet sources. + auto const& schema_idx_map = schema_idx_maps[pfm_idx - 1]; + CUDF_EXPECTS(schema_idx_map.find(schema_idx) != schema_idx_map.end(), + "Unmapped schema index encountered in the specified source tree", + std::out_of_range); + + // Return the mapped schema idx. + return schema_idx_map.at(schema_idx); +} + std::string aggregate_reader_metadata::get_pandas_index() const { // Assumes that all input files have the same metadata @@ -1185,8 +1241,8 @@ aggregate_reader_metadata::select_columns( // Compares two schema elements to be equal except their number of children auto const equal_to_except_num_children = [](SchemaElement const& lhs, SchemaElement const& rhs) { return lhs.type == rhs.type and lhs.converted_type == rhs.converted_type and - lhs.type_length == rhs.type_length and lhs.repetition_type == rhs.repetition_type and - lhs.name == rhs.name and lhs.decimal_scale == rhs.decimal_scale and + lhs.type_length == rhs.type_length and lhs.name == rhs.name and + lhs.decimal_scale == rhs.decimal_scale and lhs.decimal_precision == rhs.decimal_precision and lhs.field_id == rhs.field_id; }; @@ -1209,6 +1265,11 @@ aggregate_reader_metadata::select_columns( "the selected path", std::invalid_argument); + // Get the schema_idx_map for this data source (pfm) + auto& schema_idx_map = schema_idx_maps[pfm_idx - 1]; + // Map the schema index from 0th tree (src) to the one in the current (dst) tree. + schema_idx_map[src_schema_idx] = dst_schema_idx; + // If src_schema_elem is a stub, it does not exist in the column_name_info and column_buffer // hierarchy. So continue on with mapping. if (src_schema_elem.is_stub()) { @@ -1262,15 +1323,6 @@ aggregate_reader_metadata::select_columns( pfm_idx); }); } - - // We're at a leaf and this is an input column (one with actual data stored) so map it. - if (src_schema_elem.num_children == 0) { - // Get the schema_idx_map for this data source (pfm) - auto& schema_idx_map = schema_idx_maps[pfm_idx - 1]; - - // Map the schema index from 0th tree (src) to the one in the current (dst) tree. 
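// A minimal host-side stand-in for the mapping rules described above (not the real
// aggregate_reader_metadata member): every non-zeroth source carries a map from the zeroth
// file's schema indices to its own, and the lookup falls back to the identity for the root
// index, the zeroth file, or when no maps were built because the schemas already match.
#include <stdexcept>
#include <unordered_map>
#include <vector>

int map_schema_index_model(int schema_idx,
                           int pfm_idx,
                           std::vector<std::unordered_map<int, int>> const& schema_idx_maps)
{
  if (schema_idx == 0 || pfm_idx == 0 || schema_idx_maps.empty()) { return schema_idx; }
  auto const& m = schema_idx_maps[pfm_idx - 1];  // one map per non-zeroth source
  auto const it = m.find(schema_idx);
  if (it == m.end()) { throw std::out_of_range{"unmapped schema index"}; }
  return it->second;
}

int main()
{
  // the column at schema index 3 in file 0 lives at schema index 5 in file 1
  std::vector<std::unordered_map<int, int>> maps{{{3, 5}}};
  return map_schema_index_model(3, 1, maps) == 5 ? 0 : 1;
}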
- schema_idx_map[src_schema_idx] = dst_schema_idx; - } }; std::vector output_column_schemas; diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp index 6f2863136b2..6487c92f48f 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.hpp +++ b/cpp/src/io/parquet/reader_impl_helpers.hpp @@ -234,6 +234,26 @@ class aggregate_reader_metadata { [[nodiscard]] auto get_num_row_groups() const { return num_row_groups; } + /** + * @brief Checks if a schema index from 0th source is mapped to the specified file index + * + * @param schema_idx The index of the SchemaElement in the zeroth file. + * @param pfm_idx The index of the file (per_file_metadata) to check mappings for. + * + * @return True if schema index is mapped + */ + [[nodiscard]] bool is_schema_index_mapped(int schema_idx, int pfm_idx) const; + + /** + * @brief Maps schema index from 0th source file to the specified file index + * + * @param schema_idx The index of the SchemaElement in the zeroth file. + * @param pfm_idx The index of the file (per_file_metadata) to map the schema_idx to. + * + * @return Mapped schema index + */ + [[nodiscard]] int map_schema_index(int schema_idx, int pfm_idx) const; + /** * @brief Extracts the schema_idx'th SchemaElement from the pfm_idx'th file * @@ -248,7 +268,7 @@ class aggregate_reader_metadata { CUDF_EXPECTS( schema_idx >= 0 and pfm_idx >= 0 and pfm_idx < static_cast(per_file_metadata.size()), "Parquet reader encountered an invalid schema_idx or pfm_idx", - std::invalid_argument); + std::out_of_range); return per_file_metadata[pfm_idx].schema[schema_idx]; } @@ -256,7 +276,10 @@ class aggregate_reader_metadata { [[nodiscard]] auto&& get_key_value_metadata() && { return std::move(keyval_maps); } /** - * @brief Gets the concrete nesting depth of output cudf columns + * @brief Gets the concrete nesting depth of output cudf columns. + * + * Gets the nesting depth of the output cudf column for the given schema. + * The nesting depth must be equal for the given schema_index across all sources. * * @param schema_index Schema index of the input column * diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 52918f5bc80..8e67f233213 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -79,23 +79,30 @@ void print_pages(cudf::detail::hostdevice_vector& pages, rmm::cuda_str * is indicated when adding new values. 
This function generates the mappings of * the R/D levels to those start/end bounds * - * @param remap Maps column schema index to the R/D remapping vectors for that column - * @param src_col_schema The column schema to generate the new mapping for + * @param remap Maps column schema index to the R/D remapping vectors for that column for a + * particular input source file + * @param src_col_schema The source column schema to generate the new mapping for + * @param mapped_src_col_schema Mapped column schema for src_file_idx'th file + * @param src_file_idx The input source file index for the column schema * @param md File metadata information */ -void generate_depth_remappings(std::map, std::vector>>& remap, - int src_col_schema, - aggregate_reader_metadata const& md) +void generate_depth_remappings( + std::map, std::pair, std::vector>>& remap, + int const src_col_schema, + int const mapped_src_col_schema, + int const src_file_idx, + aggregate_reader_metadata const& md) { // already generated for this level - if (remap.find(src_col_schema) != remap.end()) { return; } - auto schema = md.get_schema(src_col_schema); - int max_depth = md.get_output_nesting_depth(src_col_schema); + if (remap.find({src_col_schema, src_file_idx}) != remap.end()) { return; } + auto const& schema = md.get_schema(mapped_src_col_schema, src_file_idx); + auto const max_depth = md.get_output_nesting_depth(src_col_schema); - CUDF_EXPECTS(remap.find(src_col_schema) == remap.end(), + CUDF_EXPECTS(remap.find({src_col_schema, src_file_idx}) == remap.end(), "Attempting to remap a schema more than once"); auto inserted = - remap.insert(std::pair, std::vector>>{src_col_schema, {}}); + remap.insert(std::pair, std::pair, std::vector>>{ + {src_col_schema, src_file_idx}, {}}); auto& depth_remap = inserted.first->second; std::vector& rep_depth_remap = (depth_remap.first); @@ -136,15 +143,15 @@ void generate_depth_remappings(std::map, std::ve auto find_shallowest = [&](int r) { int shallowest = -1; int cur_depth = max_depth - 1; - int schema_idx = src_col_schema; + int schema_idx = mapped_src_col_schema; while (schema_idx > 0) { - auto cur_schema = md.get_schema(schema_idx); + auto& cur_schema = md.get_schema(schema_idx, src_file_idx); if (cur_schema.max_repetition_level == r) { // if this is a repeated field, map it one level deeper shallowest = cur_schema.is_stub() ? cur_depth + 1 : cur_depth; } // if it's one-level encoding list - else if (cur_schema.is_one_level_list(md.get_schema(cur_schema.parent_idx))) { + else if (cur_schema.is_one_level_list(md.get_schema(cur_schema.parent_idx, src_file_idx))) { shallowest = cur_depth - 1; } if (!cur_schema.is_stub()) { cur_depth--; } @@ -159,10 +166,10 @@ void generate_depth_remappings(std::map, std::ve for (int s_idx = schema.max_definition_level; s_idx >= 0; s_idx--) { auto find_deepest = [&](int d) { SchemaElement prev_schema; - int schema_idx = src_col_schema; + int schema_idx = mapped_src_col_schema; int r1 = 0; while (schema_idx > 0) { - SchemaElement cur_schema = md.get_schema(schema_idx); + SchemaElement cur_schema = md.get_schema(schema_idx, src_file_idx); if (cur_schema.max_definition_level == d) { // if this is a repeated field, map it one level deeper r1 = cur_schema.is_stub() ? prev_schema.max_repetition_level @@ -175,10 +182,10 @@ void generate_depth_remappings(std::map, std::ve // we now know R1 from above. 
return the deepest nesting level that has the // same repetition level - schema_idx = src_col_schema; + schema_idx = mapped_src_col_schema; int depth = max_depth - 1; while (schema_idx > 0) { - SchemaElement cur_schema = md.get_schema(schema_idx); + SchemaElement cur_schema = md.get_schema(schema_idx, src_file_idx); if (cur_schema.max_repetition_level == r1) { // if this is a repeated field, map it one level deeper depth = cur_schema.is_stub() ? depth + 1 : depth; @@ -783,9 +790,20 @@ void reader::impl::allocate_nesting_info() std::vector per_page_nesting_info_size(num_columns); auto iter = thrust::make_counting_iterator(size_type{0}); std::transform(iter, iter + num_columns, per_page_nesting_info_size.begin(), [&](size_type i) { + // Schema index of the current input column auto const schema_idx = _input_columns[i].schema_idx; - auto const& schema = _metadata->get_schema(schema_idx); - return max(schema.max_definition_level + 1, _metadata->get_output_nesting_depth(schema_idx)); + // Get the max_definition_level of this column across all sources. + auto max_definition_level = _metadata->get_schema(schema_idx).max_definition_level + 1; + std::for_each(thrust::make_counting_iterator(static_cast(1)), + thrust::make_counting_iterator(_sources.size()), + [&](auto const src_file_idx) { + auto const& schema = _metadata->get_schema( + _metadata->map_schema_index(schema_idx, src_file_idx), src_file_idx); + max_definition_level = + std::max(max_definition_level, schema.max_definition_level + 1); + }); + + return std::max(max_definition_level, _metadata->get_output_nesting_depth(schema_idx)); }); // compute total # of page_nesting infos needed and allocate space. doing this in one @@ -813,6 +831,8 @@ void reader::impl::allocate_nesting_info() page_nesting_decode_info.device_ptr() + src_info_index; pages[target_page_index + p_idx].nesting_info_size = per_page_nesting_info_size[idx]; + // Set the number of output nesting levels from the zeroth source as nesting must be + // identical across sources. 
pages[target_page_index + p_idx].num_output_nesting_levels = _metadata->get_output_nesting_depth(src_col_schema); @@ -821,25 +841,36 @@ void reader::impl::allocate_nesting_info() target_page_index += subpass.column_page_count[idx]; } + // Reset the target_page_index + target_page_index = 0; + // fill in int nesting_info_index = 0; - std::map, std::vector>> depth_remapping; for (size_t idx = 0; idx < _input_columns.size(); idx++) { auto const src_col_schema = _input_columns[idx].schema_idx; - // schema of the input column - auto& schema = _metadata->get_schema(src_col_schema); // real depth of the output cudf column hierarchy (1 == no nesting, 2 == 1 level, etc) + // nesting depth must be same across sources so getting it from the zeroth source is ok int const max_output_depth = _metadata->get_output_nesting_depth(src_col_schema); + // Map to store depths if this column has lists + std::map, std::pair, std::vector>> depth_remapping; // if this column has lists, generate depth remapping - std::map, std::vector>> depth_remapping; - if (schema.max_repetition_level > 0) { - generate_depth_remappings(depth_remapping, src_col_schema, *_metadata); - } + std::for_each( + thrust::make_counting_iterator(static_cast(0)), + thrust::make_counting_iterator(_sources.size()), + [&](auto const src_file_idx) { + auto const mapped_schema_idx = _metadata->map_schema_index(src_col_schema, src_file_idx); + if (_metadata->get_schema(mapped_schema_idx, src_file_idx).max_repetition_level > 0) { + generate_depth_remappings( + depth_remapping, src_col_schema, mapped_schema_idx, src_file_idx, *_metadata); + } + }); // fill in host-side nesting info - int schema_idx = src_col_schema; + int schema_idx = src_col_schema; + // This is okay as we only use this to check stubness of cur_schema and + // to get its parent's indices, both of which are one to one mapped. auto cur_schema = _metadata->get_schema(schema_idx); int cur_depth = max_output_depth - 1; while (schema_idx > 0) { @@ -848,6 +879,9 @@ void reader::impl::allocate_nesting_info() if (!cur_schema.is_stub()) { // initialize each page within the chunk for (size_t p_idx = 0; p_idx < subpass.column_page_count[idx]; p_idx++) { + // Source file index for the current page. + auto const src_file_idx = + pass.chunks[pages[target_page_index + p_idx].chunk_idx].src_file_idx; PageNestingInfo* pni = &page_nesting_info[nesting_info_index + (p_idx * per_page_nesting_info_size[idx])]; @@ -855,9 +889,11 @@ void reader::impl::allocate_nesting_info() &page_nesting_decode_info[nesting_info_index + (p_idx * per_page_nesting_info_size[idx])]; + auto const mapped_src_col_schema = + _metadata->map_schema_index(src_col_schema, src_file_idx); // if we have lists, set our start and end depth remappings - if (schema.max_repetition_level > 0) { - auto remap = depth_remapping.find(src_col_schema); + if (_metadata->get_schema(mapped_src_col_schema, src_file_idx).max_repetition_level > 0) { + auto remap = depth_remapping.find({src_col_schema, src_file_idx}); CUDF_EXPECTS(remap != depth_remapping.end(), "Could not find depth remapping for schema"); std::vector const& rep_depth_remap = (remap->second.first); @@ -871,11 +907,15 @@ void reader::impl::allocate_nesting_info() } } + // Get the schema from the current input source. 
+ auto& actual_cur_schema = _metadata->get_schema( + _metadata->map_schema_index(schema_idx, src_file_idx), src_file_idx); + // values indexed by output column index - nesting_info[cur_depth].max_def_level = cur_schema.max_definition_level; + nesting_info[cur_depth].max_def_level = actual_cur_schema.max_definition_level; pni[cur_depth].size = 0; pni[cur_depth].type = - to_type_id(cur_schema, _strings_to_categorical, _options.timestamp_type.id()); + to_type_id(actual_cur_schema, _strings_to_categorical, _options.timestamp_type.id()); pni[cur_depth].nullable = cur_schema.repetition_type == OPTIONAL; } @@ -888,6 +928,8 @@ void reader::impl::allocate_nesting_info() cur_schema = _metadata->get_schema(schema_idx); } + // Offset the page and nesting info indices + target_page_index += subpass.column_page_count[idx]; nesting_info_index += (per_page_nesting_info_size[idx] * subpass.column_page_count[idx]); } diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 81fd4ab9f82..ec05f35d405 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1819,8 +1819,14 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, auto const table_size = std::reduce(column_sizes.begin(), column_sizes.end()); auto const avg_row_len = util::div_rounding_up_safe(table_size, input.num_rows()); if (avg_row_len > 0) { - auto const rg_frag_size = util::div_rounding_up_safe(max_row_group_size, avg_row_len); - max_page_fragment_size = std::min(rg_frag_size, max_page_fragment_size); + // Ensure `rg_frag_size` is not bigger than size_type::max for default max_row_group_size + // value (=uint64::max) to avoid a sign overflow when comparing + auto const rg_frag_size = + std::min(std::numeric_limits::max(), + util::div_rounding_up_safe(max_row_group_size, avg_row_len)); + // Safe comparison as rg_frag_size fits in size_type + max_page_fragment_size = + std::min(static_cast(rg_frag_size), max_page_fragment_size); } // dividing page size by average row length will tend to overshoot the desired diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index bc2722441d0..734067582f7 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -67,6 +67,8 @@ struct parse_options_view { bool doublequote; bool dayfirst; bool skipblanklines; + bool normalize_whitespace; + bool mixed_types_as_string; cudf::detail::trie_view trie_true; cudf::detail::trie_view trie_false; cudf::detail::trie_view trie_na; @@ -85,6 +87,8 @@ struct parse_options { bool doublequote; bool dayfirst; bool skipblanklines; + bool normalize_whitespace; + bool mixed_types_as_string; cudf::detail::optional_trie trie_true; cudf::detail::optional_trie trie_false; cudf::detail::optional_trie trie_na; @@ -111,6 +115,8 @@ struct parse_options { doublequote, dayfirst, skipblanklines, + normalize_whitespace, + mixed_types_as_string, cudf::detail::make_trie_view(trie_true), cudf::detail::make_trie_view(trie_false), cudf::detail::make_trie_view(trie_na), diff --git a/cpp/src/join/conditional_join.cu b/cpp/src/join/conditional_join.cu index 748691fb7d1..2ec23e0dc6d 100644 --- a/cpp/src/join/conditional_join.cu +++ b/cpp/src/join/conditional_join.cu @@ -27,7 +27,6 @@ #include #include #include -#include #include #include @@ -377,16 +376,12 @@ conditional_inner_join(table_view const& left, table_view const& right, ast::expression const& binary_predicate, std::optional output_size, + rmm::cuda_stream_view stream, 
rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::conditional_join(left, - right, - binary_predicate, - detail::join_kind::INNER_JOIN, - output_size, - cudf::get_default_stream(), - mr); + return detail::conditional_join( + left, right, binary_predicate, detail::join_kind::INNER_JOIN, output_size, stream, mr); } std::pair>, @@ -395,16 +390,12 @@ conditional_left_join(table_view const& left, table_view const& right, ast::expression const& binary_predicate, std::optional output_size, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::conditional_join(left, - right, - binary_predicate, - detail::join_kind::LEFT_JOIN, - output_size, - cudf::get_default_stream(), - mr); + return detail::conditional_join( + left, right, binary_predicate, detail::join_kind::LEFT_JOIN, output_size, stream, mr); } std::pair>, @@ -412,16 +403,12 @@ std::pair>, conditional_full_join(table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::conditional_join(left, - right, - binary_predicate, - detail::join_kind::FULL_JOIN, - {}, - cudf::get_default_stream(), - mr); + return detail::conditional_join( + left, right, binary_predicate, detail::join_kind::FULL_JOIN, {}, stream, mr); } std::unique_ptr> conditional_left_semi_join( @@ -429,16 +416,12 @@ std::unique_ptr> conditional_left_semi_join( table_view const& right, ast::expression const& binary_predicate, std::optional output_size, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::conditional_join_anti_semi(left, - right, - binary_predicate, - detail::join_kind::LEFT_SEMI_JOIN, - output_size, - cudf::get_default_stream(), - mr); + return detail::conditional_join_anti_semi( + left, right, binary_predicate, detail::join_kind::LEFT_SEMI_JOIN, output_size, stream, mr); } std::unique_ptr> conditional_left_anti_join( @@ -446,64 +429,56 @@ std::unique_ptr> conditional_left_anti_join( table_view const& right, ast::expression const& binary_predicate, std::optional output_size, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::conditional_join_anti_semi(left, - right, - binary_predicate, - detail::join_kind::LEFT_ANTI_JOIN, - output_size, - cudf::get_default_stream(), - mr); + return detail::conditional_join_anti_semi( + left, right, binary_predicate, detail::join_kind::LEFT_ANTI_JOIN, output_size, stream, mr); } std::size_t conditional_inner_join_size(table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::compute_conditional_join_output_size( - left, right, binary_predicate, detail::join_kind::INNER_JOIN, cudf::get_default_stream(), mr); + left, right, binary_predicate, detail::join_kind::INNER_JOIN, stream, mr); } std::size_t conditional_left_join_size(table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::compute_conditional_join_output_size( - left, right, binary_predicate, detail::join_kind::LEFT_JOIN, cudf::get_default_stream(), mr); + left, right, binary_predicate, detail::join_kind::LEFT_JOIN, stream, mr); } std::size_t conditional_left_semi_join_size(table_view const& left, 
table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::compute_conditional_join_output_size(left, - right, - binary_predicate, - detail::join_kind::LEFT_SEMI_JOIN, - cudf::get_default_stream(), - mr); + return detail::compute_conditional_join_output_size( + left, right, binary_predicate, detail::join_kind::LEFT_SEMI_JOIN, stream, mr); } std::size_t conditional_left_anti_join_size(table_view const& left, table_view const& right, ast::expression const& binary_predicate, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::compute_conditional_join_output_size(left, - right, - binary_predicate, - detail::join_kind::LEFT_ANTI_JOIN, - cudf::get_default_stream(), - mr); + return detail::compute_conditional_join_output_size( + left, right, binary_predicate, detail::join_kind::LEFT_ANTI_JOIN, stream, mr); } } // namespace cudf diff --git a/cpp/src/join/conditional_join.hpp b/cpp/src/join/conditional_join.hpp index 4f6a9484e8c..303442e79ef 100644 --- a/cpp/src/join/conditional_join.hpp +++ b/cpp/src/join/conditional_join.hpp @@ -19,7 +19,6 @@ #include #include -#include #include #include diff --git a/cpp/src/join/cross_join.cu b/cpp/src/join/cross_join.cu index eeb49736bac..15594fb60e3 100644 --- a/cpp/src/join/cross_join.cu +++ b/cpp/src/join/cross_join.cu @@ -25,7 +25,6 @@ #include #include #include -#include #include #include @@ -75,10 +74,11 @@ std::unique_ptr cross_join(cudf::table_view const& left, std::unique_ptr cross_join(cudf::table_view const& left, cudf::table_view const& right, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::cross_join(left, right, cudf::get_default_stream(), mr); + return detail::cross_join(left, right, stream, mr); } } // namespace cudf diff --git a/cpp/src/join/join.cu b/cpp/src/join/join.cu index 0abff27667b..7b13c260364 100644 --- a/cpp/src/join/join.cu +++ b/cpp/src/join/join.cu @@ -20,7 +20,6 @@ #include #include #include -#include #include #include @@ -120,10 +119,11 @@ std::pair>, inner_join(table_view const& left, table_view const& right, null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::inner_join(left, right, compare_nulls, cudf::get_default_stream(), mr); + return detail::inner_join(left, right, compare_nulls, stream, mr); } std::pair>, @@ -131,10 +131,11 @@ std::pair>, left_join(table_view const& left, table_view const& right, null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::left_join(left, right, compare_nulls, cudf::get_default_stream(), mr); + return detail::left_join(left, right, compare_nulls, stream, mr); } std::pair>, @@ -142,10 +143,11 @@ std::pair>, full_join(table_view const& left, table_view const& right, null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::full_join(left, right, compare_nulls, cudf::get_default_stream(), mr); + return detail::full_join(left, right, compare_nulls, stream, mr); } } // namespace cudf diff --git a/cpp/src/join/mixed_join.cu b/cpp/src/join/mixed_join.cu index 8ff78dd47f4..820b81ee309 100644 --- a/cpp/src/join/mixed_join.cu +++ b/cpp/src/join/mixed_join.cu @@ -28,7 +28,6 @@ #include #include #include -#include #include #include @@ -484,6 +483,7 @@ 
mixed_inner_join( ast::expression const& binary_predicate, null_equality compare_nulls, std::optional>> const output_size_data, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -495,7 +495,7 @@ mixed_inner_join( compare_nulls, detail::join_kind::INNER_JOIN, output_size_data, - cudf::get_default_stream(), + stream, mr); } @@ -506,6 +506,7 @@ std::pair>> mixed_in table_view const& right_conditional, ast::expression const& binary_predicate, null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -516,7 +517,7 @@ std::pair>> mixed_in binary_predicate, compare_nulls, detail::join_kind::INNER_JOIN, - cudf::get_default_stream(), + stream, mr); } @@ -530,6 +531,7 @@ mixed_left_join( ast::expression const& binary_predicate, null_equality compare_nulls, std::optional>> const output_size_data, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -541,7 +543,7 @@ mixed_left_join( compare_nulls, detail::join_kind::LEFT_JOIN, output_size_data, - cudf::get_default_stream(), + stream, mr); } @@ -552,6 +554,7 @@ std::pair>> mixed_le table_view const& right_conditional, ast::expression const& binary_predicate, null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -562,7 +565,7 @@ std::pair>> mixed_le binary_predicate, compare_nulls, detail::join_kind::LEFT_JOIN, - cudf::get_default_stream(), + stream, mr); } @@ -576,6 +579,7 @@ mixed_full_join( ast::expression const& binary_predicate, null_equality compare_nulls, std::optional>> const output_size_data, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -587,7 +591,7 @@ mixed_full_join( compare_nulls, detail::join_kind::FULL_JOIN, output_size_data, - cudf::get_default_stream(), + stream, mr); } diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu index cfb785e242c..aa4fa281159 100644 --- a/cpp/src/join/mixed_join_semi.cu +++ b/cpp/src/join/mixed_join_semi.cu @@ -29,7 +29,6 @@ #include #include #include -#include #include #include @@ -267,6 +266,7 @@ std::unique_ptr> mixed_left_semi_join( table_view const& right_conditional, ast::expression const& binary_predicate, null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -277,7 +277,7 @@ std::unique_ptr> mixed_left_semi_join( binary_predicate, compare_nulls, detail::join_kind::LEFT_SEMI_JOIN, - cudf::get_default_stream(), + stream, mr); } @@ -288,6 +288,7 @@ std::unique_ptr> mixed_left_anti_join( table_view const& right_conditional, ast::expression const& binary_predicate, null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); @@ -298,7 +299,7 @@ std::unique_ptr> mixed_left_anti_join( binary_predicate, compare_nulls, detail::join_kind::LEFT_ANTI_JOIN, - cudf::get_default_stream(), + stream, mr); } diff --git a/cpp/src/join/semi_join.cu b/cpp/src/join/semi_join.cu index f69ded73e8d..d2ab2122c75 100644 --- a/cpp/src/join/semi_join.cu +++ b/cpp/src/join/semi_join.cu @@ -23,7 +23,6 @@ #include #include #include -#include #include #include @@ -98,22 +97,24 @@ std::unique_ptr> left_semi_join( cudf::table_view const& left, cudf::table_view const& right, null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::left_semi_anti_join( - 
detail::join_kind::LEFT_SEMI_JOIN, left, right, compare_nulls, cudf::get_default_stream(), mr); + detail::join_kind::LEFT_SEMI_JOIN, left, right, compare_nulls, stream, mr); } std::unique_ptr> left_anti_join( cudf::table_view const& left, cudf::table_view const& right, null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); return detail::left_semi_anti_join( - detail::join_kind::LEFT_ANTI_JOIN, left, right, compare_nulls, cudf::get_default_stream(), mr); + detail::join_kind::LEFT_ANTI_JOIN, left, right, compare_nulls, stream, mr); } } // namespace cudf diff --git a/cpp/src/quantiles/tdigest/tdigest.cu b/cpp/src/quantiles/tdigest/tdigest.cu index 76cd55bf994..0d017cf1f13 100644 --- a/cpp/src/quantiles/tdigest/tdigest.cu +++ b/cpp/src/quantiles/tdigest/tdigest.cu @@ -292,33 +292,32 @@ std::unique_ptr make_tdigest_column(size_type num_rows, return make_structs_column(num_rows, std::move(children), 0, {}, stream, mr); } -std::unique_ptr make_tdigest_column_of_empty_clusters(size_type num_rows, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +std::unique_ptr make_empty_tdigest_column(rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { auto offsets = cudf::make_fixed_width_column( - data_type(type_id::INT32), num_rows + 1, mask_state::UNALLOCATED, stream, mr); + data_type(type_id::INT32), 2, mask_state::UNALLOCATED, stream, mr); thrust::fill(rmm::exec_policy(stream), offsets->mutable_view().begin(), offsets->mutable_view().end(), 0); - auto min_col = cudf::make_numeric_column( - data_type(type_id::FLOAT64), num_rows, mask_state::UNALLOCATED, stream, mr); + auto min_col = + cudf::make_numeric_column(data_type(type_id::FLOAT64), 1, mask_state::UNALLOCATED, stream, mr); thrust::fill(rmm::exec_policy(stream), min_col->mutable_view().begin(), min_col->mutable_view().end(), 0); - auto max_col = cudf::make_numeric_column( - data_type(type_id::FLOAT64), num_rows, mask_state::UNALLOCATED, stream, mr); + auto max_col = + cudf::make_numeric_column(data_type(type_id::FLOAT64), 1, mask_state::UNALLOCATED, stream, mr); thrust::fill(rmm::exec_policy(stream), max_col->mutable_view().begin(), max_col->mutable_view().end(), 0); - return make_tdigest_column(num_rows, - cudf::make_empty_column(type_id::FLOAT64), - cudf::make_empty_column(type_id::FLOAT64), + return make_tdigest_column(1, + make_empty_column(type_id::FLOAT64), + make_empty_column(type_id::FLOAT64), std::move(offsets), std::move(min_col), std::move(max_col), @@ -339,7 +338,7 @@ std::unique_ptr make_tdigest_column_of_empty_clusters(size_type num_rows std::unique_ptr make_empty_tdigest_scalar(rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - auto contents = make_tdigest_column_of_empty_clusters(1, stream, mr)->release(); + auto contents = make_empty_tdigest_column(stream, mr)->release(); return std::make_unique( std::move(*std::make_unique
(std::move(contents.children))), true, stream, mr); } diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu index d591fb5c171..e1c1d2e3002 100644 --- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu +++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu @@ -366,8 +366,8 @@ std::unique_ptr to_tdigest_scalar(std::unique_ptr&& tdigest, * @param group_cluster_wl Output. The set of cluster weight limits for each group. * @param group_num_clusters Output. The number of output clusters for each input group. * @param group_cluster_offsets Offsets per-group to the start of it's clusters - * @param may_have_empty_clusters Whether or not there could be empty clusters. Must only be - * set to false when there is no empty cluster, true otherwise. + * @param has_nulls Whether or not the input contains nulls + * */ template @@ -379,7 +379,7 @@ CUDF_KERNEL void generate_cluster_limits_kernel(int delta, double* group_cluster_wl, size_type* group_num_clusters, size_type const* group_cluster_offsets, - bool may_have_empty_clusters) + bool has_nulls) { int const tid = threadIdx.x + blockIdx.x * blockDim.x; @@ -399,12 +399,11 @@ CUDF_KERNEL void generate_cluster_limits_kernel(int delta, // a group with nothing in it. group_num_clusters[group_index] = 0; if (total_weight <= 0) { - // If the input contains empty clusters, we can potentially have a group that also generates - // empty clusters because -all- of the input values are null or empty cluster. In that case, the - // `reduce_by_key` call in the tdigest generation step will need a location to store the unused - // reduction value for that group of nulls and empty clusters. These "stubs" will be - // postprocessed out afterwards. - if (may_have_empty_clusters) { group_num_clusters[group_index] = 1; } + // if the input contains nulls we can potentially have a group that generates no + // clusters because -all- of the input values are null. in that case, the reduce_by_key call + // in the tdigest generation step will need a location to store the unused reduction value for + // that group of nulls. these "stubs" will be postprocessed out afterwards. + if (has_nulls) { group_num_clusters[group_index] = 1; } return; } @@ -503,8 +502,7 @@ CUDF_KERNEL void generate_cluster_limits_kernel(int delta, * stream that falls before our current cluster limit * @param group_info A functor which returns the info for the specified group (total weight, * size and start offset) - * @param may_have_empty_clusters Whether or not there could be empty clusters. It should be - * set to false only when there is no empty cluster. + * @param has_nulls Whether or not the input data contains nulls * @param stream CUDA stream used for device memory operations and kernel launches. 
* @param mr Device memory resource used to allocate the returned column's device memory * @@ -518,7 +516,7 @@ generate_group_cluster_info(int delta, NearestWeight nearest_weight, GroupInfo group_info, CumulativeWeight cumulative_weight, - bool may_have_empty_clusters, + bool has_nulls, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -537,7 +535,7 @@ generate_group_cluster_info(int delta, nullptr, group_num_clusters.begin(), nullptr, - may_have_empty_clusters); + has_nulls); // generate group cluster offsets (where the clusters for a given group start and end) auto group_cluster_offsets = cudf::make_numeric_column( @@ -569,7 +567,7 @@ generate_group_cluster_info(int delta, group_cluster_wl.begin(), group_num_clusters.begin(), group_cluster_offsets->view().begin(), - may_have_empty_clusters); + has_nulls); return {std::move(group_cluster_wl), std::move(group_cluster_offsets), @@ -582,7 +580,7 @@ std::unique_ptr build_output_column(size_type num_rows, std::unique_ptr&& offsets, std::unique_ptr&& min_col, std::unique_ptr&& max_col, - bool may_have_empty_clusters, + bool has_nulls, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -597,7 +595,7 @@ std::unique_ptr build_output_column(size_type num_rows, size_type i) { return is_stub_weight(offsets[i]) ? 1 : 0; }; size_type const num_stubs = [&]() { - if (!may_have_empty_clusters) { return 0; } + if (!has_nulls) { return 0; } auto iter = cudf::detail::make_counting_transform_iterator( 0, cuda::proclaim_return_type(is_stub_digest)); return thrust::reduce(rmm::exec_policy(stream), iter, iter + num_rows); @@ -663,10 +661,6 @@ std::unique_ptr build_output_column(size_type num_rows, mr); } -/** - * @brief A functor which returns the cluster index within a group that the value at - * the given value index falls into. - */ template struct compute_tdigests_keys_fn { int const delta; @@ -712,8 +706,8 @@ struct compute_tdigests_keys_fn { * boundaries. * * @param delta tdigest compression level - * @param centroids_begin Beginning of the range of centroids. - * @param centroids_end End of the range of centroids. + * @param values_begin Beginning of the range of input values. + * @param values_end End of the range of input values. * @param cumulative_weight Functor which returns cumulative weight and group information for * an absolute input value index. * @param min_col Column containing the minimum value per group. @@ -721,8 +715,7 @@ struct compute_tdigests_keys_fn { * @param group_cluster_wl Cluster weight limits for each group. * @param group_cluster_offsets R-value reference of offsets into the cluster weight limits. * @param total_clusters Total number of clusters in all groups. - * @param may_have_empty_clusters Whether or not there could be empty clusters. It should be - * set to false only when there is no empty cluster. + * @param has_nulls Whether or not the input contains nulls * @param stream CUDA stream used for device memory operations and kernel launches. 
* @param mr Device memory resource used to allocate the returned column's device memory * @@ -738,7 +731,7 @@ std::unique_ptr compute_tdigests(int delta, rmm::device_uvector const& group_cluster_wl, std::unique_ptr&& group_cluster_offsets, size_type total_clusters, - bool may_have_empty_clusters, + bool has_nulls, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -757,9 +750,7 @@ std::unique_ptr compute_tdigests(int delta, // double // max // } // - if (total_clusters == 0) { - return cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(1, stream, mr); - } + if (total_clusters == 0) { return cudf::tdigest::detail::make_empty_tdigest_column(stream, mr); } // each input group represents an individual tdigest. within each tdigest, we want the keys // to represent cluster indices (for example, if a tdigest had 100 clusters, the keys should fall @@ -802,7 +793,7 @@ std::unique_ptr compute_tdigests(int delta, std::move(group_cluster_offsets), std::move(min_col), std::move(max_col), - may_have_empty_clusters, + has_nulls, stream, mr); } @@ -1030,6 +1021,76 @@ struct group_key_func { } }; +// merges all the tdigests within each group. returns a table containing 2 columns: +// the sorted means and weights. +template +std::pair, rmm::device_uvector> generate_merged_centroids( + tdigest_column_view const& tdv, + GroupOffsetIter group_offsets, + size_type num_groups, + rmm::cuda_stream_view stream) +{ + auto temp_mr = cudf::get_current_device_resource_ref(); + + auto const total_merged_centroids = tdv.means().size(); + + // output is the merged centroids (means, weights) + rmm::device_uvector output_means(total_merged_centroids, stream, temp_mr); + rmm::device_uvector output_weights(total_merged_centroids, stream, temp_mr); + + // each group represents a collection of tdigest columns. each row is 1 tdigest. + // within each group, we want to sort all the centroids within all the tdigests + // in that group, using the means as the key. the "outer offsets" represent the indices of the + // tdigests, and the "inner offsets" represents the list of centroids for a particular tdigest. 
+ // + // rows + // ---- centroid 0 --------- + // tdigest 0 centroid 1 + // ---- centroid 2 group 0 + // tdigest 1 centroid 3 + // ---- centroid 4 --------- + // tdigest 2 centroid 5 + // ---- centroid 6 group 1 + // tdigest 3 centroid 7 + // centroid 8 + // ---- centroid 9 -------- + auto inner_offsets = tdv.centroids().offsets(); + auto centroid_offsets = cudf::detail::make_counting_transform_iterator( + 0, + cuda::proclaim_return_type( + [group_offsets, inner_offsets = tdv.centroids().offsets().begin()] __device__( + size_type i) { return inner_offsets[group_offsets[i]]; })); + + // perform the sort using the means as the key + size_t temp_size; + CUDF_CUDA_TRY(cub::DeviceSegmentedSort::SortPairs(nullptr, + temp_size, + tdv.means().begin(), + output_means.begin(), + tdv.weights().begin(), + output_weights.begin(), + total_merged_centroids, + num_groups, + centroid_offsets, + centroid_offsets + 1, + stream.value())); + + rmm::device_buffer temp_mem(temp_size, stream, temp_mr); + CUDF_CUDA_TRY(cub::DeviceSegmentedSort::SortPairs(temp_mem.data(), + temp_size, + tdv.means().begin(), + output_means.begin(), + tdv.weights().begin(), + output_weights.begin(), + total_merged_centroids, + num_groups, + centroid_offsets, + centroid_offsets + 1, + stream.value())); + + return {std::move(output_means), std::move(output_weights)}; +} + template std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, HGroupOffsetIter h_outer_offsets, @@ -1041,59 +1102,6 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - // thrust::merge and thrust::merge_by_key don't provide what we need. What we would need is an - // algorithm like a super-merge that takes two layers of keys: one which identifies the outer - // grouping of tdigests, and one which identifies the inner groupings of the tdigests within the - // outer groups. - // TODO: investigate replacing the iterative merge with a single stable_sort_by_key. - - // bring tdigest offsets back to the host - auto tdigest_offsets = tdv.centroids().offsets(); - std::vector h_inner_offsets(tdigest_offsets.size()); - cudaMemcpyAsync(h_inner_offsets.data(), - tdigest_offsets.begin(), - sizeof(size_type) * tdigest_offsets.size(), - cudaMemcpyDefault, - stream); - - stream.synchronize(); - - // extract all means and weights into a table - cudf::table_view tdigests_unsliced({tdv.means(), tdv.weights()}); - - // generate the merged (but not yet compressed) tdigests for each group. 
- std::vector> tdigests; - tdigests.reserve(num_groups); - std::transform(h_outer_offsets, - h_outer_offsets + num_groups, - std::next(h_outer_offsets), - std::back_inserter(tdigests), - [&](auto tdigest_start, auto tdigest_end) { - // the range of tdigests in this group - auto const num_tdigests = tdigest_end - tdigest_start; - - // slice each tdigest from the input - std::vector unmerged_tdigests; - unmerged_tdigests.reserve(num_tdigests); - auto offset_iter = std::next(h_inner_offsets.begin(), tdigest_start); - std::transform( - offset_iter, - offset_iter + num_tdigests, - std::next(offset_iter), - std::back_inserter(unmerged_tdigests), - [&](size_type start, size_type end) { - return cudf::detail::slice(tdigests_unsliced, {start, end}, stream); - }); - - // merge - return cudf::detail::merge(unmerged_tdigests, - {0}, - {order::ASCENDING}, - {}, - stream, - cudf::get_current_device_resource_ref()); - }); - // generate min and max values auto merged_min_col = cudf::make_numeric_column( data_type{type_id::FLOAT64}, num_groups, mask_state::UNALLOCATED, stream, mr); @@ -1130,7 +1138,7 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, auto group_num_weights = cudf::detail::make_counting_transform_iterator( 0, group_num_weights_func{group_offsets, - tdigest_offsets.begin()}); + tdv.centroids().offsets().begin()}); thrust::replace_if(rmm::exec_policy(stream), merged_min_col->mutable_view().begin(), merged_min_col->mutable_view().end(), @@ -1144,83 +1152,73 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, group_is_empty{}, 0); - // concatenate all the merged tdigests back into one table. - std::vector tdigest_views; - tdigest_views.reserve(num_groups); - std::transform(tdigests.begin(), - tdigests.end(), - std::back_inserter(tdigest_views), - [](std::unique_ptr
const& t) { return t->view(); }); - auto merged = - cudf::detail::concatenate(tdigest_views, stream, cudf::get_current_device_resource_ref()); - - auto merged_weights = merged->get_column(1).view(); - // If there are no values, we can simply return a column that has only empty tdigests. - if (merged_weights.size() == 0) { - return cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(num_groups, stream, mr); - } + auto temp_mr = cudf::get_current_device_resource_ref(); + + // merge the centroids + auto [merged_means, merged_weights] = + generate_merged_centroids(tdv, group_offsets, num_groups, stream); + size_t const num_centroids = tdv.means().size(); + CUDF_EXPECTS(merged_means.size() == num_centroids, + "Unexpected number of centroids in merged result"); // generate cumulative weights - auto cumulative_weights = cudf::make_numeric_column( - data_type{type_id::FLOAT64}, merged_weights.size(), mask_state::UNALLOCATED, stream); - auto keys = cudf::detail::make_counting_transform_iterator( - 0, - group_key_func{ - group_labels, tdigest_offsets.begin(), tdigest_offsets.size()}); + rmm::device_uvector cumulative_weights(merged_weights.size(), stream, temp_mr); + + // generate group keys for all centroids in the entire column + rmm::device_uvector group_keys(num_centroids, stream, temp_mr); + auto iter = thrust::make_counting_iterator(0); + auto inner_offsets = tdv.centroids().offsets(); + thrust::transform(rmm::exec_policy(stream), + iter, + iter + num_centroids, + group_keys.begin(), + group_key_func{ + group_labels, inner_offsets.begin(), inner_offsets.size()}); thrust::inclusive_scan_by_key(rmm::exec_policy(stream), - keys, - keys + cumulative_weights->size(), - merged_weights.begin(), - cumulative_weights->mutable_view().begin()); + group_keys.begin(), + group_keys.begin() + num_centroids, + merged_weights.begin(), + cumulative_weights.begin()); auto const delta = max_centroids; - // We do not know whether there is any empty cluster in the input without actually reading the - // data, which could be expensive. So, we just assume that there could be empty clusters. 
- auto const may_have_empty_clusters = true; - // generate cluster info auto [group_cluster_wl, group_cluster_offsets, total_clusters] = generate_group_cluster_info( delta, num_groups, nearest_value_centroid_weights{ - cumulative_weights->view().begin(), - group_offsets, - tdigest_offsets.begin()}, - centroid_group_info{cumulative_weights->view().begin(), - group_offsets, - tdigest_offsets.begin()}, + cumulative_weights.begin(), group_offsets, inner_offsets.begin()}, + centroid_group_info{ + cumulative_weights.begin(), group_offsets, inner_offsets.begin()}, cumulative_centroid_weight{ - cumulative_weights->view().begin(), + cumulative_weights.begin(), group_labels, group_offsets, - {tdigest_offsets.begin(), static_cast(tdigest_offsets.size())}}, - may_have_empty_clusters, + {inner_offsets.begin(), static_cast(inner_offsets.size())}}, + false, stream, mr); // input centroid values auto centroids = cudf::detail::make_counting_transform_iterator( - 0, - make_weighted_centroid{merged->get_column(0).view().begin(), - merged_weights.begin()}); + 0, make_weighted_centroid{merged_means.begin(), merged_weights.begin()}); // compute the tdigest return compute_tdigests( delta, centroids, - centroids + merged->num_rows(), + centroids + merged_means.size(), cumulative_centroid_weight{ - cumulative_weights->view().begin(), + cumulative_weights.begin(), group_labels, group_offsets, - {tdigest_offsets.begin(), static_cast(tdigest_offsets.size())}}, + {inner_offsets.begin(), static_cast(inner_offsets.size())}}, std::move(merged_min_col), std::move(merged_max_col), group_cluster_wl, std::move(group_cluster_offsets), total_clusters, - may_have_empty_clusters, + false, stream, mr); } @@ -1285,9 +1283,7 @@ std::unique_ptr group_tdigest(column_view const& col, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - if (col.size() == 0) { - return cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(1, stream, mr); - } + if (col.size() == 0) { return cudf::tdigest::detail::make_empty_tdigest_column(stream, mr); } auto const delta = max_centroids; return cudf::type_dispatcher(col.type(), @@ -1313,7 +1309,7 @@ std::unique_ptr group_merge_tdigest(column_view const& input, tdigest_column_view tdv(input); if (num_groups == 0 || input.size() == 0) { - return cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(1, stream, mr); + return cudf::tdigest::detail::make_empty_tdigest_column(stream, mr); } // bring group offsets back to the host diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index adf650a4f27..7c4c89bd3fb 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -539,15 +539,26 @@ class regex_parser { : static_cast(LBRA); case ')': return RBRA; case '^': { - _chr = is_multiline(_flags) ? chr : '\n'; + if (is_ext_newline(_flags)) { + _chr = is_multiline(_flags) ? 'S' : 'N'; + } else { + _chr = is_multiline(_flags) ? chr : '\n'; + } return BOL; } case '$': { - _chr = is_multiline(_flags) ? chr : '\n'; + if (is_ext_newline(_flags)) { + _chr = is_multiline(_flags) ? 'S' : 'N'; + } else { + _chr = is_multiline(_flags) ? chr : '\n'; + } return EOL; } case '[': return build_cclass(); - case '.': return dot_type; + case '.': { + _chr = is_ext_newline(_flags) ? 
'N' : chr; + return dot_type; + } } if (std::find(quantifiers.begin(), quantifiers.end(), static_cast(chr)) == @@ -959,7 +970,7 @@ class regex_compiler { _prog.inst_at(inst_id).u1.cls_id = class_id; } else if (token == CHAR) { _prog.inst_at(inst_id).u1.c = yy; - } else if (token == BOL || token == EOL) { + } else if (token == BOL || token == EOL || token == ANY) { _prog.inst_at(inst_id).u1.c = yy; } push_and(inst_id, inst_id); @@ -1194,7 +1205,7 @@ void reprog::print(regex_flags const flags) case STAR: printf(" STAR next=%d", inst.u2.next_id); break; case PLUS: printf(" PLUS next=%d", inst.u2.next_id); break; case QUEST: printf(" QUEST next=%d", inst.u2.next_id); break; - case ANY: printf(" ANY next=%d", inst.u2.next_id); break; + case ANY: printf(" ANY '%c', next=%d", inst.u1.c, inst.u2.next_id); break; case ANYNL: printf(" ANYNL next=%d", inst.u2.next_id); break; case NOP: printf(" NOP next=%d", inst.u2.next_id); break; case BOL: { diff --git a/cpp/src/strings/regex/regex.inl b/cpp/src/strings/regex/regex.inl index 3b899e4edc1..e34a1e12015 100644 --- a/cpp/src/strings/regex/regex.inl +++ b/cpp/src/strings/regex/regex.inl @@ -126,6 +126,16 @@ __device__ __forceinline__ void reprog_device::reljunk::swaplist() list2 = tmp; } +/** + * @brief Check for supported new-line characters + * + * '\n, \r, \u0085, \u2028, or \u2029' + */ +constexpr bool is_newline(char32_t const ch) +{ + return (ch == '\n' || ch == '\r' || ch == 0x00c285 || ch == 0x00e280a8 || ch == 0x00e280a9); +} + /** * @brief Utility to check a specific character against this class instance. * @@ -258,11 +268,14 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const if (checkstart) { auto startchar = static_cast(jnk.startchar); switch (jnk.starttype) { - case BOL: - if (pos == 0) break; - if (jnk.startchar != '^') { return cuda::std::nullopt; } + case BOL: { + if (pos == 0) { break; } + if (startchar != '^' && startchar != 'S') { return cuda::std::nullopt; } + if (startchar != '\n') { break; } --itr; startchar = static_cast('\n'); + [[fallthrough]]; + } case CHAR: { auto const find_itr = find_char(startchar, dstr, itr); if (find_itr.byte_offset() >= dstr.size_bytes()) { return cuda::std::nullopt; } @@ -312,26 +325,34 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const id_activate = inst.u2.next_id; expanded = true; break; - case BOL: - if ((pos == 0) || ((inst.u1.c == '^') && (dstr[pos - 1] == '\n'))) { + case BOL: { + auto titr = itr; + auto const prev_c = pos > 0 ? *(--titr) : 0; + if ((pos == 0) || ((inst.u1.c == '^') && (prev_c == '\n')) || + ((inst.u1.c == 'S') && (is_newline(prev_c)))) { id_activate = inst.u2.next_id; expanded = true; } break; - case EOL: + } + case EOL: { // after the last character OR: // - for MULTILINE, if current character is new-line // - for non-MULTILINE, the very last character of the string can also be a new-line + bool const nl = (inst.u1.c == 'S' || inst.u1.c == 'N') ? is_newline(c) : (c == '\n'); if (last_character || - ((c == '\n') && (inst.u1.c != 'Z') && - ((inst.u1.c == '$') || (itr.byte_offset() + 1 == dstr.size_bytes())))) { + (nl && (inst.u1.c != 'Z') && + ((inst.u1.c == '$' || inst.u1.c == 'S') || + (itr.byte_offset() + bytes_in_char_utf8(c) == dstr.size_bytes())))) { id_activate = inst.u2.next_id; expanded = true; } break; + } case BOW: case NBOW: { - auto const prev_c = pos > 0 ? dstr[pos - 1] : 0; + auto titr = itr; + auto const prev_c = pos > 0 ? 
*(--titr) : 0; auto const word_class = reclass_device{CCLASS_W}; bool const curr_is_word = word_class.is_match(c, _codepoint_flags); bool const prev_is_word = word_class.is_match(prev_c, _codepoint_flags); @@ -366,9 +387,10 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const case CHAR: if (inst.u1.c == c) id_activate = inst.u2.next_id; break; - case ANY: - if (c != '\n') id_activate = inst.u2.next_id; - break; + case ANY: { + if ((c == '\n') || ((inst.u1.c == 'N') && is_newline(c))) { break; } + [[fallthrough]]; + } case ANYNL: id_activate = inst.u2.next_id; break; case NCCLASS: case CCLASS: { diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index 605582f28a6..a03a34f5fa7 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -25,6 +25,8 @@ #include #include #include +#include +#include #include #include #include @@ -151,15 +153,111 @@ std::unique_ptr minhash_fn(cudf::strings_column_view const& input, mr); auto d_hashes = hashes->mutable_view().data(); - constexpr int block_size = 256; - cudf::detail::grid_1d grid{input.size() * cudf::detail::warp_size, block_size}; + constexpr cudf::thread_index_type block_size = 256; + cudf::detail::grid_1d grid{ + static_cast(input.size()) * cudf::detail::warp_size, block_size}; minhash_kernel<<>>( *d_strings, seeds, width, d_hashes); return hashes; } -std::unique_ptr build_list_result(cudf::strings_column_view const& input, +/** + * @brief Compute the minhash of each list row of strings for each seed + * + * This is a warp-per-row algorithm where parallel threads within a warp + * work on strings in a single list row. + * + * @tparam HashFunction hash function to use on each string + * + * @param d_input List of strings to process + * @param seeds Seeds for hashing each string + * @param d_hashes Minhash output values (one per row) + */ +template < + typename HashFunction, + typename hash_value_type = std:: + conditional_t, uint32_t, uint64_t>> +CUDF_KERNEL void minhash_word_kernel(cudf::detail::lists_column_device_view const d_input, + cudf::device_span seeds, + hash_value_type* d_hashes) +{ + auto const idx = cudf::detail::grid_1d::global_thread_id(); + auto const row_idx = idx / cudf::detail::warp_size; + + if (row_idx >= d_input.size()) { return; } + if (d_input.is_null(row_idx)) { return; } + + auto const d_row = cudf::list_device_view(d_input, row_idx); + auto const d_output = d_hashes + (row_idx * seeds.size()); + + // initialize hashes output for this row + auto const lane_idx = static_cast(idx % cudf::detail::warp_size); + if (lane_idx == 0) { + auto const init = d_row.size() == 0 ? 0 : std::numeric_limits::max(); + thrust::fill(thrust::seq, d_output, d_output + seeds.size(), init); + } + __syncwarp(); + + // each lane hashes a string from the input row + for (auto str_idx = lane_idx; str_idx < d_row.size(); str_idx += cudf::detail::warp_size) { + auto const hash_str = + d_row.is_null(str_idx) ? cudf::string_view{} : d_row.element(str_idx); + for (std::size_t seed_idx = 0; seed_idx < seeds.size(); ++seed_idx) { + auto const hasher = HashFunction(seeds[seed_idx]); + // hash string and store the min value + hash_value_type hv; + if constexpr (std::is_same_v) { + hv = hasher(hash_str); + } else { + // This code path assumes the use of MurmurHash3_x64_128 which produces 2 uint64 values + // but only uses the first uint64 value as requested by the LLM team. 
+ hv = thrust::get<0>(hasher(hash_str)); + } + cuda::atomic_ref ref{*(d_output + seed_idx)}; + ref.fetch_min(hv, cuda::std::memory_order_relaxed); + } + } +} + +template < + typename HashFunction, + typename hash_value_type = std:: + conditional_t, uint32_t, uint64_t>> +std::unique_ptr word_minhash_fn(cudf::lists_column_view const& input, + cudf::device_span seeds, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_EXPECTS(!seeds.empty(), "Parameter seeds cannot be empty", std::invalid_argument); + CUDF_EXPECTS((static_cast(input.size()) * seeds.size()) < + static_cast(std::numeric_limits::max()), + "The number of seeds times the number of input rows exceeds the column size limit", + std::overflow_error); + + auto const output_type = cudf::data_type{cudf::type_to_id()}; + if (input.is_empty()) { return cudf::make_empty_column(output_type); } + + auto const d_input = cudf::column_device_view::create(input.parent(), stream); + + auto hashes = cudf::make_numeric_column(output_type, + input.size() * static_cast(seeds.size()), + cudf::mask_state::UNALLOCATED, + stream, + mr); + auto d_hashes = hashes->mutable_view().data(); + auto lcdv = cudf::detail::lists_column_device_view(*d_input); + + constexpr cudf::thread_index_type block_size = 256; + cudf::detail::grid_1d grid{ + static_cast(input.size()) * cudf::detail::warp_size, block_size}; + minhash_word_kernel + <<>>(lcdv, seeds, d_hashes); + + return hashes; +} + +std::unique_ptr build_list_result(cudf::column_view const& input, std::unique_ptr&& hashes, cudf::size_type seeds_size, rmm::cuda_stream_view stream, @@ -176,7 +274,7 @@ std::unique_ptr build_list_result(cudf::strings_column_view const& std::move(offsets), std::move(hashes), input.null_count(), - cudf::detail::copy_bitmask(input.parent(), stream, mr), + cudf::detail::copy_bitmask(input, stream, mr), stream, mr); // expect this condition to be very rare @@ -208,7 +306,7 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, { using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32; auto hashes = detail::minhash_fn(input, seeds, width, stream, mr); - return build_list_result(input, std::move(hashes), seeds.size(), stream, mr); + return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); } std::unique_ptr minhash64(cudf::strings_column_view const& input, @@ -232,7 +330,27 @@ std::unique_ptr minhash64(cudf::strings_column_view const& input, { using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128; auto hashes = detail::minhash_fn(input, seeds, width, stream, mr); - return build_list_result(input, std::move(hashes), seeds.size(), stream, mr); + return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); +} + +std::unique_ptr word_minhash(cudf::lists_column_view const& input, + cudf::device_span seeds, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32; + auto hashes = detail::word_minhash_fn(input, seeds, stream, mr); + return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); +} + +std::unique_ptr word_minhash64(cudf::lists_column_view const& input, + cudf::device_span seeds, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128; + auto hashes = detail::word_minhash_fn(input, seeds, stream, mr); + return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); } 
} // namespace detail @@ -276,4 +394,21 @@ std::unique_ptr minhash64(cudf::strings_column_view const& input, return detail::minhash64(input, seeds, width, stream, mr); } +std::unique_ptr word_minhash(cudf::lists_column_view const& input, + cudf::device_span seeds, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::word_minhash(input, seeds, stream, mr); +} + +std::unique_ptr word_minhash64(cudf::lists_column_view const& input, + cudf::device_span seeds, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::word_minhash64(input, seeds, stream, mr); +} } // namespace nvtext diff --git a/cpp/src/utilities/stream_pool.cpp b/cpp/src/utilities/stream_pool.cpp index 9d3a7ce5a4e..9824c472b20 100644 --- a/cpp/src/utilities/stream_pool.cpp +++ b/cpp/src/utilities/stream_pool.cpp @@ -132,6 +132,13 @@ struct cuda_event { cuda_event() { CUDF_CUDA_TRY(cudaEventCreateWithFlags(&e_, cudaEventDisableTiming)); } virtual ~cuda_event() { CUDF_ASSERT_CUDA_SUCCESS(cudaEventDestroy(e_)); } + // Moveable but not copyable. + cuda_event(const cuda_event&) = delete; + cuda_event& operator=(const cuda_event&) = delete; + + cuda_event(cuda_event&&) = default; + cuda_event& operator=(cuda_event&&) = default; + operator cudaEvent_t() { return e_; } private: @@ -147,11 +154,12 @@ struct cuda_event { */ cudaEvent_t event_for_thread() { - thread_local std::vector> thread_events(get_num_cuda_devices()); + // The program may crash if this function is called from the main thread and user application + // subsequently calls cudaDeviceReset(). + // As a workaround, here we intentionally disable RAII and leak cudaEvent_t. + thread_local std::vector thread_events(get_num_cuda_devices()); auto const device_id = get_current_cuda_device(); - if (not thread_events[device_id.value()]) { - thread_events[device_id.value()] = std::make_unique(); - } + if (not thread_events[device_id.value()]) { thread_events[device_id.value()] = new cuda_event(); } return *thread_events[device_id.value()]; } diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 1bedb344a01..288fa84a73d 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -687,10 +687,12 @@ ConfigureTest(STREAM_BINARYOP_TEST streams/binaryop_test.cpp STREAM_MODE testing ConfigureTest(STREAM_CONCATENATE_TEST streams/concatenate_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_COPYING_TEST streams/copying_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_CSVIO_TEST streams/io/csv_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_DATETIME_TEST streams/datetime_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_DICTIONARY_TEST streams/dictionary_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_FILLING_TEST streams/filling_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_GROUPBY_TEST streams/groupby_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_HASHING_TEST streams/hash_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_JOIN_TEST streams/join_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_JSONIO_TEST streams/io/json_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_LABELING_BINS_TEST streams/labeling_bins_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_LISTS_TEST streams/lists_test.cpp STREAM_MODE testing) diff --git a/cpp/tests/groupby/tdigest_tests.cu b/cpp/tests/groupby/tdigest_tests.cu index 3780dbb1d95..baa59026b07 100644 --- a/cpp/tests/groupby/tdigest_tests.cu +++ b/cpp/tests/groupby/tdigest_tests.cu @@ -469,16 
+469,16 @@ TEST_F(TDigestMergeTest, EmptyGroups) cudf::test::fixed_width_column_wrapper keys{0, 0, 0, 0, 0, 0, 0}; int const delta = 1000; - auto a = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters( - 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto a = cudf::tdigest::detail::make_empty_tdigest_column( + cudf::get_default_stream(), cudf::get_current_device_resource_ref()); auto b = cudf::type_dispatcher( static_cast(values_b).type(), tdigest_gen_grouped{}, keys, values_b, delta); - auto c = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters( - 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto c = cudf::tdigest::detail::make_empty_tdigest_column( + cudf::get_default_stream(), cudf::get_current_device_resource_ref()); auto d = cudf::type_dispatcher( static_cast(values_d).type(), tdigest_gen_grouped{}, keys, values_d, delta); - auto e = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters( - 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto e = cudf::tdigest::detail::make_empty_tdigest_column( + cudf::get_default_stream(), cudf::get_current_device_resource_ref()); std::vector cols; cols.push_back(*a); @@ -507,81 +507,3 @@ TEST_F(TDigestMergeTest, EmptyGroups) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *result.second[0].results[0]); } - -std::unique_ptr do_agg( - cudf::column_view key, - cudf::column_view val, - std::function()> make_agg) -{ - std::vector keys; - keys.push_back(key); - cudf::table_view const key_table(keys); - - cudf::groupby::groupby gb(key_table); - std::vector requests; - cudf::groupby::aggregation_request req; - req.values = val; - req.aggregations.push_back(make_agg()); - requests.push_back(std::move(req)); - - auto result = gb.aggregate(std::move(requests)); - - std::vector> result_columns; - for (auto&& c : result.first->release()) { - result_columns.push_back(std::move(c)); - } - - EXPECT_EQ(result.second.size(), 1); - EXPECT_EQ(result.second[0].results.size(), 1); - result_columns.push_back(std::move(result.second[0].results[0])); - - return std::make_unique(std::move(result_columns)); -} - -TEST_F(TDigestMergeTest, AllGroupsHaveEmptyClusters) -{ - // The input must be sorted by the key. - // See `aggregate_result_functor::operator()` for details. - auto const keys = cudf::test::fixed_width_column_wrapper{{0, 0, 1, 1, 2}}; - auto const keys_view = cudf::column_view(keys); - auto val_elems = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i; }); - auto val_valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { - // All values are null - return false; - }); - auto const vals = cudf::test::fixed_width_column_wrapper{ - val_elems, val_elems + keys_view.size(), val_valids}; - - auto const delta = 10000; - - // Compute tdigest. The result should have 3 empty clusters, one per group. 
- auto const compute_result = do_agg(keys_view, cudf::column_view(vals), [&delta]() { - return cudf::make_tdigest_aggregation(delta); - }); - - auto const expected_computed_keys = cudf::test::fixed_width_column_wrapper{{0, 1, 2}}; - cudf::column_view const expected_computed_keys_view{expected_computed_keys}; - auto const expected_computed_vals = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters( - expected_computed_keys_view.size(), - cudf::get_default_stream(), - rmm::mr::get_current_device_resource()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_computed_keys_view, compute_result->get_column(0).view()); - // The computed values are nullable even though the input values are not. - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_computed_vals->view(), - compute_result->get_column(1).view()); - - // Merge tdigest. The result should have 3 empty clusters, one per group. - auto const merge_result = - do_agg(compute_result->get_column(0).view(), compute_result->get_column(1).view(), [&delta]() { - return cudf::make_merge_tdigest_aggregation(delta); - }); - - auto const expected_merged_keys = cudf::test::fixed_width_column_wrapper{{0, 1, 2}}; - cudf::column_view const expected_merged_keys_view{expected_merged_keys}; - auto const expected_merged_vals = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters( - expected_merged_keys_view.size(), - cudf::get_default_stream(), - rmm::mr::get_current_device_resource()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_merged_keys_view, merge_result->get_column(0).view()); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_merged_vals->view(), merge_result->get_column(1).view()); -} diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp index 960c19fce2e..48bc982d0e3 100644 --- a/cpp/tests/io/json/json_test.cpp +++ b/cpp/tests/io/json/json_test.cpp @@ -2856,4 +2856,47 @@ TEST_F(JsonReaderTest, JSONMixedTypeChildren) } } +TEST_F(JsonReaderTest, JsonDtypeSchema) +{ + std::string data = R"( + {"a": 1, "b": {"0": "abc", "1": ["a", "b"]}, "c": true} + {"a": 1, "b": {"0": "abc" }, "c": false} + {"a": 1, "b": {"0": "lolol "}, "c": true} + )"; + + std::map dtype_schema{{"c", {data_type{type_id::STRING}}}, + {"b", {data_type{type_id::STRING}}}, + {"a", {dtype()}}}; + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()}) + .dtypes(dtype_schema) + .prune_columns(true) + .lines(true); + + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + + EXPECT_EQ(result.tbl->num_columns(), 3); + EXPECT_EQ(result.tbl->num_rows(), 3); + + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::FLOAT64); + EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::STRING); + EXPECT_EQ(result.tbl->get_column(2).type().id(), cudf::type_id::STRING); + + EXPECT_EQ(result.metadata.schema_info[0].name, "a"); + EXPECT_EQ(result.metadata.schema_info[1].name, "b"); + EXPECT_EQ(result.metadata.schema_info[2].name, "c"); + + // cudf::column::contents contents = result.tbl->get_column(1).release(); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), float64_wrapper{{1, 1, 1}}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + result.tbl->get_column(1), + cudf::test::strings_column_wrapper({"{\"0\": \"abc\", \"1\": [\"a\", \"b\"]}", + "{\"0\": \"abc\" }", + "{\"0\": \"lolol \"}"}), + cudf::test::debug_output_level::ALL_ERRORS); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(2), + cudf::test::strings_column_wrapper({"true", "false", "true"}), + 
cudf::test::debug_output_level::ALL_ERRORS); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/json/json_whitespace_normalization_test.cu b/cpp/tests/io/json/json_whitespace_normalization_test.cu index 6d79fdc98ef..6a3bd69de81 100644 --- a/cpp/tests/io/json/json_whitespace_normalization_test.cu +++ b/cpp/tests/io/json/json_whitespace_normalization_test.cu @@ -34,129 +34,127 @@ // Base test fixture for tests struct JsonWSNormalizationTest : public cudf::test::BaseFixture {}; -void run_test(std::string const& host_input, std::string const& expected_host_output) -{ - // Prepare cuda stream for data transfers & kernels - auto stream_view = cudf::test::get_default_stream(); - - auto device_input = rmm::device_buffer( - host_input.c_str(), host_input.size(), stream_view, cudf::get_current_device_resource_ref()); - - // Preprocessing FST - cudf::io::datasource::owning_buffer device_data(std::move(device_input)); - cudf::io::json::detail::normalize_whitespace( - device_data, stream_view, cudf::get_current_device_resource_ref()); - - std::string preprocessed_host_output(device_data.size(), 0); - CUDF_CUDA_TRY(cudaMemcpyAsync(preprocessed_host_output.data(), - device_data.data(), - preprocessed_host_output.size(), - cudaMemcpyDeviceToHost, - stream_view.value())); - - stream_view.synchronize(); - ASSERT_EQ(preprocessed_host_output.size(), expected_host_output.size()); - CUDF_TEST_EXPECT_VECTOR_EQUAL( - preprocessed_host_output, expected_host_output, preprocessed_host_output.size()); -} - -TEST_F(JsonWSNormalizationTest, GroundTruth_Spaces) +TEST_F(JsonWSNormalizationTest, ReadJsonOption) { - std::string input = R"({ "A" : "TEST" })"; - std::string output = R"({"A":"TEST"})"; - run_test(input, output); -} + // When mixed type fields are read as strings, the table read will differ depending the + // value of normalize_whitespace -TEST_F(JsonWSNormalizationTest, GroundTruth_MoreSpaces) -{ - std::string input = R"({"a": [1, 2, 3, 4, 5, 6, 7, 8], "b": {"c": "d"}})"; - std::string output = R"({"a":[1,2,3,4,5,6,7,8],"b":{"c":"d"}})"; - run_test(input, output); -} + // Test input + std::string const host_input = "{ \"a\" : {\"b\" :\t\"c\"}}"; + cudf::io::json_reader_options input_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{host_input.data(), host_input.size()}) + .lines(true) + .mixed_types_as_string(true) + .normalize_whitespace(true); -TEST_F(JsonWSNormalizationTest, GroundTruth_SpacesInString) -{ - std::string input = R"({" a ":50})"; - std::string output = R"({" a ":50})"; - run_test(input, output); -} + cudf::io::table_with_metadata processed_table = cudf::io::read_json(input_options); -TEST_F(JsonWSNormalizationTest, GroundTruth_NewlineInString) -{ - std::string input = "{\"a\" : \"x\ny\"}\n{\"a\" : \"x\\ny\"}"; - std::string output = "{\"a\":\"x\ny\"}\n{\"a\":\"x\\ny\"}"; - run_test(input, output); -} + // Expected table + std::string const expected_input = R"({ "a" : {"b":"c"}})"; + cudf::io::json_reader_options expected_input_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{expected_input.data(), expected_input.size()}) + .lines(true) + .mixed_types_as_string(true) + .normalize_whitespace(false); -TEST_F(JsonWSNormalizationTest, GroundTruth_Tabs) -{ - std::string input = "{\"a\":\t\"b\"}"; - std::string output = R"({"a":"b"})"; - run_test(input, output); + cudf::io::table_with_metadata expected_table = cudf::io::read_json(expected_input_options); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table.tbl->view(), processed_table.tbl->view()); 
} -TEST_F(JsonWSNormalizationTest, GroundTruth_SpacesAndTabs) +TEST_F(JsonWSNormalizationTest, ReadJsonOption_InvalidRows) { - std::string input = "{\"A\" : \t\"TEST\" }"; - std::string output = R"({"A":"TEST"})"; - run_test(input, output); -} + // When mixed type fields are read as strings, the table read will differ depending the + // value of normalize_whitespace -TEST_F(JsonWSNormalizationTest, GroundTruth_MultilineJSONWithSpacesAndTabs) -{ - std::string input = - "{ \"foo rapids\": [1,2,3], \"bar\trapids\": 123 }\n\t{ \"foo rapids\": { \"a\": 1 }, " - "\"bar\trapids\": 456 }"; - std::string output = - "{\"foo rapids\":[1,2,3],\"bar\trapids\":123}\n{\"foo rapids\":{\"a\":1},\"bar\trapids\":456}"; - run_test(input, output); -} + // Test input + std::string const host_input = R"( + { "Root": { "Key": [ { "EE": tr ue } ] } } + { "Root": { "Key": "abc" } } + { "Root": { "Key": [ { "EE": 12 34 } ] } } + { "Root": { "Key": [{ "YY": 1}] } } + { "Root": { "Key": [ { "EE": 12. 34 } ] } } + { "Root": { "Key": [ { "EE": "efg" } ] } } + )"; + cudf::io::json_reader_options input_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{host_input.data(), host_input.size()}) + .lines(true) + .mixed_types_as_string(true) + .normalize_whitespace(true) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL); -TEST_F(JsonWSNormalizationTest, GroundTruth_PureJSONExample) -{ - std::string input = R"([{"a":50}, {"a" : 60}])"; - std::string output = R"([{"a":50},{"a":60}])"; - run_test(input, output); -} + cudf::io::table_with_metadata processed_table = cudf::io::read_json(input_options); -TEST_F(JsonWSNormalizationTest, GroundTruth_NoNormalizationRequired) -{ - std::string input = R"({"a\\n\r\a":50})"; - std::string output = R"({"a\\n\r\a":50})"; - run_test(input, output); -} + // Expected table + std::string const expected_input = R"( + { "Root": { "Key": [ { "EE": tr ue } ] } } + { "Root": { "Key": "abc" } } + { "Root": { "Key": [ { "EE": 12 34 } ] } } + { "Root": { "Key": [{"YY":1}] } } + { "Root": { "Key": [ { "EE": 12. 34 } ] } } + { "Root": { "Key": [{"EE":"efg"}] } } + )"; + cudf::io::json_reader_options expected_input_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{expected_input.data(), expected_input.size()}) + .lines(true) + .mixed_types_as_string(true) + .normalize_whitespace(false) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL); -TEST_F(JsonWSNormalizationTest, GroundTruth_InvalidInput) -{ - std::string input = "{\"a\" : \"b }\n{ \"c \" :\t\"d\"}"; - std::string output = "{\"a\":\"b }\n{\"c \":\"d\"}"; - run_test(input, output); + cudf::io::table_with_metadata expected_table = cudf::io::read_json(expected_input_options); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table.tbl->view(), processed_table.tbl->view()); } -TEST_F(JsonWSNormalizationTest, ReadJsonOption) +TEST_F(JsonWSNormalizationTest, ReadJsonOption_InvalidRows_NoMixedType) { // When mixed type fields are read as strings, the table read will differ depending the // value of normalize_whitespace // Test input - std::string const host_input = "{ \"a\" : {\"b\" :\t\"c\"}}"; + std::string const host_input = R"( + { "Root": { "Key": [ { "EE": tr ue } ] } } + { "Root": { "Key": [ { "EE": 12 34 } ] } } + { "Root": { "Key": [{ "YY": 1}] } } + { "Root": { "Key": [ { "EE": 12. 
34 } ] } } + { "Root": { "Key": [ { "EE": "efg" }, { "YY" : "abc" } ] } } + { "Root": { "Key": [ { "YY" : "abc" } ] } } + )"; + + std::map dtype_schema{ + {"Key", {cudf::data_type{cudf::type_id::STRING}}}}; + cudf::io::json_reader_options input_options = cudf::io::json_reader_options::builder( cudf::io::source_info{host_input.data(), host_input.size()}) + .dtypes(dtype_schema) .lines(true) - .mixed_types_as_string(true) - .normalize_whitespace(true); + .prune_columns(true) + .normalize_whitespace(true) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL); cudf::io::table_with_metadata processed_table = cudf::io::read_json(input_options); // Expected table - std::string const expected_input = R"({ "a" : {"b":"c"}})"; + std::string const expected_input = R"( + { "Root": { "Key": [ { "EE": tr ue } , { "YY" : 2 } ] } } + { "Root": { "Key": [ { "EE": 12 34 } ] } } + { "Root": { "Key": [{"YY":1}] } } + { "Root": { "Key": [ { "EE": 12. 34 } ] } } + { "Root": { "Key": [{"EE":"efg"},{"YY":"abc"}] } } + { "Root": { "Key": [{"YY":"abc"}] } } + )"; + cudf::io::json_reader_options expected_input_options = cudf::io::json_reader_options::builder( cudf::io::source_info{expected_input.data(), expected_input.size()}) + .dtypes(dtype_schema) .lines(true) - .mixed_types_as_string(true) - .normalize_whitespace(false); + .prune_columns(true) + .normalize_whitespace(false) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL); cudf::io::table_with_metadata expected_table = cudf::io::read_json(expected_input_options); CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table.tbl->view(), processed_table.tbl->view()); diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp index ab387a5c7f5..3431e941359 100644 --- a/cpp/tests/join/join_tests.cpp +++ b/cpp/tests/join/join_tests.cpp @@ -39,6 +39,8 @@ #include #include +#include + #include template @@ -60,6 +62,7 @@ template >, cudf::table_view const& left_keys, cudf::table_view const& right_keys, cudf::null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr), cudf::out_of_bounds_policy oob_policy = cudf::out_of_bounds_policy::DONT_CHECK> std::unique_ptr join_and_gather( @@ -68,12 +71,13 @@ std::unique_ptr join_and_gather( std::vector const& left_on, std::vector const& right_on, cudf::null_equality compare_nulls, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) { auto left_selected = left_input.select(left_on); auto right_selected = right_input.select(right_on); auto const [left_join_indices, right_join_indices] = - join_impl(left_selected, right_selected, compare_nulls, mr); + join_impl(left_selected, right_selected, compare_nulls, stream, mr); auto left_indices_span = cudf::device_span{*left_join_indices}; auto right_indices_span = cudf::device_span{*right_join_indices}; @@ -2027,7 +2031,11 @@ struct JoinTestLists : public cudf::test::BaseFixture { auto const probe_tv = cudf::table_view{{probe}}; auto const [left_result_map, right_result_map] = - join_func(build_tv, probe_tv, nulls_equal, cudf::get_current_device_resource_ref()); + join_func(build_tv, + probe_tv, + nulls_equal, + cudf::get_default_stream(), + cudf::get_current_device_resource_ref()); auto const left_result_table = sort_and_gather(build_tv, column_view_from_device_uvector(*left_result_map), oob_policy); diff --git a/cpp/tests/join/semi_anti_join_tests.cpp b/cpp/tests/join/semi_anti_join_tests.cpp index 3e279260b99..554d5754e39 100644 --- 
a/cpp/tests/join/semi_anti_join_tests.cpp +++ b/cpp/tests/join/semi_anti_join_tests.cpp @@ -28,8 +28,11 @@ #include #include #include +#include #include +#include + #include template @@ -51,6 +54,7 @@ template > (*join_impl)( cudf::table_view const& left_keys, cudf::table_view const& right_keys, cudf::null_equality compare_nulls, + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)> std::unique_ptr join_and_gather( cudf::table_view const& left_input, @@ -58,11 +62,12 @@ std::unique_ptr join_and_gather( std::vector const& left_on, std::vector const& right_on, cudf::null_equality compare_nulls, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) { auto left_selected = left_input.select(left_on); auto right_selected = right_input.select(right_on); - auto const join_indices = join_impl(left_selected, right_selected, compare_nulls, mr); + auto const join_indices = join_impl(left_selected, right_selected, compare_nulls, stream, mr); auto left_indices_span = cudf::device_span{*join_indices}; auto left_indices_col = cudf::column_view{left_indices_span}; diff --git a/cpp/tests/quantiles/percentile_approx_test.cpp b/cpp/tests/quantiles/percentile_approx_test.cpp index 7359f0406fc..915717713df 100644 --- a/cpp/tests/quantiles/percentile_approx_test.cpp +++ b/cpp/tests/quantiles/percentile_approx_test.cpp @@ -371,8 +371,8 @@ struct PercentileApproxTest : public cudf::test::BaseFixture {}; TEST_F(PercentileApproxTest, EmptyInput) { - auto empty_ = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters( - 1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto empty_ = cudf::tdigest::detail::make_empty_tdigest_column( + cudf::get_default_stream(), cudf::get_current_device_resource_ref()); cudf::test::fixed_width_column_wrapper percentiles{0.0, 0.25, 0.3}; std::vector input; diff --git a/cpp/tests/streams/datetime_test.cpp b/cpp/tests/streams/datetime_test.cpp new file mode 100644 index 00000000000..82629156fa6 --- /dev/null +++ b/cpp/tests/streams/datetime_test.cpp @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include + +#include + +class DatetimeTest : public cudf::test::BaseFixture { + public: + cudf::test::fixed_width_column_wrapper timestamps{ + -23324234, // 1969-12-31 23:59:59.976675766 GMT + 23432424, // 1970-01-01 00:00:00.023432424 GMT + 987234623 // 1970-01-01 00:00:00.987234623 GMT + }; + cudf::test::fixed_width_column_wrapper months{{1, -1, 3}}; +}; + +TEST_F(DatetimeTest, ExtractYear) +{ + cudf::datetime::extract_year(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractMonth) +{ + cudf::datetime::extract_month(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractDay) +{ + cudf::datetime::extract_day(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractWeekday) +{ + cudf::datetime::extract_weekday(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractHour) +{ + cudf::datetime::extract_hour(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractMinute) +{ + cudf::datetime::extract_minute(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractSecond) +{ + cudf::datetime::extract_second(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractMillisecondFraction) +{ + cudf::datetime::extract_millisecond_fraction(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractMicrosecondFraction) +{ + cudf::datetime::extract_microsecond_fraction(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractNanosecondFraction) +{ + cudf::datetime::extract_nanosecond_fraction(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, LastDayOfMonth) +{ + cudf::datetime::last_day_of_month(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, DayOfYear) +{ + cudf::datetime::day_of_year(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, AddCalendricalMonths) +{ + cudf::datetime::add_calendrical_months(timestamps, months, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, AddCalendricalMonthsScalar) +{ + auto scalar = cudf::make_fixed_width_scalar(1, cudf::test::get_default_stream()); + + cudf::datetime::add_calendrical_months(timestamps, *scalar, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, IsLeapYear) +{ + cudf::datetime::is_leap_year(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, DaysInMonth) +{ + cudf::datetime::days_in_month(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, ExtractQuarter) +{ + cudf::datetime::extract_quarter(timestamps, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, CeilDatetimes) +{ + cudf::datetime::ceil_datetimes( + timestamps, cudf::datetime::rounding_frequency::HOUR, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, FloorDatetimes) +{ + cudf::datetime::floor_datetimes( + timestamps, cudf::datetime::rounding_frequency::HOUR, cudf::test::get_default_stream()); +} + +TEST_F(DatetimeTest, RoundDatetimes) +{ + cudf::datetime::round_datetimes( + timestamps, cudf::datetime::rounding_frequency::HOUR, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/streams/join_test.cpp b/cpp/tests/streams/join_test.cpp new file mode 100644 index 00000000000..2811bb676fa --- /dev/null +++ b/cpp/tests/streams/join_test.cpp @@ -0,0 +1,219 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +class JoinTest : public cudf::test::BaseFixture { + static inline cudf::table make_table() + { + cudf::test::fixed_width_column_wrapper col0{{3, 1, 2, 0, 3}}; + cudf::test::strings_column_wrapper col1{{"s0", "s1", "s2", "s4", "s1"}}; + cudf::test::fixed_width_column_wrapper col2{{0, 1, 2, 4, 1}}; + + std::vector> columns; + columns.push_back(col0.release()); + columns.push_back(col1.release()); + columns.push_back(col2.release()); + + return cudf::table{std::move(columns)}; + } + + public: + cudf::table table0{make_table()}; + cudf::table table1{make_table()}; + cudf::table conditional0{make_table()}; + cudf::table conditional1{make_table()}; + cudf::ast::column_reference col_ref_left_0{0}; + cudf::ast::column_reference col_ref_right_0{0, cudf::ast::table_reference::RIGHT}; + cudf::ast::operation left_zero_eq_right_zero{ + cudf::ast::ast_operator::EQUAL, col_ref_left_0, col_ref_right_0}; +}; + +TEST_F(JoinTest, InnerJoin) +{ + cudf::inner_join(table0, table1, cudf::null_equality::EQUAL, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, LeftJoin) +{ + cudf::left_join(table0, table1, cudf::null_equality::EQUAL, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, FullJoin) +{ + cudf::full_join(table0, table1, cudf::null_equality::EQUAL, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, LeftSemiJoin) +{ + cudf::left_semi_join( + table0, table1, cudf::null_equality::EQUAL, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, LeftAntiJoin) +{ + cudf::left_anti_join( + table0, table1, cudf::null_equality::EQUAL, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, CrossJoin) { cudf::cross_join(table0, table1, cudf::test::get_default_stream()); } + +TEST_F(JoinTest, ConditionalInnerJoin) +{ + cudf::conditional_inner_join( + table0, table1, left_zero_eq_right_zero, std::nullopt, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, ConditionalLeftJoin) +{ + cudf::conditional_left_join( + table0, table1, left_zero_eq_right_zero, std::nullopt, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, ConditionalFullJoin) +{ + cudf::conditional_full_join( + table0, table1, left_zero_eq_right_zero, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, ConditionalLeftSemiJoin) +{ + cudf::conditional_left_semi_join( + table0, table1, left_zero_eq_right_zero, std::nullopt, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, ConditionalLeftAntiJoin) +{ + cudf::conditional_left_anti_join( + table0, table1, left_zero_eq_right_zero, std::nullopt, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, MixedInnerJoin) +{ + cudf::mixed_inner_join(table0, + table1, + conditional0, + conditional1, + left_zero_eq_right_zero, + cudf::null_equality::EQUAL, + std::nullopt, + cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, MixedLeftJoin) +{ + cudf::mixed_left_join(table0, + 
table1, + conditional0, + conditional1, + left_zero_eq_right_zero, + cudf::null_equality::EQUAL, + std::nullopt, + cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, MixedFullJoin) +{ + cudf::mixed_full_join(table0, + table1, + conditional0, + conditional1, + left_zero_eq_right_zero, + cudf::null_equality::EQUAL, + std::nullopt, + cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, MixedLeftSemiJoin) +{ + cudf::mixed_left_semi_join(table0, + table1, + conditional0, + conditional1, + left_zero_eq_right_zero, + cudf::null_equality::EQUAL, + cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, MixedLeftAntiJoin) +{ + cudf::mixed_left_anti_join(table0, + table1, + conditional0, + conditional1, + left_zero_eq_right_zero, + cudf::null_equality::EQUAL, + cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, MixedInnerJoinSize) +{ + cudf::mixed_inner_join_size(table0, + table1, + conditional0, + conditional1, + left_zero_eq_right_zero, + cudf::null_equality::EQUAL, + cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, MixedLeftJoinSize) +{ + cudf::mixed_left_join_size(table0, + table1, + conditional0, + conditional1, + left_zero_eq_right_zero, + cudf::null_equality::EQUAL, + cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, ConditionalInnerJoinSize) +{ + cudf::conditional_inner_join_size( + table0, table1, left_zero_eq_right_zero, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, ConditionalLeftJoinSize) +{ + cudf::conditional_left_join_size( + table0, table1, left_zero_eq_right_zero, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, ConditionalLeftSemiJoinSize) +{ + cudf::conditional_left_semi_join_size( + table0, table1, left_zero_eq_right_zero, cudf::test::get_default_stream()); +} + +TEST_F(JoinTest, ConditionalLeftAntiJoinSize) +{ + cudf::conditional_left_anti_join_size( + table0, table1, left_zero_eq_right_zero, cudf::test::get_default_stream()); +} diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index c816316d0ff..acf850c7a66 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -14,6 +14,8 @@ * limitations under the License. 
*/ +#include "special_chars.h" + #include #include #include @@ -613,6 +615,63 @@ TEST_F(StringsContainsTests, MultiLine) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); } +TEST_F(StringsContainsTests, SpecialNewLines) +{ + auto input = cudf::test::strings_column_wrapper({"zzé" LINE_SEPARATOR "qqq" NEXT_LINE "zzé", + "qqq\rzzé" LINE_SEPARATOR "lll", + "zzé", + "", + "zzé" PARAGRAPH_SEPARATOR, + "abc\nzzé" NEXT_LINE}); + auto view = cudf::strings_column_view(input); + + auto pattern = std::string("^zzé$"); + auto prog = + cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::EXT_NEWLINE); + auto ml_flags = static_cast(cudf::strings::regex_flags::EXT_NEWLINE | + cudf::strings::regex_flags::MULTILINE); + auto prog_ml = cudf::strings::regex_program::create(pattern, ml_flags); + + auto expected = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 1, 0}); + auto results = cudf::strings::contains_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + expected = cudf::test::fixed_width_column_wrapper({1, 1, 1, 0, 1, 1}); + results = cudf::strings::contains_re(view, *prog_ml); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + expected = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 1, 0}); + results = cudf::strings::matches_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + expected = cudf::test::fixed_width_column_wrapper({1, 0, 1, 0, 1, 0}); + results = cudf::strings::matches_re(view, *prog_ml); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + auto counts = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 1, 0}); + results = cudf::strings::count_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, counts); + counts = cudf::test::fixed_width_column_wrapper({2, 1, 1, 0, 1, 1}); + results = cudf::strings::count_re(view, *prog_ml); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, counts); + + pattern = std::string("q.*l"); + prog = cudf::strings::regex_program::create(pattern); + expected = cudf::test::fixed_width_column_wrapper({0, 1, 0, 0, 0, 0}); + results = cudf::strings::contains_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + // inst ANY will stop matching on first 'newline' and so should not match anything here + prog = cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::EXT_NEWLINE); + expected = cudf::test::fixed_width_column_wrapper({0, 0, 0, 0, 0, 0}); + results = cudf::strings::contains_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + // including the DOTALL flag accepts the newline characters + auto dot_flags = static_cast(cudf::strings::regex_flags::EXT_NEWLINE | + cudf::strings::regex_flags::DOTALL); + prog = cudf::strings::regex_program::create(pattern, dot_flags); + expected = cudf::test::fixed_width_column_wrapper({0, 1, 0, 0, 0, 0}); + results = cudf::strings::contains_re(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + TEST_F(StringsContainsTests, EndOfString) { auto input = cudf::test::strings_column_wrapper( diff --git a/cpp/tests/strings/extract_tests.cpp b/cpp/tests/strings/extract_tests.cpp index b26cbd5a549..1491da758d5 100644 --- a/cpp/tests/strings/extract_tests.cpp +++ b/cpp/tests/strings/extract_tests.cpp @@ -14,9 +14,12 @@ * limitations under the License. 
*/ +#include "special_chars.h" + #include #include #include +#include #include #include @@ -200,6 +203,43 @@ TEST_F(StringsExtractTests, DotAll) CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected); } +TEST_F(StringsExtractTests, SpecialNewLines) +{ + auto input = cudf::test::strings_column_wrapper({"zzé" NEXT_LINE "qqq" LINE_SEPARATOR "zzé", + "qqq" LINE_SEPARATOR "zzé\rlll", + "zzé", + "", + "zzé" NEXT_LINE, + "abc" PARAGRAPH_SEPARATOR "zzé\n"}); + auto view = cudf::strings_column_view(input); + + auto prog = + cudf::strings::regex_program::create("(^zzé$)", cudf::strings::regex_flags::EXT_NEWLINE); + auto results = cudf::strings::extract(view, *prog); + auto expected = + cudf::test::strings_column_wrapper({"", "", "zzé", "", "zzé", ""}, {0, 0, 1, 0, 1, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected); + + auto both_flags = static_cast( + cudf::strings::regex_flags::EXT_NEWLINE | cudf::strings::regex_flags::MULTILINE); + auto prog_ml = cudf::strings::regex_program::create("^(zzé)$", both_flags); + results = cudf::strings::extract(view, *prog_ml); + expected = + cudf::test::strings_column_wrapper({"zzé", "zzé", "zzé", "", "zzé", "zzé"}, {1, 1, 1, 0, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected); + + prog = cudf::strings::regex_program::create("q(q.*l)l"); + expected = cudf::test::strings_column_wrapper({"", "qq" LINE_SEPARATOR "zzé\rll", "", "", "", ""}, + {0, 1, 0, 0, 0, 0}); + results = cudf::strings::extract(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected); + // expect no matches here since the newline(s) interrupts the pattern + prog = cudf::strings::regex_program::create("q(q.*l)l", cudf::strings::regex_flags::EXT_NEWLINE); + expected = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, {0, 0, 0, 0, 0, 0}); + results = cudf::strings::extract(view, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected); +} + TEST_F(StringsExtractTests, EmptyExtractTest) { std::vector h_strings{nullptr, "AAA", "AAA_A", "AAA_AAA_", "A__", ""}; diff --git a/cpp/tests/strings/findall_tests.cpp b/cpp/tests/strings/findall_tests.cpp index 4582dcb1e38..47606b9b3ed 100644 --- a/cpp/tests/strings/findall_tests.cpp +++ b/cpp/tests/strings/findall_tests.cpp @@ -14,6 +14,8 @@ * limitations under the License. 
*/ +#include "special_chars.h" + #include #include #include @@ -80,6 +82,32 @@ TEST_F(StringsFindallTests, DotAll) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); } +TEST_F(StringsFindallTests, SpecialNewLines) +{ + auto input = cudf::test::strings_column_wrapper({"zzé" PARAGRAPH_SEPARATOR "qqq\nzzé", + "qqq\nzzé" PARAGRAPH_SEPARATOR "lll", + "zzé", + "", + "zzé\r", + "zzé" LINE_SEPARATOR "zzé" NEXT_LINE}); + auto view = cudf::strings_column_view(input); + + auto prog = + cudf::strings::regex_program::create("(^zzé$)", cudf::strings::regex_flags::EXT_NEWLINE); + auto results = cudf::strings::findall(view, *prog); + using LCW = cudf::test::lists_column_wrapper; + LCW expected({LCW{}, LCW{}, LCW{"zzé"}, LCW{}, LCW{"zzé"}, LCW{}}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + + auto both_flags = static_cast( + cudf::strings::regex_flags::EXT_NEWLINE | cudf::strings::regex_flags::MULTILINE); + auto prog_ml = cudf::strings::regex_program::create("^(zzé)$", both_flags); + results = cudf::strings::findall(view, *prog_ml); + LCW expected_ml( + {LCW{"zzé", "zzé"}, LCW{"zzé"}, LCW{"zzé"}, LCW{}, LCW{"zzé"}, LCW{"zzé", "zzé"}}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected_ml); +} + TEST_F(StringsFindallTests, MediumRegex) { // This results in 15 regex instructions and falls in the 'medium' range. diff --git a/cpp/tests/strings/replace_regex_tests.cpp b/cpp/tests/strings/replace_regex_tests.cpp index 8c0482653fb..9847d8d6bb5 100644 --- a/cpp/tests/strings/replace_regex_tests.cpp +++ b/cpp/tests/strings/replace_regex_tests.cpp @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "special_chars.h" + #include #include #include @@ -245,6 +247,53 @@ TEST_F(StringsReplaceRegexTest, Multiline) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, br_expected); } +TEST_F(StringsReplaceRegexTest, SpecialNewLines) +{ + auto input = cudf::test::strings_column_wrapper({"zzé" NEXT_LINE "qqq" NEXT_LINE "zzé", + "qqq" NEXT_LINE "zzé" NEXT_LINE "lll", + "zzé", + "", + "zzé" PARAGRAPH_SEPARATOR, + "abc\rzzé\r"}); + auto view = cudf::strings_column_view(input); + auto repl = cudf::string_scalar("_"); + auto pattern = std::string("^zzé$"); + auto prog = + cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::EXT_NEWLINE); + auto results = cudf::strings::replace_re(view, *prog, repl); + auto expected = cudf::test::strings_column_wrapper({"zzé" NEXT_LINE "qqq" NEXT_LINE "zzé", + "qqq" NEXT_LINE "zzé" NEXT_LINE "lll", + "_", + "", + "_" PARAGRAPH_SEPARATOR, + "abc\rzzé\r"}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + + auto both_flags = static_cast( + cudf::strings::regex_flags::EXT_NEWLINE | cudf::strings::regex_flags::MULTILINE); + auto prog_ml = cudf::strings::regex_program::create(pattern, both_flags); + results = cudf::strings::replace_re(view, *prog_ml, repl); + expected = cudf::test::strings_column_wrapper({"_" NEXT_LINE "qqq" NEXT_LINE "_", + "qqq" NEXT_LINE "_" NEXT_LINE "lll", + "_", + "", + "_" PARAGRAPH_SEPARATOR, + "abc\r_\r"}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + + auto repl_template = std::string("[\\1]"); + pattern = std::string("(^zzé$)"); + prog = cudf::strings::regex_program::create(pattern, both_flags); + results = cudf::strings::replace_with_backrefs(view, *prog, repl_template); + expected = cudf::test::strings_column_wrapper({"[zzé]" NEXT_LINE "qqq" NEXT_LINE "[zzé]", + "qqq" NEXT_LINE "[zzé]" NEXT_LINE "lll", + "[zzé]", + "", + "[zzé]" PARAGRAPH_SEPARATOR, + "abc\r[zzé]\r"}); + 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); +} + TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexTest) { std::vector h_strings{"the quick brown fox jumps over the lazy dog", diff --git a/cpp/tests/strings/special_chars.h b/cpp/tests/strings/special_chars.h new file mode 100644 index 00000000000..0d630f6bb52 --- /dev/null +++ b/cpp/tests/strings/special_chars.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +namespace cudf::test { + +// special new-line characters for use with regex_flags::EXT_NEWLINE +#define NEXT_LINE "\xC2\x85" +#define LINE_SEPARATOR "\xE2\x80\xA8" +#define PARAGRAPH_SEPARATOR "\xE2\x80\xA9" + +} // namespace cudf::test diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp index 7575a3ba846..e23f3f6e7d8 100644 --- a/cpp/tests/text/minhash_tests.cpp +++ b/cpp/tests/text/minhash_tests.cpp @@ -139,6 +139,41 @@ TEST_F(MinHashTest, MultiSeedWithNullInputRow) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); } +TEST_F(MinHashTest, WordsMinHash) +{ + using LCWS = cudf::test::lists_column_wrapper; + auto validity = cudf::test::iterators::null_at(1); + + LCWS input( + {LCWS({"hello", "abcdéfgh"}), + LCWS{}, + LCWS({"rapids", "moré", "test", "text"}), + LCWS({"The", "quick", "brown", "fox", "jumpéd", "over", "the", "lazy", "brown", "dog"})}, + validity); + + auto view = cudf::lists_column_view(input); + + auto seeds = cudf::test::fixed_width_column_wrapper({1, 2}); + auto results = nvtext::word_minhash(view, cudf::column_view(seeds)); + using LCW32 = cudf::test::lists_column_wrapper; + LCW32 expected({LCW32{2069617641u, 1975382903u}, + LCW32{}, + LCW32{657297235u, 1010955999u}, + LCW32{644643885u, 310002789u}}, + validity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + auto seeds64 = cudf::test::fixed_width_column_wrapper({11, 22}); + auto results64 = nvtext::word_minhash64(view, cudf::column_view(seeds64)); + using LCW64 = cudf::test::lists_column_wrapper; + LCW64 expected64({LCW64{1940333969930105370ul, 272615362982418219ul}, + LCW64{}, + LCW64{5331949571924938590ul, 2088583894581919741ul}, + LCW64{3400468157617183341ul, 2398577492366130055ul}}, + validity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); +} + TEST_F(MinHashTest, EmptyTest) { auto input = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); diff --git a/dependencies.yaml b/dependencies.yaml index 483335c02ff..339adbc5ff9 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -43,15 +43,28 @@ files: includes: - cuda_version - test_cpp - test_python: + test_python_cudf_pandas: output: none includes: - cuda_version - py_version - test_python_common - test_python_cudf - - test_python_dask_cudf - test_python_cudf_pandas + test_python_cudf: + output: none + includes: + - cuda_version + - py_version + - test_python_common + - test_python_cudf + test_python_other: + output: none + includes: + - cuda_version + - py_version + - 
test_python_common + - test_python_dask_cudf test_java: output: none includes: @@ -350,12 +363,12 @@ dependencies: common: - output_types: conda packages: - - fmt>=10.1.1,<11 + - fmt>=11.0.2,<12 - flatbuffers==24.3.25 - librdkafka>=2.5.0,<2.6.0a0 # Align nvcomp version with rapids-cmake - nvcomp==4.0.1 - - spdlog>=1.12.0,<1.13 + - spdlog>=1.14.1,<1.15 rapids_build_skbuild: common: - output_types: [conda, requirements, pyproject] @@ -650,7 +663,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - polars>=1.0,<1.3 + - polars>=1.8,<1.9 run_dask_cudf: common: - output_types: [conda, requirements, pyproject] @@ -707,10 +720,17 @@ dependencies: - matrix: {dependencies: "oldest"} packages: - numba==0.57.* - - numpy==1.23.* - pandas==2.0.* - - pyarrow==14.0.0 - - cupy==12.0.0 # ignored as pip constraint + - matrix: + packages: + - output_types: conda + matrices: + - matrix: {dependencies: "oldest", arch: "aarch64", cuda: "12.*"} + packages: + - cupy==12.2.0 # cupy 12.2.0 is the earliest with CUDA 12 ARM packages. + - matrix: {dependencies: "oldest"} + packages: + - cupy==12.0.0 - matrix: packages: - output_types: requirements @@ -755,6 +775,14 @@ dependencies: - &transformers transformers==4.39.3 - tzdata specific: + - output_types: [conda, requirements] + matrices: + - matrix: {dependencies: "oldest"} + packages: + - numpy==1.23.* + - pyarrow==14.0.0 + - matrix: + packages: - output_types: conda matrices: - matrix: @@ -774,6 +802,15 @@ dependencies: packages: - dask-cuda==24.10.*,>=0.0.0a0 - *numba + specific: + - output_types: [conda, requirements] + matrices: + - matrix: {dependencies: "oldest"} + packages: + - numpy==1.24.* + - pyarrow==14.0.1 + - matrix: + packages: depends_on_libcudf: common: - output_types: conda diff --git a/docs/cudf/source/_static/Polars_GPU_speedup_80GB.png b/docs/cudf/source/_static/Polars_GPU_speedup_80GB.png new file mode 100644 index 00000000000..e472cf66612 Binary files /dev/null and b/docs/cudf/source/_static/Polars_GPU_speedup_80GB.png differ diff --git a/docs/cudf/source/_static/compute_heavy_queries_polars.png b/docs/cudf/source/_static/compute_heavy_queries_polars.png new file mode 100644 index 00000000000..6854ed5a436 Binary files /dev/null and b/docs/cudf/source/_static/compute_heavy_queries_polars.png differ diff --git a/docs/cudf/source/_static/pds_benchmark_polars.png b/docs/cudf/source/_static/pds_benchmark_polars.png new file mode 100644 index 00000000000..d0b48ab2901 Binary files /dev/null and b/docs/cudf/source/_static/pds_benchmark_polars.png differ diff --git a/docs/cudf/source/cudf_pandas/usage.md b/docs/cudf/source/cudf_pandas/usage.md index 0398a8d7086..41838e01dd9 100644 --- a/docs/cudf/source/cudf_pandas/usage.md +++ b/docs/cudf/source/cudf_pandas/usage.md @@ -120,3 +120,23 @@ To profile a script being run from the command line, pass the ```bash python -m cudf.pandas --profile script.py ``` + +### cudf.pandas CLI Features + +Several of the ways to provide input to the `python` interpreter also work with `python -m cudf.pandas`, such as the REPL, the `-c` flag, and reading from stdin. + +Executing `python -m cudf.pandas` with no script name will enter a REPL (read-eval-print loop) similar to the behavior of the normal `python` interpreter. 
+ +The `-c` flag accepts a code string to run, like this: + +```bash +$ python -m cudf.pandas -c "import pandas; print(pandas)" + +``` + +Users can also provide code to execute from stdin, like this: + +```bash +$ echo "import pandas; print(pandas)" | python -m cudf.pandas + +``` diff --git a/docs/cudf/source/cudf_polars/index.rst b/docs/cudf/source/cudf_polars/index.rst new file mode 100644 index 00000000000..0a3a0d86b2c --- /dev/null +++ b/docs/cudf/source/cudf_polars/index.rst @@ -0,0 +1,41 @@ +Polars GPU engine +================= + +cuDF provides an in-memory, GPU-accelerated execution engine for Python users of the Polars Lazy API. +The engine supports most of the core expressions and data types as well as a growing set of more advanced dataframe manipulations +and data file formats. When using the GPU engine, Polars will convert expressions into an optimized query plan and determine +whether the plan is supported on the GPU. If it is not, the execution will transparently fall back to the standard Polars engine +and run on the CPU. + +Benchmark +--------- +We reproduced the `Polars Decision Support (PDS) `__ benchmark to compare Polars GPU engine with the default CPU settings across several dataset sizes. Here are the results: + +.. figure:: ../_static/pds_benchmark_polars.png + :width: 600px + + + +You can see up to 13x speedup using the GPU engine on the compute-heavy PDS queries involving complex aggregation and join operations. Below are the speedups for the top performing queries: + + +.. figure:: ../_static/compute_heavy_queries_polars.png + :width: 1000px + +:emphasis:`PDS-H benchmark | GPU: NVIDIA H100 PCIe | CPU: Intel Xeon W9-3495X (Sapphire Rapids) | Storage: Local NVMe` + +You can reproduce the results by visiting the `Polars Decision Support (PDS) GitHub repository `__. + +Learn More +---------- + +The GPU engine for Polars is now available in Open Beta and the engine is undergoing rapid development. To learn more, visit the `GPU Support page `__ on the Polars website. + +Launch on Google Colab +---------------------- + +.. figure:: ../_static/colab.png + :width: 200px + :target: https://colab.research.google.com/github/rapidsai-community/showcase/blob/main/accelerated_data_processing_examples/polars_gpu_engine_demo.ipynb + + Try out the GPU engine for Polars in a free GPU notebook environment. Sign in with your Google account and `launch the demo on Colab `__. diff --git a/docs/cudf/source/index.rst b/docs/cudf/source/index.rst index 3b8dfa5fe01..1b86cafeb48 100644 --- a/docs/cudf/source/index.rst +++ b/docs/cudf/source/index.rst @@ -29,5 +29,6 @@ other operations. 
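As a minimal usage sketch of the Polars GPU engine described on the new ``cudf_polars`` page above (the ``data.parquet`` path and the ``amount``/``account`` column names are hypothetical, not part of the documented example)::

    import polars as pl

    # Build a lazy query as usual; nothing here is GPU-specific
    q = (
        pl.scan_parquet("data.parquet")
        .filter(pl.col("amount") > 0)
        .group_by("account")
        .agg(pl.col("amount").sum())
    )

    # Request the GPU engine; plans that are not supported on the GPU
    # transparently fall back to the default CPU engine
    result = q.collect(engine="gpu")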
user_guide/index cudf_pandas/index + cudf_polars/index libcudf_docs/index developer_guide/index diff --git a/docs/cudf/source/user_guide/10min.ipynb b/docs/cudf/source/user_guide/10min.ipynb index 2eaa75b3189..95f5f9734dd 100644 --- a/docs/cudf/source/user_guide/10min.ipynb +++ b/docs/cudf/source/user_guide/10min.ipynb @@ -5,9 +5,9 @@ "id": "4c6c548b", "metadata": {}, "source": [ - "# 10 Minutes to cuDF and Dask-cuDF\n", + "# 10 Minutes to cuDF and Dask cuDF\n", "\n", - "Modelled after 10 Minutes to Pandas, this is a short introduction to cuDF and Dask-cuDF, geared mainly towards new users.\n", + "Modelled after 10 Minutes to Pandas, this is a short introduction to cuDF and Dask cuDF, geared mainly towards new users.\n", "\n", "## What are these Libraries?\n", "\n", @@ -18,13 +18,14 @@ "[Dask cuDF](https://github.com/rapidsai/cudf/tree/main/python/dask_cudf) extends Dask where necessary to allow its DataFrame partitions to be processed using cuDF GPU DataFrames instead of Pandas DataFrames. For instance, when you call `dask_cudf.read_csv(...)`, your cluster's GPUs do the work of parsing the CSV file(s) by calling [`cudf.read_csv()`](https://docs.rapids.ai/api/cudf/stable/api_docs/api/cudf.read_csv.html).\n", "\n", "\n", - "> [!NOTE] \n", - "> This notebook uses the explicit Dask cuDF API (`dask_cudf`) for clarity. However, we strongly recommend that you use Dask's [configuration infrastructure](https://docs.dask.org/en/latest/configuration.html) to set the `\"dataframe.backend\"` to `\"cudf\"`, and work with the `dask.dataframe` API directly. Please see the [Dask cuDF documentation](https://github.com/rapidsai/cudf/tree/main/python/dask_cudf) for more information.\n", + "
\n", + "Note: This notebook uses the explicit Dask cuDF API (dask_cudf) for clarity. However, we strongly recommend that you use Dask's configuration infrastructure to set the \"dataframe.backend\" option to \"cudf\", and work with the Dask DataFrame API directly. Please see the Dask cuDF documentation for more information.\n", + "
\n", "\n", "\n", - "## When to use cuDF and Dask-cuDF\n", + "## When to use cuDF and Dask cuDF\n", "\n", - "If your workflow is fast enough on a single GPU or your data comfortably fits in memory on a single GPU, you would want to use cuDF. If you want to distribute your workflow across multiple GPUs, have more data than you can fit in memory on a single GPU, or want to analyze data spread across many files at once, you would want to use Dask-cuDF." + "If your workflow is fast enough on a single GPU or your data comfortably fits in memory on a single GPU, you would want to use cuDF. If you want to distribute your workflow across multiple GPUs, have more data than you can fit in memory on a single GPU, or want to analyze data spread across many files at once, you would want to use Dask cuDF." ] }, { @@ -115,7 +116,7 @@ "source": [ "ds = dask_cudf.from_cudf(s, npartitions=2)\n", "# Note the call to head here to show the first few entries, unlike\n", - "# cuDF objects, dask-cuDF objects do not have a printing\n", + "# cuDF objects, Dask-cuDF objects do not have a printing\n", "# representation that shows values since they may not be in local\n", "# memory.\n", "ds.head(n=3)" @@ -331,11 +332,11 @@ "id": "b17db919", "metadata": {}, "source": [ - "Now we will convert our cuDF dataframe into a dask-cuDF equivalent. Here we call out a key difference: to inspect the data we must call a method (here `.head()` to look at the first few values). In the general case (see the end of this notebook), the data in `ddf` will be distributed across multiple GPUs.\n", + "Now we will convert our cuDF dataframe into a Dask-cuDF equivalent. Here we call out a key difference: to inspect the data we must call a method (here `.head()` to look at the first few values). In the general case (see the end of this notebook), the data in `ddf` will be distributed across multiple GPUs.\n", "\n", - "In this small case, we could call `ddf.compute()` to obtain a cuDF object from the dask-cuDF object. In general, we should avoid calling `.compute()` on large dataframes, and restrict ourselves to using it when we have some (relatively) small postprocessed result that we wish to inspect. Hence, throughout this notebook we will generally call `.head()` to inspect the first few values of a dask-cuDF dataframe, occasionally calling out places where we use `.compute()` and why.\n", + "In this small case, we could call `ddf.compute()` to obtain a cuDF object from the Dask-cuDF object. In general, we should avoid calling `.compute()` on large dataframes, and restrict ourselves to using it when we have some (relatively) small postprocessed result that we wish to inspect. 
Hence, throughout this notebook we will generally call `.head()` to inspect the first few values of a Dask-cuDF dataframe, occasionally calling out places where we use `.compute()` and why.\n", "\n", - "*To understand more of the differences between how cuDF and dask-cuDF behave here, visit the [10 Minutes to Dask](https://docs.dask.org/en/stable/10-minutes-to-dask.html) tutorial after this one.*" + "*To understand more of the differences between how cuDF and Dask cuDF behave here, visit the [10 Minutes to Dask](https://docs.dask.org/en/stable/10-minutes-to-dask.html) tutorial after this one.*" ] }, { @@ -1680,7 +1681,7 @@ "id": "7aa0089f", "metadata": {}, "source": [ - "Note here we call `compute()` rather than `head()` on the dask-cuDF dataframe since we are happy that the number of matching rows will be small (and hence it is reasonable to bring the entire result back)." + "Note here we call `compute()` rather than `head()` on the Dask-cuDF dataframe since we are happy that the number of matching rows will be small (and hence it is reasonable to bring the entire result back)." ] }, { @@ -2393,7 +2394,7 @@ "id": "f6094cbe", "metadata": {}, "source": [ - "Applying functions to a `Series`. Note that applying user defined functions directly with Dask-cuDF is not yet implemented. For now, you can use [map_partitions](http://docs.dask.org/en/stable/generated/dask.dataframe.DataFrame.map_partitions.html) to apply a function to each partition of the distributed dataframe." + "Applying functions to a `Series`. Note that applying user defined functions directly with Dask cuDF is not yet implemented. For now, you can use [map_partitions](http://docs.dask.org/en/stable/generated/dask.dataframe.DataFrame.map_partitions.html) to apply a function to each partition of the distributed dataframe." ] }, { @@ -3492,7 +3493,7 @@ "id": "5ac3b004", "metadata": {}, "source": [ - "Transposing a dataframe, using either the `transpose` method or `T` property. Currently, all columns must have the same type. Transposing is not currently implemented in Dask-cuDF." + "Transposing a dataframe, using either the `transpose` method or `T` property. Currently, all columns must have the same type. Transposing is not currently implemented in Dask cuDF." ] }, { @@ -4181,7 +4182,7 @@ "id": "aa8a445b", "metadata": {}, "source": [ - "To convert the first few entries to pandas, we similarly call `.head()` on the dask-cuDF dataframe to obtain a local cuDF dataframe, which we can then convert." + "To convert the first few entries to pandas, we similarly call `.head()` on the Dask-cuDF dataframe to obtain a local cuDF dataframe, which we can then convert." ] }, { @@ -4899,7 +4900,7 @@ "id": "787eae14", "metadata": {}, "source": [ - "Note that for the dask-cuDF case, we use `dask_cudf.read_csv` in preference to `dask_cudf.from_cudf(cudf.read_csv)` since the former can parallelize across multiple GPUs and handle larger CSV files that would fit in memory on a single GPU." + "Note that for the Dask-cuDF case, we use `dask_cudf.read_csv` in preference to `dask_cudf.from_cudf(cudf.read_csv)` since the former can parallelize across multiple GPUs and handle larger CSV files that would fit in memory on a single GPU." 
] }, { diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/extract.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/extract.rst new file mode 100644 index 00000000000..06f74a38709 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/extract.rst @@ -0,0 +1,6 @@ +======= +extract +======= + +.. automodule:: pylibcudf.strings.extract + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/findall.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/findall.rst new file mode 100644 index 00000000000..9850ee10098 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/findall.rst @@ -0,0 +1,6 @@ +==== +find +==== + +.. automodule:: pylibcudf.strings.findall + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst index 462a756a092..9b1a6b72a88 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst @@ -7,8 +7,12 @@ strings capitalize char_types contains + extract find + findall regex_flags regex_program + repeat replace slice + strip diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/repeat.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/repeat.rst new file mode 100644 index 00000000000..0041fe4c3da --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/repeat.rst @@ -0,0 +1,6 @@ +====== +repeat +====== + +.. automodule:: pylibcudf.strings.repeat + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/strip.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/strip.rst new file mode 100644 index 00000000000..a79774b8e67 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/strip.rst @@ -0,0 +1,6 @@ +===== +strip +===== + +.. automodule:: pylibcudf.strings.strip + :members: diff --git a/docs/dask_cudf/source/best_practices.rst b/docs/dask_cudf/source/best_practices.rst new file mode 100644 index 00000000000..142124163af --- /dev/null +++ b/docs/dask_cudf/source/best_practices.rst @@ -0,0 +1,320 @@ +.. _best-practices: + +Dask cuDF Best Practices +======================== + +This page outlines several important guidelines for using `Dask cuDF +`__ effectively. + +.. note:: + Since Dask cuDF is a backend extension for + `Dask DataFrame `__, + the guidelines discussed in the `Dask DataFrames Best Practices + `__ + documentation also apply to Dask cuDF (excluding any pandas-specific + details). + + +Deployment and Configuration +---------------------------- + +Use Dask-CUDA +~~~~~~~~~~~~~ + +To execute a Dask workflow on multiple GPUs, a Dask cluster must +be deployed with `Dask-CUDA `__ +and `Dask.distributed `__. + +When running on a single machine, the `LocalCUDACluster `__ +convenience function is strongly recommended. No matter how many GPUs are +available on the machine (even one!), using `Dask-CUDA has many advantages +`__ +over default (threaded) execution. Just to list a few: + +* Dask-CUDA makes it easy to pin workers to specific devices. +* Dask-CUDA makes it easy to configure memory-spilling options. +* The distributed scheduler collects useful diagnostic information that can be viewed on a dashboard in real time. + +Please see `Dask-CUDA's API `__ +and `Best Practices `__ +documentation for detailed information. 
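As a minimal sketch (assuming a single machine with one or more NVIDIA GPUs visible to the process), deploying such a cluster usually looks like this::

    from dask_cuda import LocalCUDACluster
    from distributed import Client

    if __name__ == "__main__":
        # One Dask-CUDA worker is started per visible GPU
        cluster = LocalCUDACluster()
        client = Client(cluster)

        # ... build and compute Dask cuDF collections here ...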
Typical ``LocalCUDACluster`` usage +is also illustrated within the multi-GPU section of `Dask cuDF's +`__ documentation. + +.. note:: + When running on cloud infrastructure or HPC systems, it is usually best to + leverage system-specific deployment libraries like `Dask Operator + `__ and `Dask-Jobqueue + `__. + + Please see `the RAPIDS deployment documentation `__ + for further details and examples. + + +Use diagnostic tools +~~~~~~~~~~~~~~~~~~~~ + +The Dask ecosystem includes several diagnostic tools that you should absolutely use. +These tools include an intuitive `browser dashboard +`__ as well as a dedicated +`API for collecting performance profiles +`__. + +No matter the workflow, using the dashboard is strongly recommended. +It provides a visual representation of the worker resources and compute +progress. It also shows basic GPU memory and utilization metrics (under +the ``GPU`` tab). To visualize more detailed GPU metrics in JupyterLab, +use `NVDashboard `__. + + +Enable cuDF spilling +~~~~~~~~~~~~~~~~~~~~ + +When using Dask cuDF for classic ETL workloads, it is usually best +to enable `native spilling support in cuDF +`__. +When using :func:`LocalCUDACluster`, this is easily accomplished by +setting ``enable_cudf_spill=True``. + +When a Dask cuDF workflow includes conversion between DataFrame and Array +representations, native cuDF spilling may be insufficient. For these cases, +`JIT-unspill `__ +is likely to produce better protection from out-of-memory (OOM) errors. +Please see `Dask-CUDA's spilling documentation +`__ for further details +and guidance. + +Use RMM +~~~~~~~ + +Memory allocations in cuDF are significantly faster and more efficient when +the `RAPIDS Memory Manager (RMM) `__ +library is configured appropriately on worker processes. In most cases, the best way to manage +memory is by initializing an RMM pool on each worker before executing a +workflow. When using :func:`LocalCUDACluster`, this is easily accomplished +by setting ``rmm_pool_size`` to a large fraction (e.g. ``0.9``). + +See the `Dask-CUDA memory-management documentation +`__ +for more details. + +Use the Dask DataFrame API +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Although Dask cuDF provides a public ``dask_cudf`` Python module, we +strongly recommended that you use the CPU/GPU portable ``dask.dataframe`` +API instead. Simply `use the Dask configuration system +`__ +to set the ``"dataframe.backend"`` option to ``"cudf"``, and the +``dask_cudf`` module will be imported and used implicitly. + +Be sure to use the :func:`to_backend` method if you need to convert +between the different DataFrame backends. For example:: + + df = df.to_backend("pandas") # This gives us a pandas-backed collection + +.. note:: + Although :func:`to_backend` makes it easy to move data between pandas + and cuDF, repetitive CPU-GPU data movement can degrade performance + significantly. For optimal results, keep your data on the GPU as much + as possible. + +Avoid eager execution +~~~~~~~~~~~~~~~~~~~~~ + +Although Dask DataFrame collections are lazy by default, there are several +notable methods that will result in the immediate execution of the +underlying task graph: + +:func:`compute`: Calling ``ddf.compute()`` will materialize the result of +``ddf`` and return a single cuDF object. This is done by executing the entire +task graph associated with ``ddf`` and concatenating its partitions in +local memory on the client process. + +.. 
note:: + Never call :func:`compute` on a large collection that cannot fit comfortably + in the memory of a single GPU! + +:func:`persist`: Like :func:`compute`, calling ``ddf.persist()`` will +execute the entire task graph associated with ``ddf``. The most important +difference is that the computed partitions will remain in distributed +worker memory instead of being concatenated together on the client process. +Another difference is that :func:`persist` will return immediately when +executing on a distributed cluster. If you need a blocking synchronization +point in your workflow, simply use the :func:`wait` function:: + + ddf = ddf.persist() + wait(ddf) + +.. note:: + Avoid calling :func:`persist` on a large collection that cannot fit comfortably + in global worker memory. If the total sum of the partition sizes is larger + than the sum of all GPU memory, calling persist will result in significant + spilling from device memory. If the individual partition sizes are large, this + is likely to produce an OOM error. + +:func:`len` / :func:`head` / :func:`tail`: Although these operations are used +often within pandas/cuDF code to quickly inspect data, it is best to avoid +them in Dask DataFrame. In most cases, these operations will execute some or all +of the underlying task graph to materialize the collection. + +:func:`sort_values` / :func:`set_index` : These operations both require Dask to +eagerly collect quantile information about the column(s) being targeted by the +global sort operation. See `Avoid Sorting`__ for notes on sorting considerations. + +.. note:: + When using :func:`set_index`, be sure to pass in ``sort=False`` whenever the + global collection does not **need** to be sorted by the new index. + +Avoid Sorting +~~~~~~~~~~~~~ + +`The design of Dask DataFrame `__ +makes it advantageous to work with data that is already sorted along its index at +creation time. For most other cases, it is best to avoid sorting unless the logic +of the workflow makes global ordering absolutely necessary. + +If the purpose of a :func:`sort_values` operation is to ensure that all unique +values in ``by`` will be moved to the same output partition, then `shuffle +`__ +is often the better option. + + +Reading Data +------------ + +Tune the partition size +~~~~~~~~~~~~~~~~~~~~~~~ + +The ideal partition size is usually between 1/32 and 1/8 the memory +capacity of a single GPU. Increasing the partition size will typically +reduce the number of tasks in your workflow and improve the GPU utilization +for each task. However, if the partitions are too large, the risk of OOM +errors can become significant. + +.. note:: + As a general rule of thumb, start with 1/32-1/16 for shuffle-intensive workflows + (e.g. large-scale sorting and joining), and 1/16-1/8 otherwise. For pathologically + skewed data distributions, it may be necessary to target 1/64 or smaller. + This rule of thumb comes from anecdotal optimization and OOM-debugging + experience. Since every workflow is different, choosing the best partition + size is both an art and a science. + +The easiest way to tune the partition size is when the DataFrame collection +is first created by a function like :func:`read_parquet`, :func:`read_csv`, +or :func:`from_map`. For example, both :func:`read_parquet` and :func:`read_csv` +expose a ``blocksize`` argument for adjusting the maximum partition size. + +If the partition size cannot be tuned effectively at creation time, the +`repartition `__ +method can be used as a last resort. 
+ + +Use Parquet +~~~~~~~~~~~ + +`Parquet `__ is the recommended +file format for Dask cuDF. It provides efficient columnar storage and enables +Dask to perform valuable query optimizations like column projection and +predicate pushdown. + +The most important arguments to :func:`read_parquet` are ``blocksize`` and +``aggregate_files``: + +``blocksize``: Use this argument to specify the maximum partition size. +The default is `"256 MiB"`, but larger values are usually more performant +on GPUs with more than 8 GiB of memory. Dask will use the ``blocksize`` +value to map a discrete number of Parquet row-groups (or files) to each +output partition. This mapping will only account for the uncompressed +storage size of each row group, which is usually smaller than the +correspondng ``cudf.DataFrame``. + +``aggregate_files``: Use this argument to specify whether Dask should +map multiple files to the same DataFrame partition. The default is +``False``, but ``aggregate_files=True`` is usually more performant when +the dataset contains many files that are smaller than half of ``blocksize``. + +If you know that your files correspond to a reasonable partition size +before splitting or aggregation, set ``blocksize=None`` to disallow +file splitting. In the absence of column-projection pushdown, this will +result in a simple 1-to-1 mapping between files and output partitions. + +.. note:: + If your workflow requires a strict 1-to-1 mapping between files and + partitions, use :func:`from_map` to manually construct your partitions + with ``cudf.read_parquet``. When :func:`dd.read_parquet` is used, + query-planning optimizations may automatically aggregate distinct files + into the same partition (even when ``aggregate_files=False``). + +.. note:: + Metadata collection can be extremely slow when reading from remote + storage (e.g. S3 and GCS). When reading many remote files that all + correspond to a reasonable partition size, use ``blocksize=None`` + to avoid unnecessary metadata collection. + + +Use :func:`from_map` +~~~~~~~~~~~~~~~~~~~~ + +To implement custom DataFrame-creation logic that is not covered by +existing APIs (like :func:`read_parquet`), use :func:`dask.dataframe.from_map` +whenever possible. The :func:`from_map` API has several advantages +over :func:`from_delayed`: + +* It allows proper lazy execution of your custom logic +* It enables column projection (as long as the mapped function supports a ``columns`` key-word argument) + +See the `from_map API documentation `__ +for more details. + +.. note:: + Whenever possible, be sure to specify the ``meta`` argument to + :func:`from_map`. If this argument is excluded, Dask will need to + materialize the first partition eagerly. If a large RMM pool is in + use on the first visible device, this eager execution on the client + may lead to an OOM error. + + +Sorting, Joining, and Grouping +------------------------------ + +Sorting, joining, and grouping operations all have the potential to +require the global shuffling of data between distinct partitions. +When the initial data fits comfortably in global GPU memory, these +"all-to-all" operations are typically bound by worker-to-worker +communication. When the data is larger than global GPU memory, the +bottleneck is typically device-to-host memory spilling. 
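Before moving on to the shuffle guidelines below, here is a minimal sketch of the :func:`from_map` pattern described above (the file paths, column names, and dtypes are hypothetical)::

    import dask.dataframe as dd
    import cudf

    paths = ["/data/part.0.parquet", "/data/part.1.parquet"]  # hypothetical

    def read_one(path, columns=None):
        # Accepting a `columns` keyword allows column-projection pushdown
        return cudf.read_parquet(path, columns=columns)

    # `meta` should match the real schema of the files; dtypes shown here
    # are placeholders
    meta = cudf.DataFrame({"A": cudf.Series([], dtype="int64"),
                           "B": cudf.Series([], dtype="str")})
    ddf = dd.from_map(read_one, paths, meta=meta)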
+ +Although every workflow is different, the following guidelines +are often recommended: + +* `Use a distributed cluster with Dask-CUDA workers `_ +* `Use native cuDF spilling whenever possible `_ +* Avoid shuffling whenever possible + * Use ``split_out=1`` for low-cardinality groupby aggregations + * Use ``broadcast=True`` for joins when at least one collection comprises a small number of partitions (e.g. ``<=5``) +* `Use UCX `__ if communication is a bottleneck. + +.. note:: + UCX enables Dask-CUDA workers to communicate using high-performance + tansport technologies like `NVLink `__ + and Infiniband. Without UCX, inter-process communication will rely + on TCP sockets. + + +User-defined functions +---------------------- + +Most real-world Dask DataFrame workflows use `map_partitions +`__ +to map user-defined functions across every partition of the underlying data. +This API is a fantastic way to apply custom operations in an intuitive and +scalable way. With that said, the :func:`map_partitions` method will produce +an opaque DataFrame expression that blocks the query-planning `optimizer +`__ from performing +useful optimizations (like projection and filter pushdown). + +Since column-projection pushdown is often the most effective optimization, +it is important to select the necessary columns both before and after calling +:func:`map_partitions`. You can also add explicit filter operations to further +mitigate the loss of filter pushdown. diff --git a/docs/dask_cudf/source/index.rst b/docs/dask_cudf/source/index.rst index 9a216690384..23ca7e49753 100644 --- a/docs/dask_cudf/source/index.rst +++ b/docs/dask_cudf/source/index.rst @@ -3,39 +3,46 @@ You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. -Welcome to dask-cudf's documentation! +Welcome to Dask cuDF's documentation! ===================================== -**Dask-cuDF** (pronounced "DASK KOO-dee-eff") is an extension +**Dask cuDF** (pronounced "DASK KOO-dee-eff") is an extension library for the `Dask `__ parallel computing -framework that provides a `cuDF -`__-backed distributed -dataframe with the same API as `Dask dataframes -`__. +framework. When installed, Dask cuDF is automatically registered +as the ``"cudf"`` dataframe backend for +`Dask DataFrame `__. + +.. note:: + Neither Dask cuDF nor Dask DataFrame provide support for multi-GPU + or multi-node execution on their own. You must also deploy a + `dask.distributed `__ cluster + to leverage multiple GPUs. We strongly recommend using `Dask-CUDA + `__ to simplify the + setup of the cluster, taking advantage of all features of the GPU + and networking hardware. If you are familiar with Dask and `pandas `__ or -`cuDF `__, then Dask-cuDF +`cuDF `__, then Dask cuDF should feel familiar to you. If not, we recommend starting with `10 minutes to Dask `__ followed -by `10 minutes to cuDF and Dask-cuDF +by `10 minutes to cuDF and Dask cuDF `__. -When running on multi-GPU systems, `Dask-CUDA -`__ is recommended to -simplify the setup of the cluster, taking advantage of all features of -the GPU and networking hardware. +After reviewing the sections below, please see the +:ref:`Best Practices ` page for further guidance on +using Dask cuDF effectively. -Using Dask-cuDF + +Using Dask cuDF --------------- -When installed, Dask-cuDF registers itself as a dataframe backend for -Dask. This means that in many cases, using cuDF-backed dataframes requires -only small changes to an existing workflow. 
The minimal change is to -select cuDF as the dataframe backend in :doc:`Dask's -configuration `. To do so, we must set the option -``dataframe.backend`` to ``cudf``. From Python, this can be achieved -like so:: +The Dask DataFrame API (Recommended) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Simply use the `Dask configuration `__ system to +set the ``"dataframe.backend"`` option to ``"cudf"``. From Python, +this can be achieved like so:: import dask @@ -44,52 +51,157 @@ like so:: Alternatively, you can set ``DASK_DATAFRAME__BACKEND=cudf`` in the environment before running your code. -Dataframe creation from on-disk formats -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -If your workflow creates Dask dataframes from on-disk formats -(for example using :func:`dask.dataframe.read_parquet`), then setting -the backend may well be enough to migrate your workflow. +Once this is done, the public Dask DataFrame API will leverage +``cudf`` automatically when a new DataFrame collection is created +from an on-disk format using any of the following ``dask.dataframe`` +functions: -For example, consider reading a dataframe from parquet:: +* :func:`read_parquet` +* :func:`read_json` +* :func:`read_csv` +* :func:`read_orc` +* :func:`read_hdf` +* :func:`from_dict` - import dask.dataframe as dd - - # By default, we obtain a pandas-backed dataframe - df = dd.read_parquet("data.parquet", ...) +For example:: + import dask.dataframe as dd -To obtain a cuDF-backed dataframe, we must set the -``dataframe.backend`` configuration option:: + # By default, we obtain a pandas-backed dataframe + df = dd.read_parquet("data.parquet", ...) import dask - import dask.dataframe as dd dask.config.set({"dataframe.backend": "cudf"}) - # This gives us a cuDF-backed dataframe + # This now gives us a cuDF-backed dataframe df = dd.read_parquet("data.parquet", ...) -This code will use cuDF's GPU-accelerated :func:`parquet reader -` to read partitions of the data. +When other functions are used to create a new collection +(e.g. :func:`from_map`, :func:`from_pandas`, :func:`from_delayed`, +and :func:`from_array`), the backend of the new collection will +depend on the inputs to those functions. For example:: + + import pandas as pd + import cudf + + # This gives us a pandas-backed dataframe + dd.from_pandas(pd.DataFrame({"a": range(10)})) + + # This gives us a cuDF-backed dataframe + dd.from_pandas(cudf.DataFrame({"a": range(10)})) + +An existing collection can always be moved to a specific backend +using the :func:`dask.dataframe.DataFrame.to_backend` API:: + + # This ensures that we have a cuDF-backed dataframe + df = df.to_backend("cudf") + + # This ensures that we have a pandas-backed dataframe + df = df.to_backend("pandas") + +The explicit Dask cuDF API +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In addition to providing the ``"cudf"`` backend for Dask DataFrame, +Dask cuDF also provides an explicit ``dask_cudf`` API:: + + import dask_cudf + + # This always gives us a cuDF-backed dataframe + df = dask_cudf.read_parquet("data.parquet", ...) + +This API is used implicitly by the Dask DataFrame API when the ``"cudf"`` +backend is enabled. Therefore, using it directly will not provide any +performance benefit over the CPU/GPU-portable ``dask.dataframe`` API. +Also, using some parts of the explicit API are incompatible with +automatic query planning (see the next section). + +Query Planning +~~~~~~~~~~~~~~ + +Dask cuDF now provides automatic query planning by default (RAPIDS 24.06+). 
+ +As long as the ``"dataframe.query-planning"`` configuration is set to +``True`` (the default) when ``dask.dataframe`` is first imported, `Dask +Expressions `__ will be used under the hood. + +For example, the following code will automatically benefit from predicate +pushdown when the result is computed:: + + df = dd.read_parquet("/my/parquet/dataset/") + result = df.sort_values('B')['A'] + +Unoptimized expression graph (``df.pprint()``):: + + Projection: columns='A' + SortValues: by=['B'] shuffle_method='tasks' options={} + ReadParquetFSSpec: path='/my/parquet/dataset/' ... + +Simplified expression graph (``df.simplify().pprint()``):: + + Projection: columns='A' + SortValues: by=['B'] shuffle_method='tasks' options={} + ReadParquetFSSpec: path='/my/parquet/dataset/' columns=['A', 'B'] ... + +.. note:: + Dask will automatically simplify the expression graph (within + :func:`optimize`) when the result is converted to a task graph + (via :func:`compute` or :func:`persist`). You do not need to call + :func:`simplify` yourself. + + +Using Multiple GPUs and Multiple Nodes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Whenever possible, Dask cuDF (i.e. Dask DataFrame) will automatically try +to partition your data into small-enough tasks to fit comfortably in the +memory of a single GPU. This means the compute tasks needed to +execute a query can often be streamed to a single GPU process for +out-of-core computing. This also means that the compute tasks can be +executed in parallel over a multi-GPU cluster. + +In order to execute your Dask workflow on multiple GPUs, you will +typically need to use `Dask-CUDA `__ +to deploy a distributed Dask cluster, and +`Distributed `__ +to define a client object. For example:: + + from dask_cuda import LocalCUDACluster + from distributed import Client + + if __name__ == "__main__": + + client = Client( + LocalCUDACluster( + CUDA_VISIBLE_DEVICES="0,1", # Use two workers (on devices 0 and 1) + rmm_pool_size=0.9, # Use 90% of GPU memory as a pool for faster allocations + enable_cudf_spill=True, # Improve device memory stability + local_directory="/fast/scratch/", # Use fast local storage for spilling + ) + ) + + df = dd.read_parquet("/my/parquet/dataset/") + agg = df.groupby('B').sum() + agg.compute() # This will use the cluster defined above + +.. note:: + This example uses :func:`compute` to materialize a concrete + ``cudf.DataFrame`` object in local memory. Never call :func:`compute` + on a large collection that cannot fit comfortably in the memory of a + single GPU! See Dask's `documentation on managing computation + `__ + for more details. -Dataframe creation from in-memory formats -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Please see the `Dask-CUDA `__ +documentation for more information about deploying GPU-aware clusters +(including `best practices +`__). -If you already have a dataframe in memory and want to convert it to a -cuDF-backend one, there are two options depending on whether the -dataframe is already a Dask one or not. If you have a Dask dataframe, -then you can call :func:`dask.dataframe.to_backend` passing ``"cudf"`` -as the backend; if you have a pandas dataframe then you can either -call :func:`dask.dataframe.from_pandas` followed by -:func:`~dask.dataframe.to_backend` or first convert the dataframe with -:func:`cudf.from_pandas` and then parallelise this with -:func:`dask_cudf.from_cudf`. API Reference ------------- -Generally speaking, Dask-cuDF tries to offer exactly the same API as -Dask itself.
There are, however, some minor differences mostly because +Generally speaking, Dask cuDF tries to offer exactly the same API as +Dask DataFrame. There are, however, some minor differences mostly because cuDF does not :doc:`perfectly mirror ` the pandas API, or because cuDF provides additional configuration flags (these mostly occur in data reading and writing interfaces). @@ -97,7 +209,7 @@ flags (these mostly occur in data reading and writing interfaces). As a result, straightforward workflows can be migrated without too much trouble, but more complex ones that utilise more features may need a bit of tweaking. The API documentation describes details of the -differences and all functionality that Dask-cuDF supports. +differences and all functionality that Dask cuDF supports. .. toctree:: :maxdepth: 2 diff --git a/java/src/main/java/ai/rapids/cudf/ColumnVector.java b/java/src/main/java/ai/rapids/cudf/ColumnVector.java index 5a0fbd224ad..6a0f0f6f169 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnVector.java @@ -218,7 +218,13 @@ static long initViewHandle(DType type, int numRows, int nullCount, od, vd, nullCount, numRows, childHandles); } - static ColumnVector fromViewWithContiguousAllocation(long columnViewAddress, DeviceMemoryBuffer buffer) { + /** + * Creates a ColumnVector from a native column_view using a contiguous device allocation. + * + * @param columnViewAddress address of the native column_view + * @param buffer device buffer containing the data referenced by the column view + */ + public static ColumnVector fromViewWithContiguousAllocation(long columnViewAddress, DeviceMemoryBuffer buffer) { return new ColumnVector(columnViewAddress, buffer); } diff --git a/java/src/main/java/ai/rapids/cudf/JSONOptions.java b/java/src/main/java/ai/rapids/cudf/JSONOptions.java index c8308ca17ec..2bb74c3e3b1 100644 --- a/java/src/main/java/ai/rapids/cudf/JSONOptions.java +++ b/java/src/main/java/ai/rapids/cudf/JSONOptions.java @@ -38,6 +38,8 @@ public final class JSONOptions extends ColumnFilterOptions { private final boolean allowLeadingZeros; private final boolean allowNonNumericNumbers; private final boolean allowUnquotedControlChars; + private final boolean cudfPruneSchema; + private final byte lineDelimiter; private JSONOptions(Builder builder) { super(builder); @@ -52,6 +54,16 @@ private JSONOptions(Builder builder) { allowLeadingZeros = builder.allowLeadingZeros; allowNonNumericNumbers = builder.allowNonNumericNumbers; allowUnquotedControlChars = builder.allowUnquotedControlChars; + cudfPruneSchema = builder.cudfPruneSchema; + lineDelimiter = builder.lineDelimiter; + } + + public boolean shouldCudfPruneSchema() { + return cudfPruneSchema; + } + + public byte getLineDelimiter() { + return lineDelimiter; } public boolean isDayFirst() { @@ -123,6 +135,22 @@ public static final class Builder extends ColumnFilterOptions.Builder Byte.MAX_VALUE) { + throw new IllegalArgumentException("Only basic ASCII values are supported as line delimiters " + delimiter); + } + lineDelimiter = (byte)delimiter; + return this; + } + /** * Should json validation be strict or not */ diff --git a/java/src/main/java/ai/rapids/cudf/ParquetWriterOptions.java b/java/src/main/java/ai/rapids/cudf/ParquetWriterOptions.java index 7b58817550d..8c8180436e6 100644 --- a/java/src/main/java/ai/rapids/cudf/ParquetWriterOptions.java +++ b/java/src/main/java/ai/rapids/cudf/ParquetWriterOptions.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2021, NVIDIA CORPORATION. 
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,9 +24,13 @@ */ public final class ParquetWriterOptions extends CompressionMetadataWriterOptions { private final StatisticsFrequency statsGranularity; + private int rowGroupSizeRows; + private long rowGroupSizeBytes; private ParquetWriterOptions(Builder builder) { super(builder); + this.rowGroupSizeRows = builder.rowGroupSizeRows; + this.rowGroupSizeBytes = builder.rowGroupSizeBytes; this.statsGranularity = builder.statsGranularity; } @@ -51,18 +55,38 @@ public static Builder builder() { return new Builder(); } + public int getRowGroupSizeRows() { + return rowGroupSizeRows; + } + + public long getRowGroupSizeBytes() { + return rowGroupSizeBytes; + } + public StatisticsFrequency getStatisticsFrequency() { return statsGranularity; } public static class Builder extends CompressionMetadataWriterOptions.Builder { + private int rowGroupSizeRows = 1000000; //Max of 1 million rows per row group + private long rowGroupSizeBytes = 128 * 1024 * 1024; //Max of 128MB per row group private StatisticsFrequency statsGranularity = StatisticsFrequency.ROWGROUP; public Builder() { super(); } + public Builder withRowGroupSizeRows(int rowGroupSizeRows) { + this.rowGroupSizeRows = rowGroupSizeRows; + return this; + } + + public Builder withRowGroupSizeBytes(long rowGroupSizeBytes) { + this.rowGroupSizeBytes = rowGroupSizeBytes; + return this; + } + public Builder withStatisticsFrequency(StatisticsFrequency statsGranularity) { this.statsGranularity = statsGranularity; return this; diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index cbb126d7ee5..6d370ca27b2 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -258,7 +258,9 @@ private static native long readJSON(int[] numChildren, String[] columnNames, boolean strictValidation, boolean allowLeadingZeros, boolean allowNonNumericNumbers, - boolean allowUnquotedControl) throws CudfException; + boolean allowUnquotedControl, + boolean pruneColumns, + byte lineDelimiter) throws CudfException; private static native long readJSONFromDataSource(int[] numChildren, String[] columnNames, int[] dTypeIds, int[] dTypeScales, @@ -272,6 +274,8 @@ private static native long readJSONFromDataSource(int[] numChildren, String[] co boolean allowLeadingZeros, boolean allowNonNumericNumbers, boolean allowUnquotedControl, + boolean pruneColumns, + byte lineDelimiter, long dsHandle) throws CudfException; private static native long readAndInferJSONFromDataSource(boolean dayFirst, boolean lines, @@ -284,6 +288,7 @@ private static native long readAndInferJSONFromDataSource(boolean dayFirst, bool boolean allowLeadingZeros, boolean allowNonNumericNumbers, boolean allowUnquotedControl, + byte lineDelimiter, long dsHandle) throws CudfException; private static native long readAndInferJSON(long address, long length, @@ -297,7 +302,8 @@ private static native long readAndInferJSON(long address, long length, boolean strictValidation, boolean allowLeadingZeros, boolean allowNonNumericNumbers, - boolean allowUnquotedControl) throws CudfException; + boolean allowUnquotedControl, + byte lineDelimiter) throws CudfException; /** * Read in Parquet formatted data. 
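For reference, a minimal sketch of how a caller might exercise the new JSON line-delimiter and Parquet row-group options introduced above. This is not part of the patch; the class name, sample data, and size limits are illustrative assumptions, and only builder methods that appear elsewhere in this diff (withLines, withLineDelimiter, withRowGroupSizeRows, withRowGroupSizeBytes) are used:

    import ai.rapids.cudf.*;
    import java.nio.charset.StandardCharsets;

    public class NewOptionsSketch {
      public static void main(String[] args) {
        // Two JSON records separated by NUL bytes, so the string values can
        // safely contain embedded newline characters.
        byte[] data = ("{\"a\":\"12\n3\"}\0" + "{\"a\":\"AB\nC\"}\0")
            .getBytes(StandardCharsets.UTF_8);
        Schema schema = Schema.builder().addColumn(DType.STRING, "a").build();
        JSONOptions jsonOpts = JSONOptions.builder()
            .withLines(true)
            .withLineDelimiter('\0')  // new option: record delimiter other than '\n'
            .build();
        try (Table table = Table.readJSON(schema, jsonOpts, data)) {
          System.out.println("rows read: " + table.getRowCount());
        }

        // New Parquet writer knobs; column metadata configuration (not shown here)
        // is still required before these options are used for an actual write.
        ParquetWriterOptions pqOpts = ParquetWriterOptions.builder()
            .withRowGroupSizeRows(10_000)              // cap rows per row group
            .withRowGroupSizeBytes(10L * 1024 * 1024)  // cap bytes per row group
            .build();
      }
    }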
@@ -332,20 +338,22 @@ private static native long[] readAvroFromDataSource(String[] filterColumnNames, /** * Setup everything to write parquet formatted data to a file. - * @param columnNames names that correspond to the table columns - * @param numChildren Children of the top level - * @param flatNumChildren flattened list of children per column - * @param nullable true if the column can have nulls else false - * @param metadataKeys Metadata key names to place in the Parquet file - * @param metadataValues Metadata values corresponding to metadataKeys - * @param compression native compression codec ID - * @param statsFreq native statistics frequency ID - * @param isInt96 true if timestamp type is int96 - * @param precisions precision list containing all the precisions of the decimal types in - * the columns - * @param isMapValues true if a column is a map - * @param isBinaryValues true if a column is a binary - * @param filename local output path + * @param columnNames names that correspond to the table columns + * @param numChildren Children of the top level + * @param flatNumChildren flattened list of children per column + * @param nullable true if the column can have nulls else false + * @param metadataKeys Metadata key names to place in the Parquet file + * @param metadataValues Metadata values corresponding to metadataKeys + * @param compression native compression codec ID + * @param rowGroupSizeRows max #rows in a row group + * @param rowGroupSizeBytes max #bytes in a row group + * @param statsFreq native statistics frequency ID + * @param isInt96 true if timestamp type is int96 + * @param precisions precision list containing all the precisions of the decimal types in + * the columns + * @param isMapValues true if a column is a map + * @param isBinaryValues true if a column is a binary + * @param filename local output path * @return a handle that is used in later calls to writeParquetChunk and writeParquetEnd. */ private static native long writeParquetFileBegin(String[] columnNames, @@ -355,6 +363,8 @@ private static native long writeParquetFileBegin(String[] columnNames, String[] metadataKeys, String[] metadataValues, int compression, + int rowGroupSizeRows, + long rowGroupSizeBytes, int statsFreq, boolean[] isInt96, int[] precisions, @@ -366,20 +376,22 @@ private static native long writeParquetFileBegin(String[] columnNames, /** * Setup everything to write parquet formatted data to a buffer. - * @param columnNames names that correspond to the table columns - * @param numChildren Children of the top level - * @param flatNumChildren flattened list of children per column - * @param nullable true if the column can have nulls else false - * @param metadataKeys Metadata key names to place in the Parquet file - * @param metadataValues Metadata values corresponding to metadataKeys - * @param compression native compression codec ID - * @param statsFreq native statistics frequency ID - * @param isInt96 true if timestamp type is int96 - * @param precisions precision list containing all the precisions of the decimal types in - * the columns - * @param isMapValues true if a column is a map - * @param isBinaryValues true if a column is a binary - * @param consumer consumer of host buffers produced. 
+ * @param columnNames names that correspond to the table columns + * @param numChildren Children of the top level + * @param flatNumChildren flattened list of children per column + * @param nullable true if the column can have nulls else false + * @param metadataKeys Metadata key names to place in the Parquet file + * @param metadataValues Metadata values corresponding to metadataKeys + * @param compression native compression codec ID + * @param rowGroupSizeRows max #rows in a row group + * @param rowGroupSizeBytes max #bytes in a row group + * @param statsFreq native statistics frequency ID + * @param isInt96 true if timestamp type is int96 + * @param precisions precision list containing all the precisions of the decimal types in + * the columns + * @param isMapValues true if a column is a map + * @param isBinaryValues true if a column is a binary + * @param consumer consumer of host buffers produced. * @return a handle that is used in later calls to writeParquetChunk and writeParquetEnd. */ private static native long writeParquetBufferBegin(String[] columnNames, @@ -389,6 +401,8 @@ private static native long writeParquetBufferBegin(String[] columnNames, String[] metadataKeys, String[] metadataValues, int compression, + int rowGroupSizeRows, + long rowGroupSizeBytes, int statsFreq, boolean[] isInt96, int[] precisions, @@ -1300,6 +1314,10 @@ private static Table gatherJSONColumns(Schema schema, TableWithMeta twm, int emp * @return the file parsed as a table on the GPU. */ public static Table readJSON(Schema schema, JSONOptions opts, File path) { + // only prune the schema if one is provided + boolean cudfPruneSchema = schema.getColumnNames() != null && + schema.getColumnNames().length != 0 && + opts.shouldCudfPruneSchema(); try (TableWithMeta twm = new TableWithMeta( readJSON(schema.getFlattenedNumChildren(), schema.getFlattenedColumnNames(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), @@ -1313,7 +1331,9 @@ public static Table readJSON(Schema schema, JSONOptions opts, File path) { opts.strictValidation(), opts.leadingZerosAllowed(), opts.nonNumericNumbersAllowed(), - opts.unquotedControlChars()))) { + opts.unquotedControlChars(), + cudfPruneSchema, + opts.getLineDelimiter()))) { return gatherJSONColumns(schema, twm, -1); } @@ -1396,7 +1416,8 @@ public static TableWithMeta readJSON(JSONOptions opts, HostMemoryBuffer buffer, opts.strictValidation(), opts.leadingZerosAllowed(), opts.nonNumericNumbersAllowed(), - opts.unquotedControlChars())); + opts.unquotedControlChars(), + opts.getLineDelimiter())); } /** @@ -1418,6 +1439,7 @@ public static TableWithMeta readAndInferJSON(JSONOptions opts, DataSource ds) { opts.leadingZerosAllowed(), opts.nonNumericNumbersAllowed(), opts.unquotedControlChars(), + opts.getLineDelimiter(), dsHandle)); return twm; } finally { @@ -1457,6 +1479,10 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b assert len > 0; assert len <= buffer.length - offset; assert offset >= 0 && offset < buffer.length; + // only prune the schema if one is provided + boolean cudfPruneSchema = schema.getColumnNames() != null && + schema.getColumnNames().length != 0 && + opts.shouldCudfPruneSchema(); try (TableWithMeta twm = new TableWithMeta(readJSON( schema.getFlattenedNumChildren(), schema.getFlattenedColumnNames(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), null, @@ -1471,7 +1497,9 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b opts.strictValidation(), 
opts.leadingZerosAllowed(), opts.nonNumericNumbersAllowed(), - opts.unquotedControlChars()))) { + opts.unquotedControlChars(), + cudfPruneSchema, + opts.getLineDelimiter()))) { return gatherJSONColumns(schema, twm, emptyRowCount); } } @@ -1497,6 +1525,10 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds) { */ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds, int emptyRowCount) { long dsHandle = DataSourceHelper.createWrapperDataSource(ds); + // only prune the schema if one is provided + boolean cudfPruneSchema = schema.getColumnNames() != null && + schema.getColumnNames().length != 0 && + opts.shouldCudfPruneSchema(); try (TableWithMeta twm = new TableWithMeta(readJSONFromDataSource(schema.getFlattenedNumChildren(), schema.getFlattenedColumnNames(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), opts.isDayFirst(), @@ -1510,6 +1542,8 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds, int opts.leadingZerosAllowed(), opts.nonNumericNumbersAllowed(), opts.unquotedControlChars(), + cudfPruneSchema, + opts.getLineDelimiter(), dsHandle))) { return gatherJSONColumns(schema, twm, emptyRowCount); } finally { @@ -1820,6 +1854,8 @@ private ParquetTableWriter(ParquetWriterOptions options, File outputFile) { options.getMetadataKeys(), options.getMetadataValues(), options.getCompressionType().nativeId, + options.getRowGroupSizeRows(), + options.getRowGroupSizeBytes(), options.getStatisticsFrequency().nativeId, options.getFlatIsTimeTypeInt96(), options.getFlatPrecision(), @@ -1840,6 +1876,8 @@ private ParquetTableWriter(ParquetWriterOptions options, HostBufferConsumer cons options.getMetadataKeys(), options.getMetadataValues(), options.getCompressionType().nativeId, + options.getRowGroupSizeRows(), + options.getRowGroupSizeBytes(), options.getStatisticsFrequency().nativeId, options.getFlatIsTimeTypeInt96(), options.getFlatPrecision(), diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 40a111209b0..0f77da54152 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1627,6 +1627,7 @@ Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource(JNIEnv* env, jboolean allow_leading_zeros, jboolean allow_nonnumeric_numbers, jboolean allow_unquoted_control, + jbyte line_delimiter, jlong ds_handle) { JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0); @@ -1646,8 +1647,10 @@ Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource(JNIEnv* env, .normalize_single_quotes(static_cast(normalize_single_quotes)) .normalize_whitespace(static_cast(normalize_whitespace)) .mixed_types_as_string(mixed_types_as_string) + .delimiter(static_cast(line_delimiter)) .strict_validation(strict_validation) - .keep_quotes(keep_quotes); + .keep_quotes(keep_quotes) + .prune_columns(false); if (strict_validation) { opts.numeric_leading_zeros(allow_leading_zeros) .nonnumeric_numbers(allow_nonnumeric_numbers) @@ -1676,7 +1679,8 @@ Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env, jboolean strict_validation, jboolean allow_leading_zeros, jboolean allow_nonnumeric_numbers, - jboolean allow_unquoted_control) + jboolean allow_unquoted_control, + jbyte line_delimiter) { JNI_NULL_CHECK(env, buffer, "buffer cannot be null", 0); if (buffer_length <= 0) { @@ -1700,6 +1704,8 @@ Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env, .normalize_whitespace(static_cast(normalize_whitespace)) .strict_validation(strict_validation) 
.mixed_types_as_string(mixed_types_as_string) + .prune_columns(false) + .delimiter(static_cast(line_delimiter)) .keep_quotes(keep_quotes); if (strict_validation) { opts.numeric_leading_zeros(allow_leading_zeros) @@ -1814,6 +1820,8 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env, jboolean allow_leading_zeros, jboolean allow_nonnumeric_numbers, jboolean allow_unquoted_control, + jboolean prune_columns, + jbyte line_delimiter, jlong ds_handle) { JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0); @@ -1848,8 +1856,10 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env, .normalize_single_quotes(static_cast(normalize_single_quotes)) .normalize_whitespace(static_cast(normalize_whitespace)) .mixed_types_as_string(mixed_types_as_string) + .delimiter(static_cast(line_delimiter)) .strict_validation(strict_validation) - .keep_quotes(keep_quotes); + .keep_quotes(keep_quotes) + .prune_columns(prune_columns); if (strict_validation) { opts.numeric_leading_zeros(allow_leading_zeros) .nonnumeric_numbers(allow_nonnumeric_numbers) @@ -1908,7 +1918,9 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env, jboolean strict_validation, jboolean allow_leading_zeros, jboolean allow_nonnumeric_numbers, - jboolean allow_unquoted_control) + jboolean allow_unquoted_control, + jboolean prune_columns, + jbyte line_delimiter) { bool read_buffer = true; if (buffer == 0) { @@ -1957,8 +1969,10 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env, .normalize_single_quotes(static_cast(normalize_single_quotes)) .normalize_whitespace(static_cast(normalize_whitespace)) .mixed_types_as_string(mixed_types_as_string) + .delimiter(static_cast(line_delimiter)) .strict_validation(strict_validation) - .keep_quotes(keep_quotes); + .keep_quotes(keep_quotes) + .prune_columns(prune_columns); if (strict_validation) { opts.numeric_leading_zeros(allow_leading_zeros) .nonnumeric_numbers(allow_nonnumeric_numbers) @@ -2150,6 +2164,8 @@ Java_ai_rapids_cudf_Table_writeParquetBufferBegin(JNIEnv* env, jobjectArray j_metadata_keys, jobjectArray j_metadata_values, jint j_compression, + jint j_row_group_size_rows, + jlong j_row_group_size_bytes, jint j_stats_freq, jbooleanArray j_isInt96, jintArray j_precisions, @@ -2205,6 +2221,8 @@ Java_ai_rapids_cudf_Table_writeParquetBufferBegin(JNIEnv* env, chunked_parquet_writer_options::builder(sink) .metadata(std::move(metadata)) .compression(static_cast(j_compression)) + .row_group_size_rows(j_row_group_size_rows) + .row_group_size_bytes(j_row_group_size_bytes) .stats_level(static_cast(j_stats_freq)) .key_value_metadata({kv_metadata}) .compression_statistics(stats) @@ -2227,6 +2245,8 @@ Java_ai_rapids_cudf_Table_writeParquetFileBegin(JNIEnv* env, jobjectArray j_metadata_keys, jobjectArray j_metadata_values, jint j_compression, + jint j_row_group_size_rows, + jlong j_row_group_size_bytes, jint j_stats_freq, jbooleanArray j_isInt96, jintArray j_precisions, @@ -2280,6 +2300,8 @@ Java_ai_rapids_cudf_Table_writeParquetFileBegin(JNIEnv* env, chunked_parquet_writer_options::builder(sink) .metadata(std::move(metadata)) .compression(static_cast(j_compression)) + .row_group_size_rows(j_row_group_size_rows) + .row_group_size_bytes(j_row_group_size_bytes) .stats_level(static_cast(j_stats_freq)) .key_value_metadata({kv_metadata}) .compression_statistics(stats) diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 56fe63598d9..c7fcb1756b6 100644 --- 
a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -40,7 +40,6 @@ import org.apache.parquet.schema.GroupType; import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.OriginalType; -import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Test; import java.io.*; @@ -656,6 +655,24 @@ void testJSONValidationUnquotedControl() { } } + private static final byte[] CR_JSON_TEST_BUFFER = ("{\"a\":\"12\n3\"}\0" + + "{\"a\":\"AB\nC\"}\0").getBytes(StandardCharsets.UTF_8); + + @Test + void testReadJSONDelim() { + Schema schema = Schema.builder().addColumn(DType.STRING, "a").build(); + JSONOptions opts = JSONOptions.builder() + .withLines(true) + .withLineDelimiter('\0') + .build(); + try (Table expected = new Table.TestBuilder() + .column("12\n3", "AB\nC") + .build(); + Table found = Table.readJSON(schema, opts, CR_JSON_TEST_BUFFER)) { + assertTablesAreEqual(expected, found); + } + } + private static final byte[] NESTED_JSON_DATA_BUFFER = ("{\"a\":{\"c\":\"C1\"}}\n" + "{\"a\":{\"c\":\"C2\", \"b\":\"B2\"}}\n" + "{\"d\":[1,2,3]}\n" + @@ -9122,7 +9139,11 @@ void testParquetWriteToBufferChunked() { columns.add(Columns.STRUCT.name); WriteUtils.buildWriterOptions(optBuilder, columns); ParquetWriterOptions options = optBuilder.build(); - ParquetWriterOptions optionsNoCompress = optBuilder.withCompressionType(CompressionType.NONE).build(); + ParquetWriterOptions optionsNoCompress = + optBuilder.withCompressionType(CompressionType.NONE) + .withRowGroupSizeRows(10000) + .withRowGroupSizeBytes(10000) + .build(); try (Table table0 = getExpectedFileTable(columns); MyBufferConsumer consumer = new MyBufferConsumer()) { try (TableWriter writer = Table.writeParquetChunked(options, consumer)) { @@ -9208,6 +9229,8 @@ void testParquetWriteToFileUncompressedNoStats() throws IOException { .withDecimalColumn("_c7", 4) .withDecimalColumn("_c8", 6) .withCompressionType(CompressionType.NONE) + .withRowGroupSizeRows(10000) + .withRowGroupSizeBytes(10000) .withStatisticsFrequency(ParquetWriterOptions.StatisticsFrequency.NONE) .build(); try (TableWriter writer = Table.writeParquetChunked(options, tempFile.getAbsoluteFile())) { diff --git a/python/cudf/benchmarks/pytest.ini b/python/cudf/benchmarks/pytest.ini index db24415ef9e..187d91996b2 100644 --- a/python/cudf/benchmarks/pytest.ini +++ b/python/cudf/benchmarks/pytest.ini @@ -6,3 +6,4 @@ python_classes = Bench python_functions = bench_* markers = pandas_incompatible: mark a benchmark that cannot be run with pandas +addopts = --tb=native diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index e27c595edda..99e4c21df8a 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -599,7 +599,6 @@ cdef class Column: children=tuple(children) ) - # TODO: Actually support exposed data pointers. @staticmethod def from_pylibcudf( col, bint data_ptr_exposed=False @@ -616,7 +615,7 @@ cdef class Column: col : pylibcudf.Column The object to copy. data_ptr_exposed : bool - This parameter is not yet supported + Whether the data buffer is exposed. 
Returns ------- @@ -639,16 +638,18 @@ cdef class Column: dtype = dtype_from_pylibcudf_column(col) return cudf.core.column.build_column( - data=as_buffer(col.data().obj) if col.data() is not None else None, + data=as_buffer( + col.data().obj, exposed=data_ptr_exposed + ) if col.data() is not None else None, dtype=dtype, size=col.size(), mask=as_buffer( - col.null_mask().obj + col.null_mask().obj, exposed=data_ptr_exposed ) if col.null_mask() is not None else None, offset=col.offset(), null_count=col.null_count(), children=tuple([ - Column.from_pylibcudf(child) + Column.from_pylibcudf(child, data_ptr_exposed=data_ptr_exposed) for child in col.children() ]) ) diff --git a/python/cudf/cudf/_lib/concat.pyx b/python/cudf/cudf/_lib/concat.pyx index e661059faa3..e6c2d136f0d 100644 --- a/python/cudf/cudf/_lib/concat.pyx +++ b/python/cudf/cudf/_lib/concat.pyx @@ -23,9 +23,9 @@ def concat_columns(object columns): def concat_tables(object tables, bool ignore_index=False): plc_tables = [] for table in tables: - cols = table._data.columns + cols = table._columns if not ignore_index: - cols = table._index._data.columns + cols + cols = table._index._columns + cols plc_tables.append(pylibcudf.Table([c.to_pylibcudf(mode="read") for c in cols])) return data_from_pylibcudf_table( diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index 16182e31c08..49714091f46 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -384,7 +384,7 @@ cdef class _CPackedColumns: p.column_names = input_table._column_names p.column_dtypes = {} - for name, col in input_table._data.items(): + for name, col in input_table._column_labels_and_values: if isinstance(col.dtype, cudf.core.dtypes._BaseDtype): p.column_dtypes[name] = col.dtype diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index 058e884e08b..9ad96f610b3 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -273,7 +273,7 @@ def read_csv( elif isinstance(dtype, abc.Collection): for index, col_dtype in enumerate(dtype): if isinstance(cudf.dtype(col_dtype), cudf.CategoricalDtype): - col_name = df._data.names[index] + col_name = df._column_names[index] df._data[col_name] = df._data[col_name].astype(col_dtype) if names is not None and len(names) and isinstance(names[0], int): diff --git a/python/cudf/cudf/_lib/datetime.pyx b/python/cudf/cudf/_lib/datetime.pyx index 483250dd36f..bc5e085ec39 100644 --- a/python/cudf/cudf/_lib/datetime.pyx +++ b/python/cudf/cudf/_lib/datetime.pyx @@ -17,6 +17,8 @@ from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column from cudf._lib.scalar cimport DeviceScalar +import pylibcudf as plc + @acquire_spill_lock() def add_months(Column col, Column months): @@ -38,43 +40,9 @@ def add_months(Column col, Column months): @acquire_spill_lock() def extract_datetime_component(Column col, object field): - - cdef unique_ptr[column] c_result - cdef column_view col_view = col.view() - - with nogil: - if field == "year": - c_result = move(libcudf_datetime.extract_year(col_view)) - elif field == "month": - c_result = move(libcudf_datetime.extract_month(col_view)) - elif field == "day": - c_result = move(libcudf_datetime.extract_day(col_view)) - elif field == "weekday": - c_result = move(libcudf_datetime.extract_weekday(col_view)) - elif field == "hour": - c_result = move(libcudf_datetime.extract_hour(col_view)) - elif field == "minute": - c_result = move(libcudf_datetime.extract_minute(col_view)) - elif field == 
"second": - c_result = move(libcudf_datetime.extract_second(col_view)) - elif field == "millisecond": - c_result = move( - libcudf_datetime.extract_millisecond_fraction(col_view) - ) - elif field == "microsecond": - c_result = move( - libcudf_datetime.extract_microsecond_fraction(col_view) - ) - elif field == "nanosecond": - c_result = move( - libcudf_datetime.extract_nanosecond_fraction(col_view) - ) - elif field == "day_of_year": - c_result = move(libcudf_datetime.day_of_year(col_view)) - else: - raise ValueError(f"Invalid datetime field: '{field}'") - - result = Column.from_unique_ptr(move(c_result)) + result = Column.from_pylibcudf( + plc.datetime.extract_datetime_component(col.to_pylibcudf(mode="read"), field) + ) if field == "weekday": # Pandas counts Monday-Sunday as 0-6 diff --git a/python/cudf/cudf/_lib/io/utils.pyx b/python/cudf/cudf/_lib/io/utils.pyx index b1900138d94..564daefbae2 100644 --- a/python/cudf/cudf/_lib/io/utils.pyx +++ b/python/cudf/cudf/_lib/io/utils.pyx @@ -179,7 +179,7 @@ cdef update_struct_field_names( ): # Deprecated, remove in favor of add_col_struct_names # when a reader is ported to pylibcudf - for i, (name, col) in enumerate(table._data.items()): + for i, (name, col) in enumerate(table._column_labels_and_values): table._data[name] = update_column_struct_field_names( col, schema_info[i] ) diff --git a/python/cudf/cudf/_lib/nvtext/minhash.pyx b/python/cudf/cudf/_lib/nvtext/minhash.pyx index 5ee15d0e409..59cb8d51440 100644 --- a/python/cudf/cudf/_lib/nvtext/minhash.pyx +++ b/python/cudf/cudf/_lib/nvtext/minhash.pyx @@ -10,6 +10,8 @@ from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.nvtext.minhash cimport ( minhash as cpp_minhash, minhash64 as cpp_minhash64, + word_minhash as cpp_word_minhash, + word_minhash64 as cpp_word_minhash64, ) from pylibcudf.libcudf.types cimport size_type @@ -54,3 +56,39 @@ def minhash64(Column strings, Column seeds, int width): ) return Column.from_unique_ptr(move(c_result)) + + +@acquire_spill_lock() +def word_minhash(Column input, Column seeds): + + cdef column_view c_input = input.view() + cdef column_view c_seeds = seeds.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_word_minhash( + c_input, + c_seeds + ) + ) + + return Column.from_unique_ptr(move(c_result)) + + +@acquire_spill_lock() +def word_minhash64(Column input, Column seeds): + + cdef column_view c_input = input.view() + cdef column_view c_seeds = seeds.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_word_minhash64( + c_input, + c_seeds + ) + ) + + return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index a0155671a26..fa2690c7f21 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -235,16 +235,16 @@ cdef object _process_metadata(object df, df._index = idx elif set(index_col).issubset(names): index_data = df[index_col] - actual_index_names = list(index_col_names.values()) - if len(index_data._data) == 1: + actual_index_names = iter(index_col_names.values()) + if index_data._num_columns == 1: idx = cudf.Index._from_column( - index_data._data.columns[0], - name=actual_index_names[0] + index_data._columns[0], + name=next(actual_index_names) ) else: idx = cudf.MultiIndex.from_frame( index_data, - names=actual_index_names + names=list(actual_index_names) ) df.drop(columns=index_col, inplace=True) df._index = idx @@ -252,7 +252,7 @@ cdef object 
_process_metadata(object df, if use_pandas_metadata: df.index.names = index_col - if len(df._data.names) == 0 and column_index_type is not None: + if df._num_columns == 0 and column_index_type is not None: df._data.label_dtype = cudf.dtype(column_index_type) return df @@ -438,7 +438,7 @@ def write_parquet( object statistics="ROWGROUP", object metadata_file_path=None, object int96_timestamps=False, - object row_group_size_bytes=_ROW_GROUP_SIZE_BYTES_DEFAULT, + object row_group_size_bytes=None, object row_group_size_rows=None, object max_page_size_bytes=None, object max_page_size_rows=None, @@ -616,9 +616,9 @@ cdef class ParquetWriter: Name of the compression to use. Use ``None`` for no compression. statistics : {'ROWGROUP', 'PAGE', 'COLUMN', 'NONE'}, default 'ROWGROUP' Level at which column statistics should be included in file. - row_group_size_bytes: int, default 134217728 + row_group_size_bytes: int, default ``uint64 max`` Maximum size of each stripe of the output. - By default, 134217728 (128MB) will be used. + By default, a virtually infinite size equal to ``uint64 max`` will be used. row_group_size_rows: int, default 1000000 Maximum number of rows of each stripe of the output. By default, 1000000 (10^6 rows) will be used. @@ -661,11 +661,11 @@ cdef class ParquetWriter: def __cinit__(self, object filepath_or_buffer, object index=None, object compression="snappy", str statistics="ROWGROUP", - int row_group_size_bytes=_ROW_GROUP_SIZE_BYTES_DEFAULT, - int row_group_size_rows=1000000, - int max_page_size_bytes=524288, - int max_page_size_rows=20000, - int max_dictionary_size=1048576, + size_t row_group_size_bytes=_ROW_GROUP_SIZE_BYTES_DEFAULT, + size_type row_group_size_rows=1000000, + size_t max_page_size_bytes=524288, + size_type max_page_size_rows=20000, + size_t max_dictionary_size=1048576, bool use_dictionary=True, bool store_schema=False): filepaths_or_buffers = ( diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx index 8d463829a19..60a6795a402 100644 --- a/python/cudf/cudf/_lib/string_casting.pyx +++ b/python/cudf/cudf/_lib/string_casting.pyx @@ -20,13 +20,7 @@ from pylibcudf.libcudf.strings.convert.convert_booleans cimport ( to_booleans as cpp_to_booleans, ) from pylibcudf.libcudf.strings.convert.convert_datetime cimport ( - from_timestamps as cpp_from_timestamps, is_timestamp as cpp_is_timestamp, - to_timestamps as cpp_to_timestamps, -) -from pylibcudf.libcudf.strings.convert.convert_durations cimport ( - from_durations as cpp_from_durations, - to_durations as cpp_to_durations, ) from pylibcudf.libcudf.strings.convert.convert_floats cimport ( from_floats as cpp_from_floats, @@ -48,8 +42,12 @@ from pylibcudf.libcudf.types cimport data_type, type_id from cudf._lib.types cimport underlying_type_t_type_id +import pylibcudf as plc + import cudf +from cudf._lib.types cimport dtype_to_pylibcudf_type + def floating_to_string(Column input_col): cdef column_view input_column_view = input_col.view() @@ -522,19 +520,14 @@ def int2timestamp( A Column with date-time represented in string format """ - cdef column_view input_column_view = input_col.view() cdef string c_timestamp_format = format.encode("UTF-8") - cdef column_view input_strings_names = names.view() - - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_from_timestamps( - input_column_view, - c_timestamp_format, - input_strings_names)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + 
plc.strings.convert.convert_datetime.from_timestamps( + input_col.to_pylibcudf(mode="read"), + c_timestamp_format, + names.to_pylibcudf(mode="read") + ) + ) def timestamp2int(Column input_col, dtype, format): @@ -551,23 +544,15 @@ def timestamp2int(Column input_col, dtype, format): A Column with string represented in date-time format """ - cdef column_view input_column_view = input_col.view() - cdef type_id tid = ( - ( - SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[dtype] + dtype = dtype_to_pylibcudf_type(dtype) + cdef string c_timestamp_format = format.encode('UTF-8') + return Column.from_pylibcudf( + plc.strings.convert.convert_datetime.to_timestamps( + input_col.to_pylibcudf(mode="read"), + dtype, + c_timestamp_format ) ) - cdef data_type out_type = data_type(tid) - cdef string c_timestamp_format = format.encode('UTF-8') - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_to_timestamps( - input_column_view, - out_type, - c_timestamp_format)) - - return Column.from_unique_ptr(move(c_result)) def istimestamp(Column input_col, str format): @@ -613,23 +598,15 @@ def timedelta2int(Column input_col, dtype, format): A Column with string represented in TimeDelta format """ - cdef column_view input_column_view = input_col.view() - cdef type_id tid = ( - ( - SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[dtype] + dtype = dtype_to_pylibcudf_type(dtype) + cdef string c_timestamp_format = format.encode('UTF-8') + return Column.from_pylibcudf( + plc.strings.convert.convert_durations.to_durations( + input_col.to_pylibcudf(mode="read"), + dtype, + c_timestamp_format ) ) - cdef data_type out_type = data_type(tid) - cdef string c_duration_format = format.encode('UTF-8') - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_to_durations( - input_column_view, - out_type, - c_duration_format)) - - return Column.from_unique_ptr(move(c_result)) def int2timedelta(Column input_col, str format): @@ -647,16 +624,13 @@ def int2timedelta(Column input_col, str format): """ - cdef column_view input_column_view = input_col.view() cdef string c_duration_format = format.encode('UTF-8') - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_from_durations( - input_column_view, - c_duration_format)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + plc.strings.convert.convert_durations.from_durations( + input_col.to_pylibcudf(mode="read"), + c_duration_format + ) + ) def int2ip(Column input_col): diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py index 47a194c4fda..4bf8a9b1a8f 100644 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ b/python/cudf/cudf/_lib/strings/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
from cudf._lib.nvtext.edit_distance import edit_distance, edit_distance_matrix from cudf._lib.nvtext.generate_ngrams import ( generate_character_ngrams, @@ -6,7 +6,12 @@ hash_character_ngrams, ) from cudf._lib.nvtext.jaccard import jaccard_index -from cudf._lib.nvtext.minhash import minhash, minhash64 +from cudf._lib.nvtext.minhash import ( + minhash, + minhash64, + word_minhash, + word_minhash64, +) from cudf._lib.nvtext.ngrams_tokenize import ngrams_tokenize from cudf._lib.nvtext.normalize import normalize_characters, normalize_spaces from cudf._lib.nvtext.replace import filter_tokens, replace_tokens diff --git a/python/cudf/cudf/_lib/strings/contains.pyx b/python/cudf/cudf/_lib/strings/contains.pyx index 82f5e06c547..03b4887f200 100644 --- a/python/cudf/cudf/_lib/strings/contains.pyx +++ b/python/cudf/cudf/_lib/strings/contains.pyx @@ -1,27 +1,10 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from cython.operator cimport dereference from libc.stdint cimport uint32_t from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.contains cimport ( - count_re as cpp_count_re, - like as cpp_like, - matches_re as cpp_matches_re, -) -from pylibcudf.libcudf.strings.regex_flags cimport regex_flags -from pylibcudf.libcudf.strings.regex_program cimport regex_program - from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar from pylibcudf.strings import contains from pylibcudf.strings.regex_program import RegexProgram @@ -45,21 +28,10 @@ def count_re(Column source_strings, object reg_ex, uint32_t flags): Returns a Column with count of occurrences of `reg_ex` in each string of `source_strings` """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string reg_ex_string = str(reg_ex).encode() - cdef regex_flags c_flags = flags - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(reg_ex_string, c_flags)) - c_result = move(cpp_count_re( - source_view, - dereference(c_prog) - )) - - return Column.from_unique_ptr(move(c_result)) + prog = RegexProgram.create(str(reg_ex), flags) + return Column.from_pylibcudf( + contains.count_re(source_strings.to_pylibcudf(mode="read"), prog) + ) @acquire_spill_lock() @@ -68,21 +40,10 @@ def match_re(Column source_strings, object reg_ex, uint32_t flags): Returns a Column with each value True if the string matches `reg_ex` regular expression with each record of `source_strings` """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string reg_ex_string = str(reg_ex).encode() - cdef regex_flags c_flags = flags - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(reg_ex_string, c_flags)) - c_result = move(cpp_matches_re( - source_view, - dereference(c_prog) - )) - - return Column.from_unique_ptr(move(c_result)) + prog = RegexProgram.create(str(reg_ex), flags) + return Column.from_pylibcudf( + contains.matches_re(source_strings.to_pylibcudf(mode="read"), prog) + ) @acquire_spill_lock() @@ -91,24 +52,9 @@ def like(Column source_strings, object py_pattern, object py_escape): Returns a Column with each value True if the string matches the `py_pattern` like expression with each 
record of `source_strings` """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef DeviceScalar pattern = py_pattern.device_value - cdef DeviceScalar escape = py_escape.device_value - - cdef const string_scalar* scalar_ptn = ( - pattern.get_raw_ptr() - ) - cdef const string_scalar* scalar_esc = ( - escape.get_raw_ptr() + plc_column = contains.like( + source_strings.to_pylibcudf(mode="read"), + py_pattern.device_value.c_value, + py_escape.device_value.c_value, ) - - with nogil: - c_result = move(cpp_like( - source_view, - scalar_ptn[0], - scalar_esc[0] - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/extract.pyx b/python/cudf/cudf/_lib/strings/extract.pyx index 63f4d57e562..5bf336f4f3c 100644 --- a/python/cudf/cudf/_lib/strings/extract.pyx +++ b/python/cudf/cudf/_lib/strings/extract.pyx @@ -1,21 +1,12 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from cython.operator cimport dereference from libc.stdint cimport uint32_t -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.extract cimport extract as cpp_extract -from pylibcudf.libcudf.strings.regex_flags cimport regex_flags -from pylibcudf.libcudf.strings.regex_program cimport regex_program -from pylibcudf.libcudf.table.table cimport table - from cudf._lib.column cimport Column -from cudf._lib.utils cimport data_from_unique_ptr + +import pylibcudf as plc @acquire_spill_lock() @@ -26,21 +17,8 @@ def extract(Column source_strings, object pattern, uint32_t flags): The returning data contains one row for each subject string, and one column for each group. """ - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - - cdef string pattern_string = str(pattern).encode() - cdef regex_flags c_flags = flags - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_extract( - source_view, - dereference(c_prog) - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) + prog = plc.strings.regex_program.RegexProgram.create(str(pattern), flags) + plc_result = plc.strings.extract.extract( + source_strings.to_pylibcudf(mode="read"), prog ) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_result.columns())) diff --git a/python/cudf/cudf/_lib/strings/findall.pyx b/python/cudf/cudf/_lib/strings/findall.pyx index 3cf2084e30a..0e758d5b322 100644 --- a/python/cudf/cudf/_lib/strings/findall.pyx +++ b/python/cudf/cudf/_lib/strings/findall.pyx @@ -1,21 +1,13 @@ # Copyright (c) 2019-2024, NVIDIA CORPORATION. 
-from cython.operator cimport dereference from libc.stdint cimport uint32_t -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.findall cimport findall as cpp_findall -from pylibcudf.libcudf.strings.regex_flags cimport regex_flags -from pylibcudf.libcudf.strings.regex_program cimport regex_program - from cudf._lib.column cimport Column +import pylibcudf as plc + @acquire_spill_lock() def findall(Column source_strings, object pattern, uint32_t flags): @@ -23,18 +15,11 @@ def findall(Column source_strings, object pattern, uint32_t flags): Returns data with all non-overlapping matches of `pattern` in each string of `source_strings` as a lists column. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string pattern_string = str(pattern).encode() - cdef regex_flags c_flags = flags - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_findall( - source_view, - dereference(c_prog) - )) - - return Column.from_unique_ptr(move(c_result)) + prog = plc.strings.regex_program.RegexProgram.create( + str(pattern), flags + ) + plc_result = plc.strings.findall.findall( + source_strings.to_pylibcudf(mode="read"), + prog, + ) + return Column.from_pylibcudf(plc_result) diff --git a/python/cudf/cudf/_lib/strings/repeat.pyx b/python/cudf/cudf/_lib/strings/repeat.pyx index 42fcfa5d94e..43649d4defe 100644 --- a/python/cudf/cudf/_lib/strings/repeat.pyx +++ b/python/cudf/cudf/_lib/strings/repeat.pyx @@ -1,17 +1,12 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. - -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings cimport repeat as cpp_repeat from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column +import pylibcudf as plc + @acquire_spill_lock() def repeat_scalar(Column source_strings, @@ -21,16 +16,11 @@ def repeat_scalar(Column source_strings, each string in `source_strings` `repeats` number of times. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_repeat.repeat_strings( - source_view, - repeats - )) - - return Column.from_unique_ptr(move(c_result)) + plc_result = plc.strings.repeat.repeat_strings( + source_strings.to_pylibcudf(mode="read"), + repeats + ) + return Column.from_pylibcudf(plc_result) @acquire_spill_lock() @@ -41,14 +31,8 @@ def repeat_sequence(Column source_strings, each string in `source_strings` `repeats` number of times. 
""" - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef column_view repeats_view = repeats.view() - - with nogil: - c_result = move(cpp_repeat.repeat_strings( - source_view, - repeats_view - )) - - return Column.from_unique_ptr(move(c_result)) + plc_result = plc.strings.repeat.repeat_strings( + source_strings.to_pylibcudf(mode="read"), + repeats.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_result) diff --git a/python/cudf/cudf/_lib/strings/strip.pyx b/python/cudf/cudf/_lib/strings/strip.pyx index acf52cb7b9f..38ecb21a94c 100644 --- a/python/cudf/cudf/_lib/strings/strip.pyx +++ b/python/cudf/cudf/_lib/strings/strip.pyx @@ -13,6 +13,7 @@ from pylibcudf.libcudf.strings.strip cimport strip as cpp_strip from cudf._lib.column cimport Column from cudf._lib.scalar cimport DeviceScalar +import pylibcudf as plc @acquire_spill_lock() @@ -25,23 +26,14 @@ def strip(Column source_strings, """ cdef DeviceScalar repl = py_repl.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef const string_scalar* scalar_str = ( - repl.get_raw_ptr() + return Column.from_pylibcudf( + plc.strings.strip.strip( + source_strings.to_pylibcudf(mode="read"), + plc.strings.SideType.BOTH, + repl.c_value + ) ) - with nogil: - c_result = move(cpp_strip( - source_view, - side_type.BOTH, - scalar_str[0] - )) - - return Column.from_unique_ptr(move(c_result)) - @acquire_spill_lock() def lstrip(Column source_strings, diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx index baa08a545ec..40d0c9eac3a 100644 --- a/python/cudf/cudf/_lib/transform.pyx +++ b/python/cudf/cudf/_lib/transform.pyx @@ -3,41 +3,26 @@ from numba.np import numpy_support import cudf -from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES from cudf.core._internals.expressions import parse_expression from cudf.core.buffer import acquire_spill_lock, as_buffer from cudf.utils import cudautils from cython.operator cimport dereference -from libc.stdint cimport uintptr_t from libcpp.memory cimport unique_ptr -from libcpp.pair cimport pair -from libcpp.string cimport string from libcpp.utility cimport move cimport pylibcudf.libcudf.transform as libcudf_transform from pylibcudf cimport transform as plc_transform from pylibcudf.expressions cimport Expression from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.expressions cimport expression -from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view -from pylibcudf.libcudf.types cimport ( - bitmask_type, - data_type, - size_type, - type_id, -) -from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer +from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column -from cudf._lib.types cimport underlying_type_t_type_id -from cudf._lib.utils cimport ( - columns_from_unique_ptr, - data_from_table_view, - table_view_from_columns, -) +from cudf._lib.utils cimport table_view_from_columns + +import pylibcudf as plc @acquire_spill_lock() @@ -46,17 +31,8 @@ def bools_to_mask(Column col): Given an int8 (boolean) column, compress the data from booleans to bits and return a Buffer """ - cdef column_view col_view = col.view() - cdef pair[unique_ptr[device_buffer], size_type] cpp_out - cdef unique_ptr[device_buffer] up_db - - with nogil: - cpp_out = move(libcudf_transform.bools_to_mask(col_view)) - up_db = 
move(cpp_out.first) - - rmm_db = DeviceBuffer.c_from_unique_ptr(move(up_db)) - buf = as_buffer(rmm_db) - return buf + mask, _ = plc_transform.bools_to_mask(col.to_pylibcudf(mode="read")) + return as_buffer(mask) @acquire_spill_lock() @@ -68,22 +44,15 @@ def mask_to_bools(object mask_buffer, size_type begin_bit, size_type end_bit): if not isinstance(mask_buffer, cudf.core.buffer.Buffer): raise TypeError("mask_buffer is not an instance of " "cudf.core.buffer.Buffer") - cdef bitmask_type* bit_mask = ( - mask_buffer.get_ptr(mode="read") + plc_column = plc_transform.mask_to_bools( + mask_buffer.get_ptr(mode="read"), begin_bit, end_bit ) - - cdef unique_ptr[column] result - with nogil: - result = move( - libcudf_transform.mask_to_bools(bit_mask, begin_bit, end_bit) - ) - - return Column.from_unique_ptr(move(result)) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() def nans_to_nulls(Column input): - (mask, _) = plc_transform.nans_to_nulls( + mask, _ = plc_transform.nans_to_nulls( input.to_pylibcudf(mode="read") ) return as_buffer(mask) @@ -91,80 +60,45 @@ def nans_to_nulls(Column input): @acquire_spill_lock() def transform(Column input, op): - cdef column_view c_input = input.view() - cdef string c_str - cdef type_id c_tid - cdef data_type c_dtype - nb_type = numpy_support.from_dtype(input.dtype) nb_signature = (nb_type,) compiled_op = cudautils.compile_udf(op, nb_signature) - c_str = compiled_op[0].encode('UTF-8') np_dtype = cudf.dtype(compiled_op[1]) - try: - c_tid = ( - SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[ - np_dtype - ] - ) - c_dtype = data_type(c_tid) - - except KeyError: - raise TypeError( - "Result of window function has unsupported dtype {}" - .format(np_dtype) - ) - - with nogil: - c_output = move(libcudf_transform.transform( - c_input, - c_str, - c_dtype, - True - )) - - return Column.from_unique_ptr(move(c_output)) + plc_column = plc_transform.transform( + input.to_pylibcudf(mode="read"), + compiled_op[0], + plc.column._datatype_from_dtype_desc(np_dtype.str[1:]), + True + ) + return Column.from_pylibcudf(plc_column) def table_encode(list source_columns): - cdef table_view c_input = table_view_from_columns(source_columns) - cdef pair[unique_ptr[table], unique_ptr[column]] c_result - - with nogil: - c_result = move(libcudf_transform.encode(c_input)) + plc_table, plc_column = plc_transform.encode( + plc.Table([col.to_pylibcudf(mode="read") for col in source_columns]) + ) return ( - columns_from_unique_ptr(move(c_result.first)), - Column.from_unique_ptr(move(c_result.second)) + [Column.from_pylibcudf(col) for col in plc_table.columns()], + Column.from_pylibcudf(plc_column) ) def one_hot_encode(Column input_column, Column categories): - cdef column_view c_view_input = input_column.view() - cdef column_view c_view_categories = categories.view() - cdef pair[unique_ptr[column], table_view] c_result - - with nogil: - c_result = move( - libcudf_transform.one_hot_encode(c_view_input, c_view_categories) - ) - - # Notice, the data pointer of `owner` has been exposed - # through `c_result.second` at this point. 
- owner = Column.from_unique_ptr( - move(c_result.first), data_ptr_exposed=True - ) - - pylist_categories = categories.to_arrow().to_pylist() - encodings, _ = data_from_table_view( - move(c_result.second), - owner=owner, - column_names=[ - x if x is not None else '' for x in pylist_categories - ] + plc_table = plc_transform.one_hot_encode( + input_column.to_pylibcudf(mode="read"), + categories.to_pylibcudf(mode="read"), ) - return encodings + result_columns = [ + Column.from_pylibcudf(col, data_ptr_exposed=True) + for col in plc_table.columns() + ] + result_labels = [ + x if x is not None else '' + for x in categories.to_arrow().to_pylist() + ] + return dict(zip(result_labels, result_columns)) @acquire_spill_lock() diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index cae28d02ef4..8660cca9322 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -49,9 +49,9 @@ cdef table_view table_view_from_table(tbl, ignore_index=False) except*: If True, don't include the index in the columns. """ return table_view_from_columns( - tbl._index._data.columns + tbl._data.columns + tbl._index._columns + tbl._columns if not ignore_index and tbl._index is not None - else tbl._data.columns + else tbl._columns ) @@ -62,7 +62,7 @@ cpdef generate_pandas_metadata(table, index): index_descriptors = [] columns_to_convert = list(table._columns) # Columns - for name, col in table._data.items(): + for name, col in table._column_labels_and_values: if cudf.get_option("mode.pandas_compatible"): # in pandas-compat mode, non-string column names are stringified. col_names.append(str(name)) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index ff114474aa4..a6abd63d042 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1951,7 +1951,7 @@ def drop_duplicates( return self._from_columns_like_self( drop_duplicates( list(self._columns), - keys=range(len(self._data)), + keys=range(len(self._columns)), keep=keep, nulls_are_equal=nulls_are_equal, ), diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 16e6908f308..4463e3280df 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -623,11 +623,9 @@ def extract( "unsupported value for `flags` parameter" ) - data, _ = libstrings.extract(self._column, pat, flags) + data = libstrings.extract(self._column, pat, flags) if len(data) == 1 and expand is False: - data = next(iter(data.values())) - else: - data = data + _, data = data.popitem() return self._return_or_inplace(data, expand=expand) def contains( @@ -5349,6 +5347,76 @@ def minhash64( libstrings.minhash64(self._column, seeds_column, width) ) + def word_minhash(self, seeds: ColumnLike | None = None) -> SeriesOrIndex: + """ + Compute the minhash of a list column of strings. + This uses the MurmurHash3_x86_32 algorithm for the hash function. + + Parameters + ---------- + seeds : ColumnLike + The seeds used for the hash algorithm. + Must be of type uint32. 
+ + Examples + -------- + >>> import cudf + >>> import numpy as np + >>> ls = cudf.Series([["this", "is", "my"], ["favorite", "book"]]) + >>> seeds = cudf.Series([0, 1, 2], dtype=np.uint32) + >>> ls.str.word_minhash(seeds=seeds) + 0 [21141582, 1232889953, 1268336794] + 1 [962346254, 2321233602, 1354839212] + dtype: list + """ + if seeds is None: + seeds_column = column.as_column(0, dtype=np.uint32, length=1) + else: + seeds_column = column.as_column(seeds) + if seeds_column.dtype != np.uint32: + raise ValueError( + f"Expecting a Series with dtype uint32, got {type(seeds)}" + ) + return self._return_or_inplace( + libstrings.word_minhash(self._column, seeds_column) + ) + + def word_minhash64(self, seeds: ColumnLike | None = None) -> SeriesOrIndex: + """ + Compute the minhash of a list column of strings. + This uses the MurmurHash3_x64_128 algorithm for the hash function. + This function generates 2 uint64 values but only the first + uint64 value is used. + + Parameters + ---------- + seeds : ColumnLike + The seeds used for the hash algorithm. + Must be of type uint64. + + Examples + -------- + >>> import cudf + >>> import numpy as np + >>> ls = cudf.Series([["this", "is", "my"], ["favorite", "book"]]) + >>> seeds = cudf.Series([0, 1, 2], dtype=np.uint64) + >>> ls.str.word_minhash64(seeds) + 0 [2603139454418834912, 8644371945174847701, 5541030711534384340] + 1 [5240044617220523711, 5847101123925041457, 153762819128779913] + dtype: list + """ + if seeds is None: + seeds_column = column.as_column(0, dtype=np.uint64, length=1) + else: + seeds_column = column.as_column(seeds) + if seeds_column.dtype != np.uint64: + raise ValueError( + f"Expecting a Series with dtype uint64, got {type(seeds)}" + ) + return self._return_or_inplace( + libstrings.word_minhash64(self._column, seeds_column) + ) + def jaccard_index(self, input: cudf.Series, width: int) -> SeriesOrIndex: """ Compute the Jaccard index between this column and the given diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 09b0f453692..bc093fdaa9a 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -151,9 +151,9 @@ def __setitem__(self, key: abc.Hashable, value: ColumnBase) -> None: self.set_by_label(key, value) def __delitem__(self, key: abc.Hashable) -> None: - old_ncols = len(self._data) + old_ncols = len(self) del self._data[key] - new_ncols = len(self._data) + new_ncols = len(self) self._clear_cache(old_ncols, new_ncols) def __len__(self) -> int: @@ -213,7 +213,7 @@ def level_names(self) -> tuple[abc.Hashable, ...]: @property def nlevels(self) -> int: - if len(self._data) == 0: + if len(self) == 0: return 0 if not self.multiindex: return 1 @@ -226,7 +226,7 @@ def name(self) -> abc.Hashable: @cached_property def nrows(self) -> int: - if len(self._data) == 0: + if len(self) == 0: return 0 else: return len(next(iter(self.values()))) @@ -257,9 +257,9 @@ def _clear_cache(self, old_ncols: int, new_ncols: int) -> None: Parameters ---------- old_ncols: int - len(self._data) before self._data was modified + len(self) before self._data was modified new_ncols: int - len(self._data) after self._data was modified + len(self) after self._data was modified """ cached_properties = ("columns", "names", "_grouped_data") for attr in cached_properties: @@ -335,7 +335,7 @@ def insert( if name in self._data: raise ValueError(f"Cannot insert '{name}', already exists") - old_ncols = len(self._data) + old_ncols = len(self) if loc == -1: loc = old_ncols elif not 
(0 <= loc <= old_ncols): @@ -414,7 +414,7 @@ def get_labels_by_index(self, index: Any) -> tuple: tuple """ if isinstance(index, slice): - start, stop, step = index.indices(len(self._data)) + start, stop, step = index.indices(len(self)) return self.names[start:stop:step] elif pd.api.types.is_integer(index): return (self.names[index],) @@ -526,9 +526,9 @@ def set_by_label(self, key: abc.Hashable, value: ColumnBase) -> None: if len(self) > 0 and len(value) != self.nrows: raise ValueError("All columns must be of equal length") - old_ncols = len(self._data) + old_ncols = len(self) self._data[key] = value - new_ncols = len(self._data) + new_ncols = len(self) self._clear_cache(old_ncols, new_ncols) def _select_by_label_list_like(self, key: tuple) -> Self: @@ -718,12 +718,12 @@ def droplevel(self, level: int) -> None: if level < 0: level += self.nlevels - old_ncols = len(self._data) + old_ncols = len(self) self._data = { _remove_key_level(key, level): value # type: ignore[arg-type] for key, value in self._data.items() } - new_ncols = len(self._data) + new_ncols = len(self) self._level_names = ( self._level_names[:level] + self._level_names[level + 1 :] ) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 58a16a6d504..16b0aa95c35 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -176,7 +176,7 @@ def _can_downcast_to_series(self, df, arg): return False @_performance_tracking - def _downcast_to_series(self, df, arg): + def _downcast_to_series(self, df: DataFrame, arg): """ "Downcast" from a DataFrame to a Series based on Pandas indexing rules @@ -203,16 +203,16 @@ def _downcast_to_series(self, df, arg): # take series along the axis: if axis == 1: - return df[df._data.names[0]] + return df[df._column_names[0]] else: if df._num_columns > 0: dtypes = df.dtypes.values.tolist() normalized_dtype = np.result_type(*dtypes) - for name, col in df._data.items(): + for name, col in df._column_labels_and_values: df[name] = col.astype(normalized_dtype) sr = df.T - return sr[sr._data.names[0]] + return sr[sr._column_names[0]] class _DataFrameLocIndexer(_DataFrameIndexer): @@ -258,7 +258,7 @@ def _getitem_tuple_arg(self, arg): and len(arg) > 1 and is_scalar(arg[1]) ): - return result._data.columns[0].element_indexing(0) + return result._columns[0].element_indexing(0) return result else: if isinstance(arg[0], slice): @@ -310,7 +310,7 @@ def _getitem_tuple_arg(self, arg): else: tmp_col_name = str(uuid4()) cantor_name = "_" + "_".join( - map(str, columns_df._data.names) + map(str, columns_df._column_names) ) if columns_df._data.multiindex: # column names must be appropriate length tuples @@ -1412,7 +1412,7 @@ def __setitem__(self, arg, value): else column.column_empty_like( col, masked=True, newsize=length ) - for key, col in self._data.items() + for key, col in self._column_labels_and_values ) self._data = self._data._from_columns_like_self( new_columns, verify=False @@ -1494,8 +1494,8 @@ def __delitem__(self, name): @_performance_tracking def memory_usage(self, index=True, deep=False) -> cudf.Series: - mem_usage = [col.memory_usage for col in self._data.columns] - names = [str(name) for name in self._data.names] + mem_usage = [col.memory_usage for col in self._columns] + names = [str(name) for name in self._column_names] if index: mem_usage.append(self.index.memory_usage()) names.append("Index") @@ -1725,7 +1725,7 @@ def _concat( [] if are_all_range_index or (ignore_index and not empty_has_index) - else list(f.index._data.columns) + 
else list(f.index._columns) ) + [f._data[name] if name in f._data else None for name in names] for f in objs @@ -1808,7 +1808,7 @@ def _concat( out.index.dtype, cudf.CategoricalDtype ): out = out.set_index(out.index) - for name, col in out._data.items(): + for name, col in out._column_labels_and_values: out._data[name] = col._with_type_metadata( tables[0]._data[name].dtype ) @@ -1831,13 +1831,13 @@ def astype( errors: Literal["raise", "ignore"] = "raise", ): if is_dict_like(dtype): - if len(set(dtype.keys()) - set(self._data.names)) > 0: + if len(set(dtype.keys()) - set(self._column_names)) > 0: raise KeyError( "Only a column name can be used for the " "key in a dtype mappings argument." ) else: - dtype = {cc: dtype for cc in self._data.names} + dtype = {cc: dtype for cc in self._column_names} return super().astype(dtype, copy, errors) def _clean_renderable_dataframe(self, output): @@ -2601,7 +2601,7 @@ def equals(self, other) -> bool: # If all other checks matched, validate names. if ret: for self_name, other_name in zip( - self._data.names, other._data.names + self._column_names, other._column_names ): if self_name != other_name: ret = False @@ -2676,7 +2676,7 @@ def columns(self, columns): ) self._data = ColumnAccessor( - data=dict(zip(pd_columns, self._data.columns)), + data=dict(zip(pd_columns, self._columns)), multiindex=multiindex, level_names=level_names, label_dtype=label_dtype, @@ -2698,7 +2698,7 @@ def _set_columns_like(self, other: ColumnAccessor) -> None: f"got {len(self)} elements" ) self._data = ColumnAccessor( - data=dict(zip(other.names, self._data.columns)), + data=dict(zip(other.names, self._columns)), multiindex=other.multiindex, rangeindex=other.rangeindex, level_names=other.level_names, @@ -2983,7 +2983,7 @@ def set_index( elif isinstance(col, (MultiIndex, pd.MultiIndex)): if isinstance(col, pd.MultiIndex): col = MultiIndex.from_pandas(col) - data_to_add.extend(col._data.columns) + data_to_add.extend(col._columns) names.extend(col.names) elif isinstance( col, (cudf.Series, cudf.Index, pd.Series, pd.Index) @@ -3110,7 +3110,9 @@ def where(self, cond, other=None, inplace=False, axis=None, level=None): ) out = [] - for (name, col), other_col in zip(self._data.items(), other_cols): + for (name, col), other_col in zip( + self._column_labels_and_values, other_cols + ): source_col, other_col = _check_and_cast_columns_with_other( source_col=col, other=other_col, @@ -3314,7 +3316,7 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True): column.column_empty_like( col_data, masked=True, newsize=length ) - for col_data in self._data.values() + for col_data in self._columns ), verify=False, ) @@ -3664,7 +3666,7 @@ def rename( name: col.find_and_replace( to_replace, vals, is_all_na ) - for name, col in self.index._data.items() + for name, col in self.index._column_labels_and_values } ) except OverflowError: @@ -3686,9 +3688,7 @@ def add_prefix(self, prefix, axis=None): raise NotImplementedError("axis is currently not implemented.") # TODO: Change to deep=False when copy-on-write is default out = self.copy(deep=True) - out.columns = [ - prefix + col_name for col_name in list(self._data.keys()) - ] + out.columns = [prefix + col_name for col_name in self._column_names] return out @_performance_tracking @@ -3697,9 +3697,7 @@ def add_suffix(self, suffix, axis=None): raise NotImplementedError("axis is currently not implemented.") # TODO: Change to deep=False when copy-on-write is default out = self.copy(deep=True) - out.columns = [ - col_name + suffix for col_name in 
list(self._data.keys()) - ] + out.columns = [col_name + suffix for col_name in self._column_names] return out @_performance_tracking @@ -4805,7 +4803,7 @@ def _func(x): # pragma: no cover # TODO: naive implementation # this could be written as a single kernel result = {} - for name, col in self._data.items(): + for name, col in self._column_labels_and_values: apply_sr = Series._from_column(col) result[name] = apply_sr.apply(_func)._column @@ -5444,7 +5442,7 @@ def to_pandas( out_index = self.index.to_pandas() out_data = { i: col.to_pandas(nullable=nullable, arrow_type=arrow_type) - for i, col in enumerate(self._data.columns) + for i, col in enumerate(self._columns) } out_df = pd.DataFrame(out_data, index=out_index) @@ -5665,14 +5663,16 @@ def to_arrow(self, preserve_index=None) -> pa.Table: index = index._as_int_index() index.name = "__index_level_0__" if isinstance(index, MultiIndex): - index_descr = list(index._data.names) + index_descr = index._column_names index_levels = index.levels else: index_descr = ( index.names if index.name is not None else ("index",) ) data = data.copy(deep=False) - for gen_name, col_name in zip(index_descr, index._data.names): + for gen_name, col_name in zip( + index_descr, index._column_names + ): data._insert( data.shape[1], gen_name, @@ -5681,7 +5681,7 @@ def to_arrow(self, preserve_index=None) -> pa.Table: out = super(DataFrame, data).to_arrow() metadata = pa.pandas_compat.construct_metadata( - columns_to_convert=[self[col] for col in self._data.names], + columns_to_convert=[self[col] for col in self._column_names], df=self, column_names=out.schema.names, index_levels=index_levels, @@ -5724,12 +5724,12 @@ def to_records(self, index=True, column_dtypes=None, index_dtypes=None): "column_dtypes is currently not supported." ) members = [("index", self.index.dtype)] if index else [] - members += [(col, self[col].dtype) for col in self._data.names] + members += list(self._dtypes) dtype = np.dtype(members) ret = np.recarray(len(self), dtype=dtype) if index: ret["index"] = self.index.to_numpy() - for col in self._data.names: + for col in self._column_names: ret[col] = self[col].to_numpy() return ret @@ -6059,7 +6059,7 @@ def quantile( ) if columns is None: - columns = data_df._data.names + columns = set(data_df._column_names) if isinstance(q, numbers.Number): q_is_number = True @@ -6084,7 +6084,7 @@ def quantile( # Ensure that qs is non-scalar so that we always get a column back. 
interpolation = interpolation or "linear" result = {} - for k in data_df._data.names: + for k in data_df._column_names: if k in columns: ser = data_df[k] res = ser.quantile( @@ -6198,7 +6198,7 @@ def make_false_column_like_self(): if isinstance(values, DataFrame) else {name: values._column for name in self._data} ) - for col, self_col in self._data.items(): + for col, self_col in self._column_labels_and_values: if col in other_cols: other_col = other_cols[col] self_is_cat = isinstance(self_col, CategoricalColumn) @@ -6231,13 +6231,13 @@ def make_false_column_like_self(): else: result[col] = make_false_column_like_self() elif is_dict_like(values): - for name, col in self._data.items(): + for name, col in self._column_labels_and_values: if name in values: result[name] = col.isin(values[name]) else: result[name] = make_false_column_like_self() elif is_list_like(values): - for name, col in self._data.items(): + for name, col in self._column_labels_and_values: result[name] = col.isin(values) else: raise TypeError( @@ -6292,7 +6292,7 @@ def _prepare_for_rowwise_op(self, method, skipna, numeric_only): name: filtered._data[name]._get_mask_as_column() if filtered._data[name].nullable else as_column(True, length=len(filtered._data[name])) - for name in filtered._data.names + for name in filtered._column_names } ) mask = mask.all(axis=1) @@ -6342,7 +6342,7 @@ def count(self, axis=0, numeric_only=False): length = len(self) return Series._from_column( as_column([length - col.null_count for col in self._columns]), - index=cudf.Index(self._data.names), + index=cudf.Index(self._column_names), ) _SUPPORT_AXIS_LOOKUP = { @@ -6409,7 +6409,7 @@ def _reduce( return source._apply_cupy_method_axis_1(op, **kwargs) else: axis_0_results = [] - for col_label, col in source._data.items(): + for col_label, col in source._column_labels_and_values: try: axis_0_results.append(getattr(col, op)(**kwargs)) except AttributeError as err: @@ -6634,7 +6634,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): prepared, mask, common_dtype = self._prepare_for_rowwise_op( method, skipna, numeric_only ) - for col in prepared._data.names: + for col in prepared._column_names: if prepared._data[col].nullable: prepared._data[col] = ( prepared._data[col] @@ -6820,7 +6820,7 @@ def select_dtypes(self, include=None, exclude=None): # remove all exclude types inclusion = inclusion - exclude_subtypes - for k, col in self._data.items(): + for k, col in self._column_labels_and_values: infered_type = cudf_dtype_from_pydata_dtype(col.dtype) if infered_type in inclusion: df._insert(len(df._data), k, col) @@ -6840,7 +6840,7 @@ def to_parquet( statistics="ROWGROUP", metadata_file_path=None, int96_timestamps=False, - row_group_size_bytes=ioutils._ROW_GROUP_SIZE_BYTES_DEFAULT, + row_group_size_bytes=None, row_group_size_rows=None, max_page_size_bytes=None, max_page_size_rows=None, @@ -7192,7 +7192,7 @@ def stack(self, level=-1, dropna=no_default, future_stack=False): # Compute the column indices that serves as the input for # `interleave_columns` column_idx_df = pd.DataFrame( - data=range(len(self._data)), index=named_levels + data=range(self._num_columns), index=named_levels ) column_indices: list[list[int]] = [] @@ -7392,17 +7392,17 @@ def to_struct(self, name=None): ----- Note: a copy of the columns is made. """ - if not all(isinstance(name, str) for name in self._data.names): + if not all(isinstance(name, str) for name in self._column_names): warnings.warn( "DataFrame contains non-string column name(s). 
Struct column " "requires field name to be string. Non-string column names " "will be casted to string as the field name." ) - fields = {str(name): col.dtype for name, col in self._data.items()} + fields = {str(name): dtype for name, dtype in self._dtypes} col = StructColumn( data=None, dtype=cudf.StructDtype(fields=fields), - children=tuple(col.copy(deep=True) for col in self._data.columns), + children=tuple(col.copy(deep=True) for col in self._columns), size=len(self), offset=0, ) @@ -7984,7 +7984,7 @@ def value_counts( diff = set(subset) - set(self._data) if len(diff) != 0: raise KeyError(f"columns {diff} do not exist") - columns = list(self._data.names) if subset is None else subset + columns = list(self._column_names) if subset is None else subset result = ( self.groupby( by=columns, @@ -8105,7 +8105,7 @@ def func(left, right, output): right._column_names ) elif _is_scalar_or_zero_d_array(right): - for name, col in output._data.items(): + for name, col in output._column_labels_and_values: output._data[name] = col.fillna(value) return output else: @@ -8387,7 +8387,7 @@ def extract_col(df, col): and col not in df.index._data and not isinstance(df.index, MultiIndex) ): - return df.index._data.columns[0] + return df.index._column return df.index._data[col] diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 7b2bc85b13b..37ad6b8fabb 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -75,8 +75,15 @@ def _columns(self) -> tuple[ColumnBase, ...]: return self._data.columns @property - def _dtypes(self) -> abc.Iterable: - return zip(self._data.names, (col.dtype for col in self._data.columns)) + def _column_labels_and_values( + self, + ) -> abc.Iterable[tuple[abc.Hashable, ColumnBase]]: + return zip(self._column_names, self._columns) + + @property + def _dtypes(self) -> abc.Generator[tuple[abc.Hashable, Dtype], None, None]: + for label, col in self._column_labels_and_values: + yield label, col.dtype @property def ndim(self) -> int: @@ -87,7 +94,7 @@ def serialize(self): # TODO: See if self._data can be serialized outright header = { "type-serialized": pickle.dumps(type(self)), - "column_names": pickle.dumps(tuple(self._data.names)), + "column_names": pickle.dumps(self._column_names), "column_rangeindex": pickle.dumps(self._data.rangeindex), "column_multiindex": pickle.dumps(self._data.multiindex), "column_label_dtype": pickle.dumps(self._data.label_dtype), @@ -156,7 +163,7 @@ def _mimic_inplace( self, result: Self, inplace: bool = False ) -> Self | None: if inplace: - for col in self._data: + for col in self._column_names: if col in result._data: self._data[col]._mimic_inplace( result._data[col], inplace=True @@ -267,7 +274,7 @@ def __len__(self) -> int: def astype(self, dtype: dict[Any, Dtype], copy: bool = False) -> Self: casted = ( col.astype(dtype.get(col_name, col.dtype), copy=copy) - for col_name, col in self._data.items() + for col_name, col in self._column_labels_and_values ) ca = self._data._from_columns_like_self(casted, verify=False) return self._from_data_like_self(ca) @@ -338,9 +345,7 @@ def equals(self, other) -> bool: return all( self_col.equals(other_col, check_dtypes=True) - for self_col, other_col in zip( - self._data.values(), other._data.values() - ) + for self_col, other_col in zip(self._columns, other._columns) ) @_performance_tracking @@ -434,11 +439,9 @@ def to_array( if dtype is None: if ncol == 1: - dtype = next(iter(self._data.values())).dtype + dtype = next(self._dtypes)[1] else: - dtype = 
find_common_type( - [col.dtype for col in self._data.values()] - ) + dtype = find_common_type([dtype for _, dtype in self._dtypes]) if not isinstance(dtype, numpy.dtype): raise NotImplementedError( @@ -446,12 +449,12 @@ def to_array( ) if self.ndim == 1: - return to_array(self._data.columns[0], dtype) + return to_array(self._columns[0], dtype) else: matrix = module.empty( shape=(len(self), ncol), dtype=dtype, order="F" ) - for i, col in enumerate(self._data.values()): + for i, col in enumerate(self._columns): # TODO: col.values may fail if there is nullable data or an # unsupported dtype. We may want to catch and provide a more # suitable error. @@ -751,7 +754,7 @@ def fillna( filled_columns = [ col.fillna(value[name], method) if name in value else col.copy() - for name, col in self._data.items() + for name, col in self._column_labels_and_values ] return self._mimic_inplace( @@ -764,11 +767,15 @@ def fillna( ) @_performance_tracking - def _drop_column(self, name): - """Drop a column by *name*""" - if name not in self._data: - raise KeyError(f"column '{name}' does not exist") - del self._data[name] + def _drop_column( + self, name: abc.Hashable, errors: Literal["ignore", "raise"] = "raise" + ) -> None: + """Drop a column by *name* inplace.""" + try: + del self._data[name] + except KeyError as err: + if errors != "ignore": + raise KeyError(f"column '{name}' does not exist") from err @_performance_tracking def _quantile_table( @@ -988,7 +995,10 @@ def to_arrow(self): index: [[1,2,3]] """ return pa.Table.from_pydict( - {str(name): col.to_arrow() for name, col in self._data.items()} + { + str(name): col.to_arrow() + for name, col in self._column_labels_and_values + } ) @_performance_tracking @@ -1012,7 +1022,9 @@ def _copy_type_metadata(self: Self, other: Self) -> Self: See `ColumnBase._with_type_metadata` for more information. 
""" - for (name, col), (_, dtype) in zip(self._data.items(), other._dtypes): + for (name, col), (_, dtype) in zip( + self._column_labels_and_values, other._dtypes + ): self._data.set_by_label(name, col._with_type_metadata(dtype)) return self @@ -1422,7 +1434,7 @@ def _split(self, splits): """ return [ self._from_columns_like_self( - libcudf.copying.columns_split([*self._data.columns], splits)[ + libcudf.copying.columns_split(list(self._columns), splits)[ split_idx ], self._column_names, @@ -1432,7 +1444,7 @@ def _split(self, splits): @_performance_tracking def _encode(self): - columns, indices = libcudf.transform.table_encode([*self._columns]) + columns, indices = libcudf.transform.table_encode(list(self._columns)) keys = self._from_columns_like_self(columns) return keys, indices @@ -1578,7 +1590,7 @@ def __neg__(self): col.unary_operator("not") if col.dtype.kind == "b" else -1 * col - for col in self._data.columns + for col in self._columns ) ) ) @@ -1840,9 +1852,7 @@ def __copy__(self): def __invert__(self): """Bitwise invert (~) for integral dtypes, logical NOT for bools.""" return self._from_data_like_self( - self._data._from_columns_like_self( - (~col for col in self._data.columns) - ) + self._data._from_columns_like_self((~col for col in self._columns)) ) @_performance_tracking diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 6424c8af877..cb8cd0cd28b 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -751,10 +751,8 @@ def agg(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): ) and not libgroupby._is_all_scan_aggregate(normalized_aggs): # Even with `sort=False`, pandas guarantees that # groupby preserves the order of rows within each group. - left_cols = list( - self.grouping.keys.drop_duplicates()._data.columns - ) - right_cols = list(result_index._data.columns) + left_cols = list(self.grouping.keys.drop_duplicates()._columns) + right_cols = list(result_index._columns) join_keys = [ _match_join_keys(lcol, rcol, "left") for lcol, rcol in zip(left_cols, right_cols) @@ -1483,7 +1481,7 @@ def _post_process_chunk_results( # the column name should be, especially if we applied # a nameless UDF. result = result.to_frame( - name=grouped_values._data.names[0] + name=grouped_values._column_names[0] ) else: index_data = group_keys._data.copy(deep=True) @@ -1632,7 +1630,7 @@ def mult(df): if func in {"sum", "product"}: # For `sum` & `product`, boolean types # will need to result in `int64` type. 
- for name, col in res._data.items(): + for name, col in res._column_labels_and_values: if col.dtype.kind == "b": res._data[name] = col.astype("int") return res @@ -2715,11 +2713,8 @@ class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin): def _reduce_numeric_only(self, op: str): columns = list( name - for name in self.obj._data.names - if ( - is_numeric_dtype(self.obj._data[name].dtype) - and name not in self.grouping.names - ) + for name, dtype in self.obj._dtypes + if (is_numeric_dtype(dtype) and name not in self.grouping.names) ) return self[columns].agg(op) @@ -3209,7 +3204,7 @@ def values(self) -> cudf.core.frame.Frame: """ # If the key columns are in `obj`, filter them out value_column_names = [ - x for x in self._obj._data.names if x not in self._named_columns + x for x in self._obj._column_names if x not in self._named_columns ] value_columns = self._obj._data.select_by_label(value_column_names) return self._obj.__class__._from_data(value_columns) @@ -3224,8 +3219,8 @@ def _handle_series(self, by): self.names.append(by.name) def _handle_index(self, by): - self._key_columns.extend(by._data.columns) - self.names.extend(by._data.names) + self._key_columns.extend(by._columns) + self.names.extend(by._column_names) def _handle_mapping(self, by): by = cudf.Series(by.values(), index=by.keys()) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index b2bd20c4982..cd07c58c5d9 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -122,13 +122,13 @@ def _lexsorted_equal_range( sort_inds = None sort_vals = idx lower_bound = search_sorted( - [*sort_vals._data.columns], + list(sort_vals._columns), keys, side="left", ascending=sort_vals.is_monotonic_increasing, ).element_indexing(0) upper_bound = search_sorted( - [*sort_vals._data.columns], + list(sort_vals._columns), keys, side="right", ascending=sort_vals.is_monotonic_increasing, @@ -286,6 +286,20 @@ def name(self): def name(self, value): self._name = value + @property + @_performance_tracking + def _column_names(self) -> tuple[Any]: + return (self.name,) + + @property + @_performance_tracking + def _columns(self) -> tuple[ColumnBase]: + return (self._values,) + + @property + def _column_labels_and_values(self) -> Iterable: + return zip(self._column_names, self._columns) + @property # type: ignore @_performance_tracking def start(self) -> int: @@ -1068,7 +1082,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): else: inputs = { name: (col, None, False, None) - for name, col in self._data.items() + for name, col in self._column_labels_and_values } data = self._apply_cupy_ufunc_to_operands( diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index fd6bf37f0e6..5952815deef 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3,7 +3,6 @@ from __future__ import annotations -import numbers import operator import textwrap import warnings @@ -150,24 +149,14 @@ ) -def _get_host_unique(array): +def _get_unique_drop_labels(array): + """Return labels to be dropped for IndexFrame.drop.""" if isinstance(array, (cudf.Series, cudf.Index, ColumnBase)): - return array.unique.to_pandas() - elif isinstance(array, (str, numbers.Number)): - return [array] + yield from np.unique(as_column(array).values_host) + elif is_scalar(array): + yield array else: - return set(array) - - -def _drop_columns(f: Frame, columns: abc.Iterable, errors: str): - for c in columns: - try: - f._drop_column(c) - except KeyError as e: - 
if errors == "ignore": - pass - else: - raise e + yield from set(array) def _indices_from_labels(obj, labels): @@ -294,7 +283,7 @@ def _num_rows(self) -> int: @property def _index_names(self) -> tuple[Any, ...]: # TODO: Tuple[str]? - return self.index._data.names + return self.index._column_names @classmethod def _from_data( @@ -307,6 +296,7 @@ def _from_data( raise ValueError( f"index must be None or a cudf.Index not {type(index).__name__}" ) + # out._num_rows requires .index to be defined out._index = RangeIndex(out._data.nrows) if index is None else index return out @@ -882,7 +872,7 @@ def replace( columns_dtype_map=dict(self._dtypes), ) copy_data = [] - for name, col in self._data.items(): + for name, col in self._column_labels_and_values: try: replaced = col.find_and_replace( to_replace_per_column[name], @@ -2703,11 +2693,11 @@ def sort_index( by.extend( filter( lambda n: n not in handled, - self.index._data.names, + self.index._column_names, ) ) else: - by = list(idx._data.names) + by = list(idx._column_names) inds = idx._get_sorted_inds( by=by, ascending=ascending, na_position=na_position @@ -3013,7 +3003,7 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self: columns_to_slice = [ *( - self.index._data.columns + self.index._columns if keep_index and not has_range_index else [] ), @@ -3210,7 +3200,7 @@ def _empty_like(self, keep_index=True) -> Self: result = self._from_columns_like_self( libcudf.copying.columns_empty_like( [ - *(self.index._data.columns if keep_index else ()), + *(self.index._columns if keep_index else ()), *self._columns, ] ), @@ -3227,7 +3217,7 @@ def _split(self, splits, keep_index=True): columns_split = libcudf.copying.columns_split( [ - *(self.index._data.columns if keep_index else []), + *(self.index._columns if keep_index else []), *self._columns, ], splits, @@ -3763,8 +3753,8 @@ def _reindex( idx_dtype_match = (df.index.nlevels == index.nlevels) and all( _is_same_dtype(left_dtype, right_dtype) for left_dtype, right_dtype in zip( - (col.dtype for col in df.index._data.columns), - (col.dtype for col in index._data.columns), + (dtype for _, dtype in df.index._dtypes), + (dtype for _, dtype in index._dtypes), ) ) @@ -3783,7 +3773,7 @@ def _reindex( (name or 0) if isinstance(self, cudf.Series) else name: col - for name, col in df._data.items() + for name, col in df._column_labels_and_values }, index=df.index, ) @@ -3794,7 +3784,7 @@ def _reindex( index = index if index is not None else df.index if column_names is None: - names = list(df._data.names) + names = list(df._column_names) level_names = self._data.level_names multiindex = self._data.multiindex rangeindex = self._data.rangeindex @@ -3948,7 +3938,7 @@ def round(self, decimals=0, how="half_even"): col.round(decimals[name], how=how) if name in decimals and col.dtype.kind in "fiu" else col.copy(deep=True) - for name, col in self._data.items() + for name, col in self._column_labels_and_values ) return self._from_data_like_self( self._data._from_columns_like_self(cols) @@ -4270,7 +4260,7 @@ def _drop_na_columns(self, how="any", subset=None, thresh=None): else: thresh = len(df) - for name, col in df._data.items(): + for name, col in df._column_labels_and_values: check_col = col.nans_to_nulls() no_threshold_valid_count = ( len(col) - check_col.null_count @@ -4305,7 +4295,7 @@ def _drop_na_rows(self, how="any", subset=None, thresh=None): return self._from_columns_like_self( libcudf.stream_compaction.drop_nulls( - [*self.index._data.columns, *data_columns], + [*self.index._columns, *data_columns], 
how=how, keys=self._positions_from_column_names( subset, offset_by_index_columns=True @@ -4853,7 +4843,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): # This works for Index too inputs = { name: (col, None, False, None) - for name, col in self._data.items() + for name, col in self._column_labels_and_values } index = self.index @@ -4933,7 +4923,7 @@ def repeat(self, repeats, axis=None): """ res = self._from_columns_like_self( Frame._repeat( - [*self.index._data.columns, *self._columns], repeats, axis + [*self.index._columns, *self._columns], repeats, axis ), self._column_names, self._index_names, @@ -5261,15 +5251,14 @@ def drop( out = self.copy() if axis in (1, "columns"): - target = _get_host_unique(target) - - _drop_columns(out, target, errors) + for label in _get_unique_drop_labels(target): + out._drop_column(label, errors=errors) elif axis in (0, "index"): dropped = _drop_rows_by_labels(out, target, level, errors) if columns is not None: - columns = _get_host_unique(columns) - _drop_columns(dropped, columns, errors) + for label in _get_unique_drop_labels(columns): + dropped._drop_column(label, errors=errors) out._mimic_inplace(dropped, inplace=True) @@ -6224,7 +6213,7 @@ def _preprocess_subset(self, subset): not np.iterable(subset) or isinstance(subset, str) or isinstance(subset, tuple) - and subset in self._data.names + and subset in self._column_names ): subset = (subset,) diff = set(subset) - set(self._data) @@ -6306,8 +6295,8 @@ def rank( ) numeric_cols = ( name - for name in self._data.names - if _is_non_decimal_numeric_dtype(self._data[name]) + for name, dtype in self._dtypes + if _is_non_decimal_numeric_dtype(dtype) ) source = self._get_columns_by_label(numeric_cols) if source.empty: diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index b65bc7af832..cfeaca00888 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -140,11 +140,15 @@ def __init__( # right_on. self._using_left_index = bool(left_index) left_on = ( - lhs.index._data.names if left_index else left_on if left_on else on + lhs.index._column_names + if left_index + else left_on + if left_on + else on ) self._using_right_index = bool(right_index) right_on = ( - rhs.index._data.names + rhs.index._column_names if right_index else right_on if right_on @@ -334,18 +338,18 @@ def _merge_results( # All columns from the left table make it into the output. Non-key # columns that share a name with a column in the right table are # suffixed with the provided suffix. - common_names = set(left_result._data.names) & set( - right_result._data.names + common_names = set(left_result._column_names) & set( + right_result._column_names ) cols_to_suffix = common_names - self._key_columns_with_same_name data = { (f"{name}{self.lsuffix}" if name in cols_to_suffix else name): col - for name, col in left_result._data.items() + for name, col in left_result._column_labels_and_values } # The right table follows the same rule as the left table except that # key columns from the right table are removed. - for name, col in right_result._data.items(): + for name, col in right_result._column_labels_and_values: if name in common_names: if name not in self._key_columns_with_same_name: data[f"{name}{self.rsuffix}"] = col @@ -399,7 +403,7 @@ def _sort_result(self, result: cudf.DataFrame) -> cudf.DataFrame: # producing the input result. 
by: list[Any] = [] if self._using_left_index and self._using_right_index: - by.extend(result.index._data.columns) + by.extend(result.index._columns) if not self._using_left_index: by.extend([result._data[col.name] for col in self._left_keys]) if not self._using_right_index: diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index e00890ac5c3..6de3981ba66 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -36,7 +36,7 @@ from cudf.utils.utils import NotIterable, _external_only_api, _is_same_name if TYPE_CHECKING: - from collections.abc import Generator + from collections.abc import Generator, Hashable from typing_extensions import Self @@ -233,8 +233,8 @@ def names(self, value): # to unexpected behavior in some cases. This is # definitely buggy, but we can't disallow non-unique # names either... - self._data = self._data.__class__( - dict(zip(value, self._data.values())), + self._data = type(self._data)( + dict(zip(value, self._columns)), level_names=self._data.level_names, verify=False, ) @@ -693,19 +693,25 @@ def where(self, cond, other=None, inplace=False): @_performance_tracking def _compute_validity_mask(self, index, row_tuple, max_length): """Computes the valid set of indices of values in the lookup""" - lookup = cudf.DataFrame() + lookup_dict = {} for i, row in enumerate(row_tuple): if isinstance(row, slice) and row == slice(None): continue - lookup[i] = cudf.Series(row) - frame = cudf.DataFrame(dict(enumerate(index._data.columns))) + lookup_dict[i] = row + lookup = cudf.DataFrame(lookup_dict) + frame = cudf.DataFrame._from_data( + ColumnAccessor(dict(enumerate(index._columns)), verify=False) + ) with warnings.catch_warnings(): warnings.simplefilter("ignore", FutureWarning) data_table = cudf.concat( [ frame, cudf.DataFrame._from_data( - {"idx": column.as_column(range(len(frame)))} + ColumnAccessor( + {"idx": column.as_column(range(len(frame)))}, + verify=False, + ) ), ], axis=1, @@ -716,7 +722,7 @@ def _compute_validity_mask(self, index, row_tuple, max_length): # TODO: Remove this after merge/join # obtain deterministic ordering. if cudf.get_option("mode.pandas_compatible"): - lookup_order = "_" + "_".join(map(str, lookup._data.names)) + lookup_order = "_" + "_".join(map(str, lookup._column_names)) lookup[lookup_order] = column.as_column(range(len(lookup))) postprocess = operator.methodcaller( "sort_values", by=[lookup_order, "idx"] @@ -784,7 +790,7 @@ def _index_and_downcast(self, result, index, index_key): out_index.insert( out_index._num_columns, k, - cudf.Series._from_column(index._data.columns[k]), + cudf.Series._from_column(index._columns[k]), ) # determine if we should downcast from a DataFrame to a Series @@ -800,19 +806,19 @@ def _index_and_downcast(self, result, index, index_key): ) if need_downcast: result = result.T - return result[result._data.names[0]] + return result[result._column_names[0]] if len(result) == 0 and not slice_access: # Pandas returns an empty Series with a tuple as name # the one expected result column result = cudf.Series._from_data( - {}, name=tuple(col[0] for col in index._data.columns) + {}, name=tuple(col[0] for col in index._columns) ) elif out_index._num_columns == 1: # If there's only one column remaining in the output index, convert # it into an Index and name the final index values according # to that column's name. 
- *_, last_column = index._data.columns + last_column = index._columns[-1] out_index = cudf.Index._from_column( last_column, name=index.names[-1] ) @@ -894,7 +900,7 @@ def __eq__(self, other): [ self_col.equals(other_col) for self_col, other_col in zip( - self._data.values(), other._data.values() + self._columns, other._columns ) ] ) @@ -1041,9 +1047,11 @@ def to_frame( ) @_performance_tracking - def get_level_values(self, level) -> cudf.Index: + def _level_to_ca_label(self, level) -> tuple[Hashable, int]: """ - Return the values at the requested level + Convert a level to a ColumAccessor label and an integer position. + + Useful if self._column_names != self.names. Parameters ---------- @@ -1051,10 +1059,13 @@ def get_level_values(self, level) -> cudf.Index: Returns ------- - An Index containing the values at the requested level. + tuple[Hashable, int] + (ColumnAccessor label corresponding to level, integer position of the level) """ - colnames = self._data.names - if level not in colnames: + colnames = self._column_names + try: + level_idx = colnames.index(level) + except ValueError: if isinstance(level, int): if level < 0: level = level + len(colnames) @@ -1067,8 +1078,22 @@ def get_level_values(self, level) -> cudf.Index: level = colnames[level_idx] else: raise KeyError(f"Level not found: '{level}'") - else: - level_idx = colnames.index(level) + return level, level_idx + + @_performance_tracking + def get_level_values(self, level) -> cudf.Index: + """ + Return the values at the requested level + + Parameters + ---------- + level : int or label + + Returns + ------- + An Index containing the values at the requested level. + """ + level, level_idx = self._level_to_ca_label(level) level_values = cudf.Index._from_column( self._data[level], name=self.names[level_idx] ) @@ -1420,57 +1445,6 @@ def from_arrays( codes=codes, levels=levels, sortorder=sortorder, names=names ) - @_performance_tracking - def _poplevels(self, level) -> None | MultiIndex | cudf.Index: - """ - Remove and return the specified levels from self. - - Parameters - ---------- - level : level name or index, list - One or more levels to remove - - Returns - ------- - Index composed of the removed levels. If only a single level - is removed, a flat index is returned. If no levels are specified - (empty list), None is returned. 
- """ - if not pd.api.types.is_list_like(level): - level = (level,) - - ilevels = sorted(self._level_index_from_level(lev) for lev in level) - - if not ilevels: - return None - - popped_data = {} - popped_names = [] - names = list(self.names) - - # build the popped data and names - for i in ilevels: - n = self._data.names[i] - popped_data[n] = self._data[n] - popped_names.append(self.names[i]) - - # pop the levels out from self - # this must be done iterating backwards - for i in reversed(ilevels): - n = self._data.names[i] - names.pop(i) - popped_data[n] = self._data.pop(n) - - # construct the popped result - popped = cudf.core.index._index_from_data(popped_data) - popped.names = popped_names - - # update self - self.names = names - self._levels, self._codes = _compute_levels_and_codes(self._data) - - return popped - @_performance_tracking def swaplevel(self, i=-2, j=-1) -> Self: """ @@ -1507,10 +1481,10 @@ def swaplevel(self, i=-2, j=-1) -> Self: ('aa', 'b')], ) """ - name_i = self._data.names[i] if isinstance(i, int) else i - name_j = self._data.names[j] if isinstance(j, int) else j + name_i = self._column_names[i] if isinstance(i, int) else i + name_j = self._column_names[j] if isinstance(j, int) else j new_data = {} - for k, v in self._data.items(): + for k, v in self._column_labels_and_values: if k not in (name_i, name_j): new_data[k] = v elif k == name_i: @@ -1523,7 +1497,7 @@ def swaplevel(self, i=-2, j=-1) -> Self: return midx @_performance_tracking - def droplevel(self, level=-1) -> MultiIndex | cudf.Index: + def droplevel(self, level=-1) -> Self | cudf.Index: """ Removes the specified levels from the MultiIndex. @@ -1578,11 +1552,24 @@ def droplevel(self, level=-1) -> MultiIndex | cudf.Index: >>> idx.droplevel(["first", "second"]) Index([0, 1, 2, 0, 1, 2], dtype='int64', name='third') """ - mi = self.copy(deep=False) - mi._poplevels(level) - if mi.nlevels == 1: - return mi.get_level_values(mi.names[0]) + if is_scalar(level): + level = (level,) + elif len(level) == 0: + return self + + new_names = list(self.names) + new_data = self._data.copy(deep=False) + for i in sorted( + (self._level_index_from_level(lev) for lev in level), reverse=True + ): + new_names.pop(i) + new_data.pop(self._data.names[i]) + + if len(new_data) == 1: + return cudf.core.index._index_from_data(new_data) else: + mi = MultiIndex._from_data(new_data) + mi.names = new_names return mi @_performance_tracking @@ -1886,7 +1873,7 @@ def __array_function__(self, func, types, args, kwargs): else: return NotImplemented - def _level_index_from_level(self, level): + def _level_index_from_level(self, level) -> int: """ Return level index from given level name or index """ @@ -1935,7 +1922,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): join_keys = [ _match_join_keys(lcol, rcol, "inner") - for lcol, rcol in zip(target._data.columns, self._data.columns) + for lcol, rcol in zip(target._columns, self._columns) ] join_keys = map(list, zip(*join_keys)) scatter_map, indices = libcudf.join.join( @@ -2132,7 +2119,7 @@ def _split_columns_by_levels( lv if isinstance(lv, int) else level_names.index(lv) for lv in levels } - for i, (name, col) in enumerate(zip(self.names, self._data.columns)): + for i, (name, col) in enumerate(zip(self.names, self._columns)): if in_levels and i in level_indices: name = f"level_{i}" if name is None else name yield name, col @@ -2173,9 +2160,7 @@ def _columns_for_reset_index( ) -> Generator[tuple[Any, column.ColumnBase], None, None]: """Return the columns and column names 
for .reset_index""" if levels is None: - for i, (col, name) in enumerate( - zip(self._data.columns, self.names) - ): + for i, (col, name) in enumerate(zip(self._columns, self.names)): yield f"level_{i}" if name is None else name, col else: yield from self._split_columns_by_levels(levels, in_levels=True) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 3d205957126..401fef67ee6 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -12,6 +12,7 @@ from cudf._lib.transform import one_hot_encode from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default +from cudf.api.types import is_scalar from cudf.core._compat import PANDAS_LT_300 from cudf.core.column import ColumnBase, as_column, column_empty_like from cudf.core.column_accessor import ColumnAccessor @@ -409,7 +410,7 @@ def concat( result_columns = None if keys_objs is None: for o in objs: - for name, col in o._data.items(): + for name, col in o._column_labels_and_values: if name in result_data: raise NotImplementedError( f"A Column with duplicate name found: {name}, cuDF " @@ -437,7 +438,7 @@ def concat( else: # All levels in the multiindex label must have the same type has_multiple_level_types = ( - len({type(name) for o in objs for name in o._data.keys()}) > 1 + len({type(name) for o in objs for name in o._column_names}) > 1 ) if has_multiple_level_types: raise NotImplementedError( @@ -446,7 +447,7 @@ def concat( "the labels to the same type." ) for k, o in zip(keys_objs, objs): - for name, col in o._data.items(): + for name, col in o._column_labels_and_values: # if only series, then only keep keys_objs as column labels # if the existing column is multiindex, prepend it # to handle cases where dfs and srs are concatenated @@ -738,7 +739,8 @@ def get_dummies( sparse : boolean, optional Right now this is NON-FUNCTIONAL argument in rapids. drop_first : boolean, optional - Right now this is NON-FUNCTIONAL argument in rapids. + Whether to get k-1 dummies out of k categorical levels by removing the + first level. columns : sequence of str, optional Names of columns to encode. If not provided, will attempt to encode all columns. 
Note this is different from pandas default behavior, which @@ -806,9 +808,6 @@ def get_dummies( if sparse: raise NotImplementedError("sparse is not supported yet") - if drop_first: - raise NotImplementedError("drop_first is not supported yet") - if isinstance(data, cudf.DataFrame): encode_fallback_dtypes = ["object", "category"] @@ -844,7 +843,7 @@ def get_dummies( else: result_data = { col_name: col - for col_name, col in data._data.items() + for col_name, col in data._column_labels_and_values if col_name not in columns } @@ -862,6 +861,7 @@ def get_dummies( prefix=prefix_map.get(name, prefix), prefix_sep=prefix_sep_map.get(name, prefix_sep), dtype=dtype, + drop_first=drop_first, ) result_data.update(col_enc_data) return cudf.DataFrame._from_data(result_data, index=data.index) @@ -874,6 +874,7 @@ def get_dummies( prefix=prefix, prefix_sep=prefix_sep, dtype=dtype, + drop_first=drop_first, ) return cudf.DataFrame._from_data(data, index=ser.index) @@ -942,7 +943,7 @@ def _merge_sorted( columns = [ [ - *(obj.index._data.columns if not ignore_index else ()), + *(obj.index._columns if not ignore_index else ()), *obj._columns, ] for obj in objs @@ -984,7 +985,7 @@ def as_tuple(x): return x if isinstance(x, tuple) else (x,) nrows = len(index_labels) - for col_label, col in df._data.items(): + for col_label, col in df._column_labels_and_values: names = [ as_tuple(col_label) + as_tuple(name) for name in column_labels ] @@ -1008,7 +1009,7 @@ def as_tuple(x): ca = ColumnAccessor( result, multiindex=True, - level_names=(None,) + columns._data.names, + level_names=(None,) + columns._column_names, verify=False, ) return cudf.DataFrame._from_data( @@ -1086,11 +1087,7 @@ def pivot(data, columns=None, index=no_default, values=no_default): # Create a DataFrame composed of columns from both # columns and index ca = ColumnAccessor( - dict( - enumerate( - itertools.chain(index._data.columns, columns._data.columns) - ) - ), + dict(enumerate(itertools.chain(index._columns, columns._columns))), verify=False, ) columns_index = cudf.DataFrame._from_data(ca) @@ -1227,13 +1224,24 @@ def unstack(df, level, fill_value=None, sort: bool = True): ) return res else: - df = df.copy(deep=False) - columns = df.index._poplevels(level) - index = df.index - result = _pivot(df, index, columns) - if result.index.nlevels == 1: - result.index = result.index.get_level_values(result.index.names[0]) - return result + index = df.index.droplevel(level) + if is_scalar(level): + columns = df.index.get_level_values(level) + else: + new_names = [] + ca_data = {} + for lev in level: + ca_level, level_idx = df.index._level_to_ca_label(lev) + new_names.append(df.index.names[level_idx]) + ca_data[ca_level] = df.index._data[ca_level] + columns = type(df.index)._from_data( + ColumnAccessor(ca_data, verify=False) + ) + columns.names = new_names + result = _pivot(df, index, columns) + if result.index.nlevels == 1: + result.index = result.index.get_level_values(result.index.names[0]) + return result def _get_unique(column: ColumnBase, dummy_na: bool) -> ColumnBase: @@ -1256,6 +1264,7 @@ def _one_hot_encode_column( prefix: str | None, prefix_sep: str | None, dtype: Dtype | None, + drop_first: bool, ) -> dict[str, ColumnBase]: """Encode a single column with one hot encoding. The return dictionary contains pairs of (category, encodings). 
The keys may be prefixed with @@ -1276,6 +1285,8 @@ def _one_hot_encode_column( ) data = one_hot_encode(column, categories) + if drop_first and len(data): + data.pop(next(iter(data))) if prefix is not None and prefix_sep is not None: data = {f"{prefix}{prefix_sep}{col}": enc for col, enc in data.items()} if dtype: @@ -1545,7 +1556,7 @@ def pivot_table( if values_passed and not values_multi and table._data.multiindex: column_names = table._data.level_names[1:] table_columns = tuple( - map(lambda column: column[1:], table._data.names) + map(lambda column: column[1:], table._column_names) ) table.columns = pd.MultiIndex.from_tuples( tuples=table_columns, names=column_names diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 7197560b5a4..68f34fa28ff 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -186,7 +186,7 @@ def to_datetime( if isinstance(arg, cudf.DataFrame): # we require at least Ymd required = ["year", "month", "day"] - req = list(set(required) - set(arg._data.names)) + req = list(set(required) - set(arg._column_names)) if len(req): err_req = ",".join(req) raise ValueError( @@ -196,7 +196,7 @@ def to_datetime( ) # replace passed column name with values in _unit_map - got_units = {k: get_units(k) for k in arg._data.names} + got_units = {k: get_units(k) for k in arg._column_names} unit_rev = {v: k for k, v in got_units.items()} # keys we don't recognize diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py index 265b87350ae..3af662b62ea 100644 --- a/python/cudf/cudf/core/udf/groupby_utils.py +++ b/python/cudf/cudf/core/udf/groupby_utils.py @@ -210,7 +210,7 @@ def _can_be_jitted(frame, func, args): # See https://github.com/numba/numba/issues/4587 return False - if any(col.has_nulls() for col in frame._data.values()): + if any(col.has_nulls() for col in frame._columns): return False np_field_types = np.dtype( list( diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py index 6d7362952c9..bfe716f0afc 100644 --- a/python/cudf/cudf/core/udf/utils.py +++ b/python/cudf/cudf/core/udf/utils.py @@ -126,25 +126,23 @@ def _get_udf_return_type(argty, func: Callable, args=()): def _all_dtypes_from_frame(frame, supported_types=JIT_SUPPORTED_TYPES): return { - colname: col.dtype - if str(col.dtype) in supported_types - else np.dtype("O") - for colname, col in frame._data.items() + colname: dtype if str(dtype) in supported_types else np.dtype("O") + for colname, dtype in frame._dtypes } def _supported_dtypes_from_frame(frame, supported_types=JIT_SUPPORTED_TYPES): return { - colname: col.dtype - for colname, col in frame._data.items() - if str(col.dtype) in supported_types + colname: dtype + for colname, dtype in frame._dtypes + if str(dtype) in supported_types } def _supported_cols_from_frame(frame, supported_types=JIT_SUPPORTED_TYPES): return { colname: col - for colname, col in frame._data.items() + for colname, col in frame._column_labels_and_values if str(col.dtype) in supported_types } @@ -232,8 +230,8 @@ def _generate_cache_key(frame, func: Callable, args, suffix="__APPLY_UDF"): *cudautils.make_cache_key( func, tuple(_all_dtypes_from_frame(frame).values()) ), - *(col.mask is None for col in frame._data.values()), - *frame._data.keys(), + *(col.mask is None for col in frame._columns), + *frame._column_names, scalar_argtypes, suffix, ) diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index 
a9c20150930..3dc8915bfd1 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -186,13 +186,13 @@ def to_csv( "Dataframe doesn't have the labels provided in columns" ) - for col in df._data.columns: - if isinstance(col, cudf.core.column.ListColumn): + for _, dtype in df._dtypes: + if isinstance(dtype, cudf.ListDtype): raise NotImplementedError( "Writing to csv format is not yet supported with " "list columns." ) - elif isinstance(col, cudf.core.column.StructColumn): + elif isinstance(dtype, cudf.StructDtype): raise NotImplementedError( "Writing to csv format is not yet supported with " "Struct columns." @@ -203,12 +203,11 @@ def to_csv( # workaround once following issue is fixed: # https://github.com/rapidsai/cudf/issues/6661 if any( - isinstance(col, cudf.core.column.CategoricalColumn) - for col in df._data.columns + isinstance(dtype, cudf.CategoricalDtype) for _, dtype in df._dtypes ) or isinstance(df.index, cudf.CategoricalIndex): df = df.copy(deep=False) - for col_name, col in df._data.items(): - if isinstance(col, cudf.core.column.CategoricalColumn): + for col_name, col in df._column_labels_and_values: + if isinstance(col.dtype, cudf.CategoricalDtype): df._data[col_name] = col.astype(col.categories.dtype) if isinstance(df.index, cudf.CategoricalIndex): diff --git a/python/cudf/cudf/io/dlpack.py b/python/cudf/cudf/io/dlpack.py index 1347b2cc38f..fe8e446f9c0 100644 --- a/python/cudf/cudf/io/dlpack.py +++ b/python/cudf/cudf/io/dlpack.py @@ -79,13 +79,13 @@ def to_dlpack(cudf_obj): ) if any( - not cudf.api.types._is_non_decimal_numeric_dtype(col.dtype) - for col in gdf._data.columns + not cudf.api.types._is_non_decimal_numeric_dtype(dtype) + for _, dtype in gdf._dtypes ): raise TypeError("non-numeric data not yet supported") dtype = cudf.utils.dtypes.find_common_type( - [col.dtype for col in gdf._data.columns] + [dtype for _, dtype in gdf._dtypes] ) gdf = gdf.astype(dtype) diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index fd246c6215f..c54293badbe 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -396,8 +396,8 @@ def to_orc( ): """{docstring}""" - for col in df._data.columns: - if isinstance(col, cudf.core.column.CategoricalColumn): + for _, dtype in df._dtypes: + if isinstance(dtype, cudf.CategoricalDtype): raise NotImplementedError( "Writing to ORC format is not yet supported with " "Categorical columns." diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 62be7378e9e..ce99f98b559 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -64,7 +64,7 @@ def _write_parquet( statistics="ROWGROUP", metadata_file_path=None, int96_timestamps=False, - row_group_size_bytes=ioutils._ROW_GROUP_SIZE_BYTES_DEFAULT, + row_group_size_bytes=None, row_group_size_rows=None, max_page_size_bytes=None, max_page_size_rows=None, @@ -149,7 +149,7 @@ def write_to_dataset( return_metadata=False, statistics="ROWGROUP", int96_timestamps=False, - row_group_size_bytes=ioutils._ROW_GROUP_SIZE_BYTES_DEFAULT, + row_group_size_bytes=None, row_group_size_rows=None, max_page_size_bytes=None, max_page_size_rows=None, @@ -205,7 +205,7 @@ def write_to_dataset( If ``False``, timestamps will not be altered. row_group_size_bytes: integer or None, default None Maximum size of each stripe of the output. - If None, 134217728 (128MB) will be used. + If None, no limit on row group stripe size will be used. 
row_group_size_rows: integer or None, default None Maximum number of rows of each stripe of the output. If None, 1000000 will be used. @@ -980,7 +980,7 @@ def to_parquet( statistics="ROWGROUP", metadata_file_path=None, int96_timestamps=False, - row_group_size_bytes=ioutils._ROW_GROUP_SIZE_BYTES_DEFAULT, + row_group_size_bytes=None, row_group_size_rows=None, max_page_size_bytes=None, max_page_size_rows=None, diff --git a/python/cudf/cudf/pandas/__main__.py b/python/cudf/cudf/pandas/__main__.py index 3a82829eb7a..e0d3d9101a9 100644 --- a/python/cudf/cudf/pandas/__main__.py +++ b/python/cudf/cudf/pandas/__main__.py @@ -10,6 +10,7 @@ """ import argparse +import code import runpy import sys import tempfile @@ -21,6 +22,8 @@ @contextmanager def profile(function_profile, line_profile, fn): + if fn is None and (line_profile or function_profile): + raise RuntimeError("Enabling the profiler requires a script name.") if line_profile: with open(fn) as f: lines = f.readlines() @@ -54,6 +57,11 @@ def main(): dest="module", nargs=1, ) + parser.add_argument( + "-c", + dest="cmd", + nargs=1, + ) parser.add_argument( "--profile", action="store_true", @@ -72,9 +80,18 @@ def main(): args = parser.parse_args() + if args.cmd: + f = tempfile.NamedTemporaryFile(mode="w+b", suffix=".py") + f.write(args.cmd[0].encode()) + f.seek(0) + args.args.insert(0, f.name) + install() - with profile(args.profile, args.line_profile, args.args[0]) as fn: - args.args[0] = fn + + script_name = args.args[0] if len(args.args) > 0 else None + with profile(args.profile, args.line_profile, script_name) as fn: + if script_name is not None: + args.args[0] = fn if args.module: (module,) = args.module # run the module passing the remaining arguments @@ -85,6 +102,21 @@ def main(): # Remove ourself from argv and continue sys.argv[:] = args.args runpy.run_path(args.args[0], run_name="__main__") + else: + if sys.stdin.isatty(): + banner = f"Python {sys.version} on {sys.platform}" + site_import = not sys.flags.no_site + if site_import: + cprt = 'Type "help", "copyright", "credits" or "license" for more information.' + banner += "\n" + cprt + else: + # Don't show prompts or banners if stdin is not a TTY + sys.ps1 = "" + sys.ps2 = "" + banner = "" + + # Launch an interactive interpreter + code.interact(banner=banner, exitmsg="") if __name__ == "__main__": diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index afa1ce5f86c..0c1cda8810b 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -881,6 +881,26 @@ def _assert_fast_slow_eq(left, right): assert_eq(left, right) +class ProxyFallbackError(Exception): + """Raised when fallback occurs""" + + pass + + +def _fast_function_call(): + """ + Placeholder fast function for pytest profiling purposes. + """ + return None + + +def _slow_function_call(): + """ + Placeholder slow function for pytest profiling purposes. + """ + return None + + def _fast_slow_function_call( func: Callable, /, @@ -910,6 +930,7 @@ def _fast_slow_function_call( # try slow path raise Exception() fast = True + _fast_function_call() if _env_get_bool("CUDF_PANDAS_DEBUGGING", False): try: with nvtx.annotate( @@ -942,6 +963,10 @@ def _fast_slow_function_call( f"The exception was {e}." ) except Exception as err: + if _env_get_bool("CUDF_PANDAS_FAIL_ON_FALLBACK", False): + raise ProxyFallbackError( + f"The operation failed with cuDF, the reason was {type(err)}: {err}." 
+ ) from err with nvtx.annotate( "EXECUTE_SLOW", color=_CUDF_PANDAS_NVTX_COLORS["EXECUTE_SLOW"], @@ -952,6 +977,7 @@ def _fast_slow_function_call( from ._logger import log_fallback log_fallback(slow_args, slow_kwargs, err) + _slow_function_call() with disable_module_accelerator(): result = func(*slow_args, **slow_kwargs) return _maybe_wrap_result(result, func, *args, **kwargs), fast diff --git a/python/cudf/cudf/pandas/scripts/conftest-patch.py b/python/cudf/cudf/pandas/scripts/conftest-patch.py index 505a40b0bfa..d12d2697729 100644 --- a/python/cudf/cudf/pandas/scripts/conftest-patch.py +++ b/python/cudf/cudf/pandas/scripts/conftest-patch.py @@ -1,10 +1,13 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. # SPDX-License-Identifier: Apache-2.0 import contextlib +import json import os import sys +import traceback +from collections import defaultdict from functools import wraps import pytest @@ -36,4 +39,58 @@ def patch_testing_functions(): pytest.raises = replace_kwargs({"match": None})(pytest.raises) +# Dictionary to store function call counts +function_call_counts = {} # type: ignore + +# The specific functions to track +FUNCTION_NAME = {"_slow_function_call", "_fast_function_call"} + + +def find_pytest_file(frame): + stack = traceback.extract_stack() + absolute_paths = [frame.filename for frame in stack] + for file in absolute_paths: + if "pandas-testing/pandas-tests/tests" in file and file.rsplit("/", 1)[ + -1 + ].startswith("test_"): + return str(file).rsplit("pandas-tests/", 1)[-1] + return None + + +def trace_calls(frame, event, arg): + if event != "call": + return + code = frame.f_code + func_name = code.co_name + + if func_name in FUNCTION_NAME: + filename = find_pytest_file(frame) + if filename is None: + return + if filename not in function_call_counts: + function_call_counts[filename] = defaultdict(int) + function_call_counts[filename][func_name] += 1 + + +def pytest_sessionstart(session): + # Set the profile function to trace calls + sys.setprofile(trace_calls) + + +def pytest_sessionfinish(session, exitstatus): + # Remove the profile function + sys.setprofile(None) + + +@pytest.hookimpl(trylast=True) +def pytest_unconfigure(config): + if hasattr(config, "workerinput"): + # Running in xdist worker, write the counts before exiting + worker_id = config.workerinput["workerid"] + output_file = f"function_call_counts_worker_{worker_id}.json" + with open(output_file, "w") as f: + json.dump(function_call_counts, f, indent=4) + print(f"Function call counts have been written to {output_file}") + + sys.path.append(os.path.dirname(__file__)) diff --git a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh index 9c65b74d081..9b9ce026571 100755 --- a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh +++ b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh @@ -64,8 +64,6 @@ markers = [ "skip_ubsan: Tests known to fail UBSAN check", ] EOF - # append the contents of patch-confest.py to conftest.py - cat ../python/cudf/cudf/pandas/scripts/conftest-patch.py >> pandas-tests/conftest.py # Substitute `pandas.tests` with a relative import. 
# This will depend on the location of the test module relative to @@ -137,7 +135,7 @@ and not test_eof_states \ and not test_array_tz" # TODO: Remove "not db" once a postgres & mysql container is set up on the CI -PANDAS_CI="1" timeout 60m python -m pytest -p cudf.pandas \ +PANDAS_CI="1" timeout 90m python -m pytest -p cudf.pandas \ -v -m "not single_cpu and not db" \ -k "$TEST_THAT_NEED_MOTO_SERVER and $TEST_THAT_CRASH_PYTEST_WORKERS and not test_groupby_raises_category_on_category and not test_constructor_no_pandas_array and not test_is_monotonic_na and not test_index_contains and not test_index_contains and not test_frame_op_subclass_nonclass_constructor and not test_round_trip_current" \ --import-mode=importlib \ @@ -146,5 +144,4 @@ PANDAS_CI="1" timeout 60m python -m pytest -p cudf.pandas \ mv *.json .. cd .. - rm -rf pandas-testing/pandas-tests/ diff --git a/python/cudf/cudf/pandas/scripts/summarize-test-results.py b/python/cudf/cudf/pandas/scripts/summarize-test-results.py index ffd2abb960d..4ea0b3b4413 100644 --- a/python/cudf/cudf/pandas/scripts/summarize-test-results.py +++ b/python/cudf/cudf/pandas/scripts/summarize-test-results.py @@ -12,7 +12,9 @@ """ import argparse +import glob import json +import os from rich.console import Console from rich.table import Table @@ -57,6 +59,44 @@ def get_per_module_results(log_file_name): per_module_results[module_name].setdefault(outcome, 0) per_module_results[module_name]["total"] += 1 per_module_results[module_name][outcome] += 1 + + directory = os.path.dirname(log_file_name) + pattern = os.path.join(directory, "function_call_counts_worker_*.json") + matching_files = glob.glob(pattern) + function_call_counts = {} + + for file in matching_files: + with open(file) as f: + function_call_count = json.load(f) + if not function_call_counts: + function_call_counts.update(function_call_count) + else: + for key, value in function_call_count.items(): + if key not in function_call_counts: + function_call_counts[key] = value + else: + if "_slow_function_call" not in function_call_counts[key]: + function_call_counts[key]["_slow_function_call"] = 0 + if "_fast_function_call" not in function_call_counts[key]: + function_call_counts[key]["_fast_function_call"] = 0 + function_call_counts[key]["_slow_function_call"] += ( + value.get("_slow_function_call", 0) + ) + function_call_counts[key]["_fast_function_call"] += ( + value.get("_fast_function_call", 0) + ) + + for key, value in per_module_results.items(): + if key in function_call_counts: + per_module_results[key]["_slow_function_call"] = ( + function_call_counts[key].get("_slow_function_call", 0) + ) + per_module_results[key]["_fast_function_call"] = ( + function_call_counts[key].get("_fast_function_call", 0) + ) + else: + per_module_results[key]["_slow_function_call"] = 0 + per_module_results[key]["_fast_function_call"] = 0 return per_module_results diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index 31ad24a4664..668e7a77454 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -676,7 +676,7 @@ def assert_frame_equal( if check_like: left, right = left.reindex(index=right.index), right - right = right[list(left._data.names)] + right = right[list(left._column_names)] # index comparison assert_index_equal( diff --git a/python/cudf/cudf/tests/pytest.ini b/python/cudf/cudf/tests/pytest.ini index 2136bca0e28..d05ba9aaacc 100644 --- a/python/cudf/cudf/tests/pytest.ini +++ b/python/cudf/cudf/tests/pytest.ini @@ -14,3 +14,6 @@ 
filterwarnings = ignore:Passing a BlockManager to DataFrame is deprecated:DeprecationWarning # PerformanceWarning from cupy warming up the JIT cache ignore:Jitify is performing a one-time only warm-up to populate the persistent cache:cupy._util.PerformanceWarning + # Ignore numba PEP 456 warning specific to arm machines + ignore:FNV hashing is not implemented in Numba.*:UserWarning +addopts = --tb=native diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index f4d1578bda7..6f88d942746 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -515,6 +515,17 @@ def test_dataframe_drop_columns(pdf, columns, inplace): assert_eq(expected, actual) +@pytest.mark.parametrize("obj", ["Index", "Series"]) +def test_drop_cudf_obj_columns(obj): + pdf = pd.DataFrame({"A": [1], "B": [1]}) + gdf = cudf.from_pandas(pdf) + + columns = ["B"] + expected = pdf.drop(labels=getattr(pd, obj)(columns), axis=1) + actual = gdf.drop(columns=getattr(cudf, obj)(columns), axis=1) + assert_eq(expected, actual) + + @pytest.mark.parametrize( "pdf", [ diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index b1e095e8853..c41be3e4428 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -813,8 +813,8 @@ def test_multiindex_copy_deep(data, copy_on_write, deep): mi1 = gdf.groupby(["Date", "Symbol"]).mean().index mi2 = mi1.copy(deep=deep) - lchildren = [col.children for _, col in mi1._data.items()] - rchildren = [col.children for _, col in mi2._data.items()] + lchildren = [col.children for col in mi1._columns] + rchildren = [col.children for col in mi2._columns] # Flatten lchildren = reduce(operator.add, lchildren) @@ -849,12 +849,8 @@ def test_multiindex_copy_deep(data, copy_on_write, deep): assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs)) # Assert ._data identity - lptrs = [ - d.base_data.get_ptr(mode="read") for _, d in mi1._data.items() - ] - rptrs = [ - d.base_data.get_ptr(mode="read") for _, d in mi2._data.items() - ] + lptrs = [d.base_data.get_ptr(mode="read") for d in mi1._columns] + rptrs = [d.base_data.get_ptr(mode="read") for d in mi2._columns] assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs)) cudf.set_option("copy_on_write", original_cow_setting) diff --git a/python/cudf/cudf/tests/test_onehot.py b/python/cudf/cudf/tests/test_onehot.py index cc17dc46e0a..e054143b438 100644 --- a/python/cudf/cudf/tests/test_onehot.py +++ b/python/cudf/cudf/tests/test_onehot.py @@ -161,3 +161,20 @@ def test_get_dummies_cats_deprecated(): df = cudf.DataFrame(range(3)) with pytest.warns(FutureWarning): cudf.get_dummies(df, cats={0: [0, 1, 2]}) + + +def test_get_dummies_drop_first_series(): + result = cudf.get_dummies(cudf.Series(list("abcaa")), drop_first=True) + expected = pd.get_dummies(pd.Series(list("abcaa")), drop_first=True) + assert_eq(result, expected) + + +def test_get_dummies_drop_first_dataframe(): + result = cudf.get_dummies( + cudf.DataFrame({"A": list("abcaa"), "B": list("bcaab")}), + drop_first=True, + ) + expected = pd.get_dummies( + pd.DataFrame({"A": list("abcaa"), "B": list("bcaab")}), drop_first=True + ) + assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 8b59a7eef08..7f1b0b1cd46 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -3822,8 +3822,8 @@ def 
test_parquet_reader_with_mismatched_tables(store_schema): df1 = cudf.DataFrame( { "i32": cudf.Series([None, None, None], dtype="int32"), - "i64": cudf.Series([1234, None, 123], dtype="int64"), - "list": list([[1, 2], [None, 4], [5, 6]]), + "i64": cudf.Series([1234, 467, 123], dtype="int64"), + "list": list([[1, 2], None, [None, 6]]), "time": cudf.Series([1234, 123, 4123], dtype="datetime64[ms]"), "str": ["vfd", None, "ghu"], "d_list": list( @@ -3838,14 +3838,14 @@ def test_parquet_reader_with_mismatched_tables(store_schema): df2 = cudf.DataFrame( { - "str": ["abc", "def", None], + "str": ["abc", "def", "ghi"], "i64": cudf.Series([None, 65, 98], dtype="int64"), "times": cudf.Series([1234, None, 4123], dtype="datetime64[us]"), - "list": list([[7, 8], [9, 10], [None, 12]]), + "list": list([[7, 8], [9, 10], [11, 12]]), "d_list": list( [ [pd.Timedelta(minutes=4), None], - [None, None], + None, [pd.Timedelta(minutes=6), None], ] ), @@ -3900,38 +3900,27 @@ def test_parquet_reader_with_mismatched_structs(): { "a": 1, "b": { - "inner_a": 10, - "inner_b": {"inner_inner_b": 1, "inner_inner_a": 2}, + "a_a": 10, + "b_b": {"b_b_b": 1, "b_b_a": 2}, }, "c": 2, }, { "a": 3, - "b": {"inner_a": 30, "inner_b": {"inner_inner_a": 210}}, + "b": {"b_a": 30, "b_b": {"b_b_a": 210}}, "c": 4, }, - {"a": 5, "b": {"inner_a": 50, "inner_b": None}, "c": 6}, + {"a": 5, "b": {"b_a": 50, "b_b": None}, "c": 6}, {"a": 7, "b": None, "c": 8}, - {"a": None, "b": {"inner_a": None, "inner_b": None}, "c": None}, - None, - { - "a": None, - "b": { - "inner_a": None, - "inner_b": {"inner_inner_b": None, "inner_inner_a": 10}, - }, - "c": 10, - }, + {"a": 5, "b": {"b_a": None, "b_b": None}, "c": None}, ] data2 = [ - {"a": 1, "b": {"inner_b": {"inner_inner_a": None}}}, - {"a": 3, "b": {"inner_b": {"inner_inner_a": 1}}}, - {"a": 5, "b": {"inner_b": None}}, - {"a": 7, "b": {"inner_b": {"inner_inner_b": 1, "inner_inner_a": 0}}}, - {"a": None, "b": {"inner_b": None}}, + {"a": 1, "b": {"b_b": {"b_b_a": None}}}, + {"a": 5, "b": {"b_b": None}}, + {"a": 7, "b": {"b_b": {"b_b_b": 1, "b_b_a": 0}}}, + {"a": None, "b": {"b_b": None}}, None, - {"a": None, "b": {"inner_b": {"inner_inner_a": 1}}}, ] # cuDF tables from struct data @@ -3949,20 +3938,20 @@ def test_parquet_reader_with_mismatched_structs(): # Read the struct.b.inner_b.inner_inner_a column from parquet got = cudf.read_parquet( [buf1, buf2], - columns=["struct.b.inner_b.inner_inner_a"], + columns=["struct.b.b_b.b_b_a"], allow_mismatched_pq_schemas=True, ) got = ( cudf.Series(got["struct"]) .struct.field("b") - .struct.field("inner_b") - .struct.field("inner_inner_a") + .struct.field("b_b") + .struct.field("b_b_a") ) # Read with chunked reader got_chunked = read_parquet_chunked( [buf1, buf2], - columns=["struct.b.inner_b.inner_inner_a"], + columns=["struct.b.b_b.b_b_a"], chunk_read_limit=240, pass_read_limit=240, allow_mismatched_pq_schemas=True, @@ -3970,8 +3959,8 @@ def test_parquet_reader_with_mismatched_structs(): got_chunked = ( cudf.Series(got_chunked["struct"]) .struct.field("b") - .struct.field("inner_b") - .struct.field("inner_inner_a") + .struct.field("b_b") + .struct.field("b_b_a") ) # Construct the expected series @@ -3979,12 +3968,12 @@ def test_parquet_reader_with_mismatched_structs(): [ cudf.Series(df1["struct"]) .struct.field("b") - .struct.field("inner_b") - .struct.field("inner_inner_a"), + .struct.field("b_b") + .struct.field("b_b_a"), cudf.Series(df2["struct"]) .struct.field("b") - .struct.field("inner_b") - .struct.field("inner_inner_a"), + .struct.field("b_b") + 
.struct.field("b_b_a"), ] ).reset_index(drop=True) @@ -4023,12 +4012,12 @@ def test_parquet_reader_with_mismatched_schemas_error(): ) data1 = [ - {"a": 1, "b": {"inner_a": 1, "inner_b": 6}}, - {"a": 3, "b": {"inner_a": None, "inner_b": 2}}, + {"a": 1, "b": {"b_a": 1, "b_b": 6}}, + {"a": 3, "b": {"b_a": None, "b_b": 2}}, ] data2 = [ - {"b": {"inner_a": 1}, "c": "str"}, - {"b": {"inner_a": None}, "c": None}, + {"b": {"b_a": 1}, "c": "str"}, + {"b": {"b_a": None}, "c": None}, ] # cuDF tables from struct data @@ -4059,6 +4048,191 @@ def test_parquet_reader_with_mismatched_schemas_error(): ): cudf.read_parquet( [buf1, buf2], - columns=["struct.b.inner_b"], + columns=["struct.b.b_b"], allow_mismatched_pq_schemas=True, ) + + +def test_parquet_reader_mismatched_nullability(): + # Ensure that we can faithfully read the tables with mismatched nullabilities + df1 = cudf.DataFrame( + { + "timedelta": cudf.Series([12, 54, 1231], dtype="timedelta64[ms]"), + "duration_list": list( + [ + [ + [ + [pd.Timedelta(minutes=1), pd.Timedelta(minutes=2)], + None, + [pd.Timedelta(minutes=8), None], + ], + None, + ], + None, + [ + [ + [pd.Timedelta(minutes=1), pd.Timedelta(minutes=2)], + [pd.Timedelta(minutes=5), pd.Timedelta(minutes=3)], + [pd.Timedelta(minutes=8), pd.Timedelta(minutes=4)], + ] + ], + ] + ), + "int64": cudf.Series([1234, None, 4123], dtype="int64"), + "int32": cudf.Series([1234, 123, 4123], dtype="int32"), + "list": list([[1, 2], [1, 2], [1, 2]]), + "datetime": cudf.Series([1234, 123, 4123], dtype="datetime64[ms]"), + "string": cudf.Series(["kitten", "puppy", "cub"]), + } + ) + + df2 = cudf.DataFrame( + { + "timedelta": cudf.Series( + [None, None, None], dtype="timedelta64[ms]" + ), + "duration_list": list( + [ + [ + [ + [pd.Timedelta(minutes=1), pd.Timedelta(minutes=2)], + [pd.Timedelta(minutes=8), pd.Timedelta(minutes=1)], + ], + ], + [ + [ + [pd.Timedelta(minutes=1), pd.Timedelta(minutes=2)], + [pd.Timedelta(minutes=5), pd.Timedelta(minutes=3)], + [pd.Timedelta(minutes=8), pd.Timedelta(minutes=4)], + ] + ], + [ + [ + [pd.Timedelta(minutes=1), pd.Timedelta(minutes=2)], + [pd.Timedelta(minutes=5), pd.Timedelta(minutes=3)], + [pd.Timedelta(minutes=8), pd.Timedelta(minutes=4)], + ] + ], + ] + ), + "int64": cudf.Series([1234, 123, 4123], dtype="int64"), + "int32": cudf.Series([1234, None, 4123], dtype="int32"), + "list": list([[1, 2], None, [1, 2]]), + "datetime": cudf.Series( + [1234, None, 4123], dtype="datetime64[ms]" + ), + "string": cudf.Series(["kitten", None, "cub"]), + } + ) + + # Write tables to parquet with arrow schema for compatibility for duration column(s) + fname1 = BytesIO() + df1.to_parquet(fname1, store_schema=True) + fname2 = BytesIO() + df2.to_parquet(fname2, store_schema=True) + + # Read tables back with cudf and arrow in either order and compare + assert_eq( + cudf.read_parquet([fname1, fname2]), + cudf.concat([df1, df2]).reset_index(drop=True), + ) + assert_eq( + cudf.read_parquet([fname2, fname1]), + cudf.concat([df2, df1]).reset_index(drop=True), + ) + + +def test_parquet_reader_mismatched_nullability_structs(tmpdir): + data1 = [ + { + "a": "a", + "b": { + "b_a": 10, + "b_b": {"b_b_b": 1, "b_b_a": 12}, + }, + "c": [1, 2], + }, + { + "a": "b", + "b": { + "b_a": 30, + "b_b": {"b_b_b": 2, "b_b_a": 2}, + }, + "c": [3, 4], + }, + { + "a": "c", + "b": { + "b_a": 50, + "b_b": {"b_b_b": 4, "b_b_a": 5}, + }, + "c": [5, 6], + }, + { + "a": "d", + "b": { + "b_a": 135, + "b_b": {"b_b_b": 12, "b_b_a": 32}, + }, + "c": [7, 8], + }, + { + "a": "e", + "b": { + "b_a": 1, + "b_b": {"b_b_b": 1, 
"b_b_a": 5}, + }, + "c": [9, 10], + }, + { + "a": "f", + "b": { + "b_a": 32, + "b_b": {"b_b_b": 1, "b_b_a": 6}, + }, + "c": [11, 12], + }, + ] + + data2 = [ + { + "a": "g", + "b": { + "b_a": 10, + "b_b": {"b_b_b": None, "b_b_a": 2}, + }, + "c": None, + }, + {"a": None, "b": {"b_a": None, "b_b": None}, "c": [15, 16]}, + {"a": "j", "b": None, "c": [8, 10]}, + {"a": None, "b": {"b_a": None, "b_b": None}, "c": None}, + None, + { + "a": None, + "b": {"b_a": None, "b_b": {"b_b_b": 1}}, + "c": [18, 19], + }, + {"a": None, "b": None, "c": None}, + ] + + pa_table1 = pa.Table.from_pydict({"struct": data1}) + df1 = cudf.DataFrame.from_arrow(pa_table1) + + pa_table2 = pa.Table.from_pydict({"struct": data2}) + df2 = cudf.DataFrame.from_arrow(pa_table2) + + # Write tables to parquet + buf1 = BytesIO() + df1.to_parquet(buf1) + buf2 = BytesIO() + df2.to_parquet(buf2) + + # Read tables back with cudf and compare with expected. + assert_eq( + cudf.read_parquet([buf1, buf2]), + cudf.concat([df1, df2]).reset_index(drop=True), + ) + assert_eq( + cudf.read_parquet([buf2, buf1]), + cudf.concat([df2, df1]).reset_index(drop=True), + ) diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index 52179f55da3..997ca357986 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -946,6 +946,66 @@ def test_minhash(): strings.str.minhash64(seeds=seeds) +def test_word_minhash(): + ls = cudf.Series([["this", "is", "my"], ["favorite", "book"]]) + + expected = cudf.Series( + [ + cudf.Series([21141582], dtype=np.uint32), + cudf.Series([962346254], dtype=np.uint32), + ] + ) + actual = ls.str.word_minhash() + assert_eq(expected, actual) + seeds = cudf.Series([0, 1, 2], dtype=np.uint32) + expected = cudf.Series( + [ + cudf.Series([21141582, 1232889953, 1268336794], dtype=np.uint32), + cudf.Series([962346254, 2321233602, 1354839212], dtype=np.uint32), + ] + ) + actual = ls.str.word_minhash(seeds=seeds) + assert_eq(expected, actual) + + expected = cudf.Series( + [ + cudf.Series([2603139454418834912], dtype=np.uint64), + cudf.Series([5240044617220523711], dtype=np.uint64), + ] + ) + actual = ls.str.word_minhash64() + assert_eq(expected, actual) + seeds = cudf.Series([0, 1, 2], dtype=np.uint64) + expected = cudf.Series( + [ + cudf.Series( + [ + 2603139454418834912, + 8644371945174847701, + 5541030711534384340, + ], + dtype=np.uint64, + ), + cudf.Series( + [5240044617220523711, 5847101123925041457, 153762819128779913], + dtype=np.uint64, + ), + ] + ) + actual = ls.str.word_minhash64(seeds=seeds) + assert_eq(expected, actual) + + # test wrong seed types + with pytest.raises(ValueError): + ls.str.word_minhash(seeds="a") + with pytest.raises(ValueError): + seeds = cudf.Series([0, 1, 2], dtype=np.int32) + ls.str.word_minhash(seeds=seeds) + with pytest.raises(ValueError): + seeds = cudf.Series([0, 1, 2], dtype=np.uint32) + ls.str.word_minhash64(seeds=seeds) + + def test_jaccard_index(): str1 = cudf.Series(["the brown dog", "jumped about"]) str2 = cudf.Series(["the black cat", "jumped around"]) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 1627107b57d..1180da321e6 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -27,7 +27,7 @@ fsspec_parquet = None _BYTES_PER_THREAD_DEFAULT = 256 * 1024 * 1024 -_ROW_GROUP_SIZE_BYTES_DEFAULT = 128 * 1024 * 1024 +_ROW_GROUP_SIZE_BYTES_DEFAULT = np.iinfo(np.uint64).max _docstring_remote_sources = """ - cuDF 
supports local and remote data stores. See configuration details for @@ -275,10 +275,9 @@ timestamp[us] to the int96 format, which is the number of Julian days and the number of nanoseconds since midnight of 1970-01-01. If ``False``, timestamps will not be altered. -row_group_size_bytes: integer, default {row_group_size_bytes_val} +row_group_size_bytes: integer, default None Maximum size of each stripe of the output. - If None, {row_group_size_bytes_val} - ({row_group_size_bytes_val_in_mb} MB) will be used. + If None, no limit on row group stripe size will be used. row_group_size_rows: integer or None, default None Maximum number of rows of each stripe of the output. If None, 1000000 will be used. @@ -346,10 +345,7 @@ See Also -------- cudf.read_parquet -""".format( - row_group_size_bytes_val=_ROW_GROUP_SIZE_BYTES_DEFAULT, - row_group_size_bytes_val_in_mb=_ROW_GROUP_SIZE_BYTES_DEFAULT / 1024 / 1024, -) +""" doc_to_parquet = docfmt_partial(docstring=_docstring_to_parquet) _docstring_merge_parquet_filemetadata = """ diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index c4ab4b0a853..2bbed40e34e 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -26,7 +26,11 @@ from cudf.core._compat import PANDAS_GE_220 from cudf.pandas import LOADED, Profiler -from cudf.pandas.fast_slow_proxy import _Unusable, is_proxy_object +from cudf.pandas.fast_slow_proxy import ( + ProxyFallbackError, + _Unusable, + is_proxy_object, +) from cudf.testing import assert_eq if not LOADED: @@ -1738,3 +1742,13 @@ def add_one_ufunc(a): return a + 1 assert_eq(cp.asarray(add_one_ufunc(arr1)), cp.asarray(add_one_ufunc(arr2))) + + +@pytest.mark.xfail( + reason="Fallback expected because casting to object is not supported", +) +def test_fallback_raises_error(monkeypatch): + with monkeypatch.context() as monkeycontext: + monkeycontext.setenv("CUDF_PANDAS_FAIL_ON_FALLBACK", "True") + with pytest.raises(ProxyFallbackError): + pd.Series(range(2)).astype(object) diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas_no_fallback.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas_no_fallback.py new file mode 100644 index 00000000000..896256bf6d7 --- /dev/null +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas_no_fallback.py @@ -0,0 +1,100 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +import pytest + +from cudf.pandas import LOADED + +if not LOADED: + raise ImportError("These tests must be run with cudf.pandas loaded") + +import numpy as np +import pandas as pd + + +@pytest.fixture(autouse=True) +def fail_on_fallback(monkeypatch): + monkeypatch.setenv("CUDF_PANDAS_FAIL_ON_FALLBACK", "True") + + +@pytest.fixture +def dataframe(): + df = pd.DataFrame( + { + "a": [1, 1, 1, 2, 3], + "b": [1, 2, 3, 4, 5], + "c": [1.2, 1.3, 1.5, 1.7, 1.11], + } + ) + return df + + +@pytest.fixture +def series(dataframe): + return dataframe["a"] + + +@pytest.fixture +def array(series): + return series.values + + +@pytest.mark.parametrize( + "op", + [ + "sum", + "min", + "max", + "mean", + "std", + "var", + "prod", + "median", + ], +) +def test_no_fallback_in_reduction_ops(series, op): + s = series + getattr(s, op)() + + +def test_groupby(dataframe): + df = dataframe + df.groupby("a", sort=True).max() + + +def test_no_fallback_in_binops(dataframe): + df = dataframe + df + df + df - df + df * df + df**df + df[["a", "b"]] & df[["a", "b"]] + df <= df + + +def test_no_fallback_in_groupby_rolling_sum(dataframe): + df = dataframe + df.groupby("a").rolling(2).sum() + + +def test_no_fallback_in_concat(dataframe): + df = dataframe + pd.concat([df, df]) + + +def test_no_fallback_in_get_shape(dataframe): + df = dataframe + df.shape + + +def test_no_fallback_in_array_ufunc_op(array): + np.add(array, array) + + +def test_no_fallback_in_merge(dataframe): + df = dataframe + pd.merge(df * df, df + df, how="inner") + pd.merge(df * df, df + df, how="outer") + pd.merge(df * df, df + df, how="left") + pd.merge(df * df, df + df, how="right") diff --git a/python/cudf/cudf_pandas_tests/test_main.py b/python/cudf/cudf_pandas_tests/test_main.py new file mode 100644 index 00000000000..326224c8fc0 --- /dev/null +++ b/python/cudf/cudf_pandas_tests/test_main.py @@ -0,0 +1,100 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +import subprocess +import tempfile +import textwrap + + +def _run_python(*, cudf_pandas, command): + executable = "python " + if cudf_pandas: + executable += "-m cudf.pandas " + return subprocess.run( + executable + command, + shell=True, + capture_output=True, + check=True, + text=True, + ) + + +def test_run_cudf_pandas_with_script(): + with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=True) as f: + code = textwrap.dedent( + """ + import pandas as pd + df = pd.DataFrame({'a': [1, 2, 3]}) + print(df['a'].sum()) + """ + ) + f.write(code) + f.flush() + + res = _run_python(cudf_pandas=True, command=f.name) + expect = _run_python(cudf_pandas=False, command=f.name) + + assert res.stdout != "" + assert res.stdout == expect.stdout + + +def test_run_cudf_pandas_with_script_with_cmd_args(): + input_args_and_code = """-c 'import pandas as pd; df = pd.DataFrame({"a": [1, 2, 3]}); print(df["a"].sum())'""" + + res = _run_python(cudf_pandas=True, command=input_args_and_code) + expect = _run_python(cudf_pandas=False, command=input_args_and_code) + + assert res.stdout != "" + assert res.stdout == expect.stdout + + +def test_run_cudf_pandas_with_script_with_cmd_args_check_cudf(): + """Verify that cudf is active with -m cudf.pandas.""" + input_args_and_code = """-c 'import pandas as pd; print(pd)'""" + + res = _run_python(cudf_pandas=True, command=input_args_and_code) + expect = _run_python(cudf_pandas=False, command=input_args_and_code) + + assert "cudf" in res.stdout + assert "cudf" not in expect.stdout + + +def test_cudf_pandas_script_repl(): + def start_repl_process(cmd): + return subprocess.Popen( + cmd.split(), + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + text=True, + ) + + def get_repl_output(process, commands): + for command in commands: + process.stdin.write(command) + process.stdin.flush() + return process.communicate() + + p1 = start_repl_process("python -m cudf.pandas") + p2 = start_repl_process("python") + commands = [ + "import pandas as pd\n", + "print(pd.Series(range(2)).sum())\n", + "print(pd.Series(range(5)).sum())\n", + "import sys\n", + "print(pd.Series(list('abcd')), out=sys.stderr)\n", + ] + + res = get_repl_output(p1, commands) + expect = get_repl_output(p2, commands) + + # Check stdout + assert res[0] != "" + assert res[0] == expect[0] + + # Check stderr + assert res[1] != "" + assert res[1] == expect[1] + + p1.kill() + p2.kill() diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/pytest.ini b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/pytest.ini index 817d98e6ba2..98459035298 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/pytest.ini +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/pytest.ini @@ -1,3 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + [pytest] xfail_strict=true markers= @@ -5,3 +7,4 @@ markers= xfail_gold: this test is expected to fail in the gold pass xfail_cudf_pandas: this test is expected to fail in the cudf_pandas pass xfail_compare: this test is expected to fail in the comparison pass +addopts = --tb=native diff --git a/python/cudf_kafka/cudf_kafka/tests/pytest.ini b/python/cudf_kafka/cudf_kafka/tests/pytest.ini new file mode 100644 index 00000000000..7b0a9f29fb1 --- /dev/null +++ b/python/cudf_kafka/cudf_kafka/tests/pytest.ini @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +[pytest] +addopts = --tb=native diff --git a/python/cudf_polars/cudf_polars/__init__.py b/python/cudf_polars/cudf_polars/__init__.py index 41d06f8631b..66c15f694ee 100644 --- a/python/cudf_polars/cudf_polars/__init__.py +++ b/python/cudf_polars/cudf_polars/__init__.py @@ -14,6 +14,12 @@ from cudf_polars.callback import execute_with_cudf from cudf_polars.dsl.translate import translate_ir +# Check we have a supported polars version +from cudf_polars.utils.versions import _ensure_polars_version + +_ensure_polars_version() +del _ensure_polars_version + __all__: list[str] = [ "execute_with_cudf", "translate_ir", diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py index f31193aa938..76816ee0a61 100644 --- a/python/cudf_polars/cudf_polars/callback.py +++ b/python/cudf_polars/cudf_polars/callback.py @@ -5,19 +5,26 @@ from __future__ import annotations +import contextlib import os import warnings -from functools import partial +from functools import cache, partial from typing import TYPE_CHECKING import nvtx -from polars.exceptions import PerformanceWarning +from polars.exceptions import ComputeError, PerformanceWarning + +import rmm +from rmm._cuda import gpu from cudf_polars.dsl.translate import translate_ir if TYPE_CHECKING: + from collections.abc import Generator + import polars as pl + from polars import GPUEngine from cudf_polars.dsl.ir import IR from cudf_polars.typing import NodeTraverser @@ -25,23 +32,126 @@ __all__: list[str] = ["execute_with_cudf"] +@cache +def default_memory_resource(device: int) -> rmm.mr.DeviceMemoryResource: + """ + Return the default memory resource for cudf-polars. + + Parameters + ---------- + device + Disambiguating device id when selecting the device. Must be + the active device when this function is called. + + Returns + ------- + rmm.mr.DeviceMemoryResource + The default memory resource that cudf-polars uses. Currently + an async pool resource. + """ + try: + return rmm.mr.CudaAsyncMemoryResource() + except RuntimeError as e: # pragma: no cover + msg, *_ = e.args + if ( + msg.startswith("RMM failure") + and msg.find("not supported with this CUDA driver/runtime version") > -1 + ): + raise ComputeError( + "GPU engine requested, but incorrect cudf-polars package installed. " + "If your system has a CUDA 11 driver, please uninstall `cudf-polars-cu12` " + "and install `cudf-polars-cu11`" + ) from None + else: + raise + + +@contextlib.contextmanager +def set_memory_resource( + mr: rmm.mr.DeviceMemoryResource | None, +) -> Generator[rmm.mr.DeviceMemoryResource, None, None]: + """ + Set the current memory resource for an execution block. + + Parameters + ---------- + mr + Memory resource to use. If `None`, calls :func:`default_memory_resource` + to obtain an mr on the currently active device. + + Returns + ------- + Memory resource used. + + Notes + ----- + At exit, the memory resource is restored to whatever was current + at entry. If a memory resource is provided, it must be valid to + use with the currently active device. + """ + if mr is None: + device: int = gpu.getDevice() + mr = default_memory_resource(device) + previous = rmm.mr.get_current_device_resource() + rmm.mr.set_current_device_resource(mr) + try: + yield mr + finally: + rmm.mr.set_current_device_resource(previous) + + +@contextlib.contextmanager +def set_device(device: int | None) -> Generator[int, None, None]: + """ + Set the device the query is executed on. + + Parameters + ---------- + device + Device to use. If `None`, uses the current device. 
+ + Returns + ------- + Device active for the execution of the block. + + Notes + ----- + At exit, the device is restored to whatever was current at entry. + """ + previous: int = gpu.getDevice() + if device is not None: + gpu.setDevice(device) + try: + yield previous + finally: + gpu.setDevice(previous) + + def _callback( ir: IR, with_columns: list[str] | None, pyarrow_predicate: str | None, n_rows: int | None, + *, + device: int | None, + memory_resource: int | None, ) -> pl.DataFrame: assert with_columns is None assert pyarrow_predicate is None assert n_rows is None - with nvtx.annotate(message="ExecuteIR", domain="cudf_polars"): + with ( + nvtx.annotate(message="ExecuteIR", domain="cudf_polars"), + # Device must be set before memory resource is obtained. + set_device(device), + set_memory_resource(memory_resource), + ): return ir.evaluate(cache={}).to_polars() def execute_with_cudf( nt: NodeTraverser, *, - raise_on_fail: bool = False, + config: GPUEngine, exception: type[Exception] | tuple[type[Exception], ...] = Exception, ) -> None: """ @@ -52,9 +162,8 @@ def execute_with_cudf( nt NodeTraverser - raise_on_fail - Should conversion raise an exception rather than continuing - without setting a callback. + config + GPUEngine configuration object exception Optional exception, or tuple of exceptions, to catch during @@ -62,9 +171,23 @@ def execute_with_cudf( The NodeTraverser is mutated if the libcudf executor can handle the plan. """ + device = config.device + memory_resource = config.memory_resource + raise_on_fail = config.config.get("raise_on_fail", False) + if unsupported := (config.config.keys() - {"raise_on_fail"}): + raise ValueError( + f"Engine configuration contains unsupported settings {unsupported}" + ) try: with nvtx.annotate(message="ConvertIR", domain="cudf_polars"): - nt.set_udf(partial(_callback, translate_ir(nt))) + nt.set_udf( + partial( + _callback, + translate_ir(nt), + device=device, + memory_resource=memory_resource, + ) + ) except exception as e: if bool(int(os.environ.get("POLARS_VERBOSE", 0))): warnings.warn( diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py index dd3b771e305..3fe3e5557cb 100644 --- a/python/cudf_polars/cudf_polars/containers/column.py +++ b/python/cudf_polars/cudf_polars/containers/column.py @@ -84,6 +84,34 @@ def sorted_like(self, like: Column, /) -> Self: is_sorted=like.is_sorted, order=like.order, null_order=like.null_order ) + # TODO: Return Column once #16272 is fixed. + def astype(self, dtype: plc.DataType) -> plc.Column: + """ + Return the backing column as the requested dtype. + + Parameters + ---------- + dtype + Datatype to cast to. + + Returns + ------- + Column of requested type. + + Raises + ------ + RuntimeError + If the cast is unsupported. + + Notes + ----- + This only produces a copy if the requested dtype doesn't match + the current one. + """ + if self.obj.type() != dtype: + return plc.unary.cast(self.obj, dtype) + return self.obj + def copy_metadata(self, from_: pl.Series, /) -> Self: """ Copy metadata from a host series onto self. 
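For reviewers, a minimal usage sketch (not part of this patch) of how the new GPUEngine-driven configuration in callback.py is exercised from the polars side. It assumes a polars build that exposes pl.GPUEngine and accepts it via LazyFrame.collect(engine=...), plus a working RMM installation; any names not appearing in the diff are illustrative only.

    import polars as pl
    import rmm

    # The same kind of async pool resource that default_memory_resource()
    # constructs when no explicit resource is supplied.
    mr = rmm.mr.CudaAsyncMemoryResource()

    q = pl.LazyFrame({"a": [1, 2, 3], "b": [4, 5, 6]}).select(
        (pl.col("a") + pl.col("b")).alias("total")
    )

    # device and memory_resource map onto set_device()/set_memory_resource()
    # in the callback; raise_on_fail=True surfaces translation failures
    # instead of silently falling back to the CPU engine. Any other config
    # key is rejected with a ValueError by execute_with_cudf.
    result = q.collect(
        engine=pl.GPUEngine(device=0, memory_resource=mr, raise_on_fail=True)
    )
    print(result)
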
diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index a5c99e2bc11..f3e3862d0cc 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -7,7 +7,7 @@ import itertools from functools import cached_property -from typing import TYPE_CHECKING, cast +from typing import TYPE_CHECKING import pyarrow as pa import pylibcudf as plc @@ -45,11 +45,19 @@ def copy(self) -> Self: def to_polars(self) -> pl.DataFrame: """Convert to a polars DataFrame.""" + # If the arrow table has empty names, from_arrow produces + # column_$i. But here we know there is only one such column + # (by construction) and it should have an empty name. + # https://github.com/pola-rs/polars/issues/11632 + # To guarantee we produce correct names, we therefore + # serialise with names we control and rename with that map. + name_map = {f"column_{i}": c.name for i, c in enumerate(self.columns)} table: pa.Table = plc.interop.to_arrow( self.table, - [plc.interop.ColumnMetadata(name=c.name) for c in self.columns], + [plc.interop.ColumnMetadata(name=name) for name in name_map], ) - return cast(pl.DataFrame, pl.from_arrow(table)).with_columns( + df: pl.DataFrame = pl.from_arrow(table) + return df.rename(name_map).with_columns( *( pl.col(c.name).set_sorted( descending=c.order == plc.types.Order.DESCENDING diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index e1b4d30b76b..c401e5a2f17 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -21,8 +21,10 @@ from typing import TYPE_CHECKING, Any, ClassVar, NamedTuple import pyarrow as pa +import pyarrow.compute as pc import pylibcudf as plc +from polars.exceptions import InvalidOperationError from polars.polars import _expr_nodes as pl_expr from cudf_polars.containers import Column, NamedColumn @@ -477,12 +479,6 @@ def __init__( self.options = options self.name = name self.children = children - if ( - self.name in (pl_expr.BooleanFunction.Any, pl_expr.BooleanFunction.All) - and not self.options[0] - ): - # With ignore_nulls == False, polars uses Kleene logic - raise NotImplementedError(f"Kleene logic for {self.name}") if self.name == pl_expr.BooleanFunction.IsIn and not all( c.dtype == self.children[0].dtype for c in self.children ): @@ -577,20 +573,31 @@ def do_evaluate( child.evaluate(df, context=context, mapping=mapping) for child in self.children ] - if self.name == pl_expr.BooleanFunction.Any: + # Kleene logic for Any (OR) and All (AND) if ignore_nulls is + # False + if self.name in (pl_expr.BooleanFunction.Any, pl_expr.BooleanFunction.All): + (ignore_nulls,) = self.options (column,) = columns - return Column( - plc.Column.from_scalar( - plc.reduce.reduce(column.obj, plc.aggregation.any(), self.dtype), 1 - ) - ) - elif self.name == pl_expr.BooleanFunction.All: - (column,) = columns - return Column( - plc.Column.from_scalar( - plc.reduce.reduce(column.obj, plc.aggregation.all(), self.dtype), 1 - ) - ) + is_any = self.name == pl_expr.BooleanFunction.Any + agg = plc.aggregation.any() if is_any else plc.aggregation.all() + result = plc.reduce.reduce(column.obj, agg, self.dtype) + if not ignore_nulls and column.obj.null_count() > 0: + # Truth tables + # Any All + # | F U T | F U T + # --+------ --+------ + # F | F U T F | F F F + # U | U U T U | F U U + # T | T T T T | F U T + # + # If the input null count was non-zero, we must + # 
post-process the result to insert the correct value. + h_result = plc.interop.to_arrow(result).as_py() + if is_any and not h_result or not is_any and h_result: + # Any All + # False || Null => Null True && Null => Null + return Column(plc.Column.all_null_like(column.obj, 1)) + return Column(plc.Column.from_scalar(result, 1)) if self.name == pl_expr.BooleanFunction.IsNull: (column,) = columns return Column(plc.unary.is_null(column.obj)) @@ -598,13 +605,19 @@ def do_evaluate( (column,) = columns return Column(plc.unary.is_valid(column.obj)) elif self.name == pl_expr.BooleanFunction.IsNan: - # TODO: copy over null mask since is_nan(null) => null in polars (column,) = columns - return Column(plc.unary.is_nan(column.obj)) + return Column( + plc.unary.is_nan(column.obj).with_mask( + column.obj.null_mask(), column.obj.null_count() + ) + ) elif self.name == pl_expr.BooleanFunction.IsNotNan: - # TODO: copy over null mask since is_not_nan(null) => null in polars (column,) = columns - return Column(plc.unary.is_not_nan(column.obj)) + return Column( + plc.unary.is_not_nan(column.obj).with_mask( + column.obj.null_mask(), column.obj.null_count() + ) + ) elif self.name == pl_expr.BooleanFunction.IsFirstDistinct: (column,) = columns return self._distinct( @@ -654,26 +667,22 @@ def do_evaluate( ), ) elif self.name == pl_expr.BooleanFunction.AllHorizontal: - if any(c.obj.null_count() > 0 for c in columns): - raise NotImplementedError("Kleene logic for all_horizontal") return Column( reduce( partial( plc.binaryop.binary_operation, - op=plc.binaryop.BinaryOperator.BITWISE_AND, + op=plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, output_type=self.dtype, ), (c.obj for c in columns), ) ) elif self.name == pl_expr.BooleanFunction.AnyHorizontal: - if any(c.obj.null_count() > 0 for c in columns): - raise NotImplementedError("Kleene logic for any_horizontal") return Column( reduce( partial( plc.binaryop.binary_operation, - op=plc.binaryop.BinaryOperator.BITWISE_OR, + op=plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, output_type=self.dtype, ), (c.obj for c in columns), @@ -694,7 +703,7 @@ def do_evaluate( class StringFunction(Expr): - __slots__ = ("name", "options", "children") + __slots__ = ("name", "options", "children", "_regex_program") _non_child = ("dtype", "name", "options") children: tuple[Expr, ...] @@ -713,12 +722,18 @@ def __init__( def _validate_input(self): if self.name not in ( - pl_expr.StringFunction.Lowercase, - pl_expr.StringFunction.Uppercase, - pl_expr.StringFunction.EndsWith, - pl_expr.StringFunction.StartsWith, pl_expr.StringFunction.Contains, + pl_expr.StringFunction.EndsWith, + pl_expr.StringFunction.Lowercase, + pl_expr.StringFunction.Replace, + pl_expr.StringFunction.ReplaceMany, pl_expr.StringFunction.Slice, + pl_expr.StringFunction.Strptime, + pl_expr.StringFunction.StartsWith, + pl_expr.StringFunction.StripChars, + pl_expr.StringFunction.StripCharsStart, + pl_expr.StringFunction.StripCharsEnd, + pl_expr.StringFunction.Uppercase, ): raise NotImplementedError(f"String function {self.name}") if self.name == pl_expr.StringFunction.Contains: @@ -732,11 +747,65 @@ def _validate_input(self): raise NotImplementedError( "Regex contains only supports a scalar pattern" ) + pattern = self.children[1].value.as_py() + try: + self._regex_program = plc.strings.regex_program.RegexProgram.create( + pattern, + flags=plc.strings.regex_flags.RegexFlags.DEFAULT, + ) + except RuntimeError as e: + raise NotImplementedError( + f"Unsupported regex {pattern} for GPU engine." 
+ ) from e + elif self.name == pl_expr.StringFunction.Replace: + _, literal = self.options + if not literal: + raise NotImplementedError("literal=False is not supported for replace") + if not all(isinstance(expr, Literal) for expr in self.children[1:]): + raise NotImplementedError("replace only supports scalar target") + target = self.children[1] + if target.value == pa.scalar("", type=pa.string()): + raise NotImplementedError( + "libcudf replace does not support empty strings" + ) + elif self.name == pl_expr.StringFunction.ReplaceMany: + (ascii_case_insensitive,) = self.options + if ascii_case_insensitive: + raise NotImplementedError( + "ascii_case_insensitive not implemented for replace_many" + ) + if not all( + isinstance(expr, (LiteralColumn, Literal)) for expr in self.children[1:] + ): + raise NotImplementedError("replace_many only supports literal inputs") + target = self.children[1] + if pc.any(pc.equal(target.value, "")).as_py(): + raise NotImplementedError( + "libcudf replace_many is implemented differently from polars " + "for empty strings" + ) elif self.name == pl_expr.StringFunction.Slice: if not all(isinstance(child, Literal) for child in self.children[1:]): raise NotImplementedError( "Slice only supports literal start and stop values" ) + elif self.name == pl_expr.StringFunction.Strptime: + format, _, exact, cache = self.options + if cache: + raise NotImplementedError("Strptime cache is a CPU feature") + if format is None: + raise NotImplementedError("Strptime format is required") + if not exact: + raise NotImplementedError("Strptime does not support exact=False") + elif self.name in { + pl_expr.StringFunction.StripChars, + pl_expr.StringFunction.StripCharsStart, + pl_expr.StringFunction.StripCharsEnd, + }: + if not isinstance(self.children[1], Literal): + raise NotImplementedError( + "strip operations only support scalar patterns" + ) def do_evaluate( self, @@ -759,12 +828,10 @@ def do_evaluate( else pat.obj ) return Column(plc.strings.find.contains(column.obj, pattern)) - assert isinstance(arg, Literal) - prog = plc.strings.regex_program.RegexProgram.create( - arg.value.as_py(), - flags=plc.strings.regex_flags.RegexFlags.DEFAULT, - ) - return Column(plc.strings.contains.contains_re(column.obj, prog)) + else: + return Column( + plc.strings.contains.contains_re(column.obj, self._regex_program) + ) elif self.name == pl_expr.StringFunction.Slice: child, expr_offset, expr_length = self.children assert isinstance(expr_offset, Literal) @@ -795,6 +862,22 @@ def do_evaluate( plc.interop.from_arrow(pa.scalar(stop, type=pa.int32())), ) ) + elif self.name in { + pl_expr.StringFunction.StripChars, + pl_expr.StringFunction.StripCharsStart, + pl_expr.StringFunction.StripCharsEnd, + }: + column, chars = ( + c.evaluate(df, context=context, mapping=mapping) for c in self.children + ) + if self.name == pl_expr.StringFunction.StripCharsStart: + side = plc.strings.SideType.LEFT + elif self.name == pl_expr.StringFunction.StripCharsEnd: + side = plc.strings.SideType.RIGHT + else: + side = plc.strings.SideType.BOTH + return Column(plc.strings.strip.strip(column.obj, side, chars.obj_scalar)) + columns = [ child.evaluate(df, context=context, mapping=mapping) for child in self.children @@ -825,6 +908,51 @@ def do_evaluate( else prefix.obj, ) ) + elif self.name == pl_expr.StringFunction.Strptime: + # TODO: ignores ambiguous + format, strict, exact, cache = self.options + col = self.children[0].evaluate(df, context=context, mapping=mapping) + + is_timestamps = 
plc.strings.convert.convert_datetime.is_timestamp( + col.obj, format.encode() + ) + + if strict: + if not plc.interop.to_arrow( + plc.reduce.reduce( + is_timestamps, + plc.aggregation.all(), + plc.DataType(plc.TypeId.BOOL8), + ) + ).as_py(): + raise InvalidOperationError("conversion from `str` failed.") + else: + not_timestamps = plc.unary.unary_operation( + is_timestamps, plc.unary.UnaryOperator.NOT + ) + + null = plc.interop.from_arrow(pa.scalar(None, type=pa.string())) + res = plc.copying.boolean_mask_scatter( + [null], plc.Table([col.obj]), not_timestamps + ) + return Column( + plc.strings.convert.convert_datetime.to_timestamps( + res.columns()[0], self.dtype, format.encode() + ) + ) + elif self.name == pl_expr.StringFunction.Replace: + column, target, repl = columns + n, _ = self.options + return Column( + plc.strings.replace.replace( + column.obj, target.obj_scalar, repl.obj_scalar, maxrepl=n + ) + ) + elif self.name == pl_expr.StringFunction.ReplaceMany: + column, target, repl = columns + return Column( + plc.strings.replace.replace_multiple(column.obj, target.obj, repl.obj) + ) raise NotImplementedError( f"StringFunction {self.name}" ) # pragma: no cover; handled by init raising @@ -832,6 +960,18 @@ def do_evaluate( class TemporalFunction(Expr): __slots__ = ("name", "options", "children") + _COMPONENT_MAP: ClassVar[dict[pl_expr.TemporalFunction, str]] = { + pl_expr.TemporalFunction.Year: "year", + pl_expr.TemporalFunction.Month: "month", + pl_expr.TemporalFunction.Day: "day", + pl_expr.TemporalFunction.WeekDay: "weekday", + pl_expr.TemporalFunction.Hour: "hour", + pl_expr.TemporalFunction.Minute: "minute", + pl_expr.TemporalFunction.Second: "second", + pl_expr.TemporalFunction.Millisecond: "millisecond", + pl_expr.TemporalFunction.Microsecond: "microsecond", + pl_expr.TemporalFunction.Nanosecond: "nanosecond", + } _non_child = ("dtype", "name", "options") children: tuple[Expr, ...] 
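The Any/All truth tables added as comments above are easy to misread in flattened diff form, so here is a stand-alone illustration (not from the patch) of the same Kleene semantics in plain Python, with None standing in for a null:

    from itertools import product

    def kleene_or(a, b):
        # True wins outright; otherwise a null makes the result null.
        if a is True or b is True:
            return True
        if a is None or b is None:
            return None
        return False

    def kleene_and(a, b):
        # False wins outright; otherwise a null makes the result null.
        if a is False or b is False:
            return False
        if a is None or b is None:
            return None
        return True

    for a, b in product([False, None, True], repeat=2):
        print(f"{a!r:>5} OR {b!r:>5} -> {kleene_or(a, b)!r:>5}    "
              f"{a!r:>5} AND {b!r:>5} -> {kleene_and(a, b)!r}")

This is why the reduction in do_evaluate only needs post-processing when the input column contained nulls: a non-null Any result of False (or All result of True) could have been turned into null by the ignored nulls, so the result is replaced with an all-null column, while True for Any and False for All are returned unchanged.
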
@@ -846,8 +986,8 @@ def __init__( self.options = options self.name = name self.children = children - if self.name != pl_expr.TemporalFunction.Year: - raise NotImplementedError(f"String function {self.name}") + if self.name not in self._COMPONENT_MAP: + raise NotImplementedError(f"Temporal function {self.name}") def do_evaluate( self, @@ -861,12 +1001,59 @@ def do_evaluate( child.evaluate(df, context=context, mapping=mapping) for child in self.children ] - if self.name == pl_expr.TemporalFunction.Year: - (column,) = columns - return Column(plc.datetime.extract_year(column.obj)) - raise NotImplementedError( - f"TemporalFunction {self.name}" - ) # pragma: no cover; init trips first + (column,) = columns + if self.name == pl_expr.TemporalFunction.Microsecond: + millis = plc.datetime.extract_datetime_component(column.obj, "millisecond") + micros = plc.datetime.extract_datetime_component(column.obj, "microsecond") + millis_as_micros = plc.binaryop.binary_operation( + millis, + plc.interop.from_arrow(pa.scalar(1_000, type=pa.int32())), + plc.binaryop.BinaryOperator.MUL, + plc.DataType(plc.TypeId.INT32), + ) + total_micros = plc.binaryop.binary_operation( + micros, + millis_as_micros, + plc.binaryop.BinaryOperator.ADD, + plc.types.DataType(plc.types.TypeId.INT32), + ) + return Column(total_micros) + elif self.name == pl_expr.TemporalFunction.Nanosecond: + millis = plc.datetime.extract_datetime_component(column.obj, "millisecond") + micros = plc.datetime.extract_datetime_component(column.obj, "microsecond") + nanos = plc.datetime.extract_datetime_component(column.obj, "nanosecond") + millis_as_nanos = plc.binaryop.binary_operation( + millis, + plc.interop.from_arrow(pa.scalar(1_000_000, type=pa.int32())), + plc.binaryop.BinaryOperator.MUL, + plc.types.DataType(plc.types.TypeId.INT32), + ) + micros_as_nanos = plc.binaryop.binary_operation( + micros, + plc.interop.from_arrow(pa.scalar(1_000, type=pa.int32())), + plc.binaryop.BinaryOperator.MUL, + plc.types.DataType(plc.types.TypeId.INT32), + ) + total_nanos = plc.binaryop.binary_operation( + nanos, + millis_as_nanos, + plc.binaryop.BinaryOperator.ADD, + plc.types.DataType(plc.types.TypeId.INT32), + ) + total_nanos = plc.binaryop.binary_operation( + total_nanos, + micros_as_nanos, + plc.binaryop.BinaryOperator.ADD, + plc.types.DataType(plc.types.TypeId.INT32), + ) + return Column(total_nanos) + + return Column( + plc.datetime.extract_datetime_component( + column.obj, + self._COMPONENT_MAP[self.name], + ) + ) class UnaryFunction(Expr): @@ -874,6 +1061,51 @@ class UnaryFunction(Expr): _non_child = ("dtype", "name", "options") children: tuple[Expr, ...] 
+ # Note: log, and pow are handled via translation to binops + _OP_MAPPING: ClassVar[dict[str, plc.unary.UnaryOperator]] = { + "sin": plc.unary.UnaryOperator.SIN, + "cos": plc.unary.UnaryOperator.COS, + "tan": plc.unary.UnaryOperator.TAN, + "arcsin": plc.unary.UnaryOperator.ARCSIN, + "arccos": plc.unary.UnaryOperator.ARCCOS, + "arctan": plc.unary.UnaryOperator.ARCTAN, + "sinh": plc.unary.UnaryOperator.SINH, + "cosh": plc.unary.UnaryOperator.COSH, + "tanh": plc.unary.UnaryOperator.TANH, + "arcsinh": plc.unary.UnaryOperator.ARCSINH, + "arccosh": plc.unary.UnaryOperator.ARCCOSH, + "arctanh": plc.unary.UnaryOperator.ARCTANH, + "exp": plc.unary.UnaryOperator.EXP, + "sqrt": plc.unary.UnaryOperator.SQRT, + "cbrt": plc.unary.UnaryOperator.CBRT, + "ceil": plc.unary.UnaryOperator.CEIL, + "floor": plc.unary.UnaryOperator.FLOOR, + "abs": plc.unary.UnaryOperator.ABS, + "bit_invert": plc.unary.UnaryOperator.BIT_INVERT, + "not": plc.unary.UnaryOperator.NOT, + } + _supported_misc_fns = frozenset( + { + "drop_nulls", + "fill_null", + "mask_nans", + "round", + "set_sorted", + "unique", + } + ) + _supported_cum_aggs = frozenset( + { + "cum_min", + "cum_max", + "cum_prod", + "cum_sum", + } + ) + _supported_fns = frozenset().union( + _supported_misc_fns, _supported_cum_aggs, _OP_MAPPING.keys() + ) + def __init__( self, dtype: plc.DataType, name: str, options: tuple[Any, ...], *children: Expr ) -> None: @@ -881,15 +1113,15 @@ def __init__( self.name = name self.options = options self.children = children - if self.name not in ( - "mask_nans", - "round", - "setsorted", - "unique", - "dropnull", - "fill_null", - ): + + if self.name not in UnaryFunction._supported_fns: raise NotImplementedError(f"Unary function {name=}") + if self.name in UnaryFunction._supported_cum_aggs: + (reverse,) = self.options + if reverse: + raise NotImplementedError( + "reverse=True is not supported for cumulative aggregations" + ) def do_evaluate( self, @@ -947,7 +1179,7 @@ def do_evaluate( if maintain_order: return Column(column).sorted_like(values) return Column(column) - elif self.name == "setsorted": + elif self.name == "set_sorted": (column,) = ( child.evaluate(df, context=context, mapping=mapping) for child in self.children @@ -974,7 +1206,7 @@ def do_evaluate( order=order, null_order=null_order, ) - elif self.name == "dropnull": + elif self.name == "drop_nulls": (column,) = ( child.evaluate(df, context=context, mapping=mapping) for child in self.children @@ -994,13 +1226,65 @@ def do_evaluate( ) arg = evaluated.obj_scalar if evaluated.is_scalar else evaluated.obj return Column(plc.replace.replace_nulls(column.obj, arg)) - + elif self.name in self._OP_MAPPING: + column = self.children[0].evaluate(df, context=context, mapping=mapping) + if column.obj.type().id() != self.dtype.id(): + arg = plc.unary.cast(column.obj, self.dtype) + else: + arg = column.obj + return Column(plc.unary.unary_operation(arg, self._OP_MAPPING[self.name])) + elif self.name in UnaryFunction._supported_cum_aggs: + column = self.children[0].evaluate(df, context=context, mapping=mapping) + plc_col = column.obj + col_type = column.obj.type() + # cum_sum casts + # Int8, UInt8, Int16, UInt16 -> Int64 for overflow prevention + # Bool -> UInt32 + # cum_prod casts integer dtypes < int64 and bool to int64 + # See: + # https://github.com/pola-rs/polars/blob/main/crates/polars-ops/src/series/ops/cum_agg.rs + if ( + self.name == "cum_sum" + and col_type.id() + in { + plc.types.TypeId.INT8, + plc.types.TypeId.UINT8, + plc.types.TypeId.INT16, + 
plc.types.TypeId.UINT16, + } + ) or ( + self.name == "cum_prod" + and plc.traits.is_integral(col_type) + and plc.types.size_of(col_type) <= 4 + ): + plc_col = plc.unary.cast( + plc_col, plc.types.DataType(plc.types.TypeId.INT64) + ) + elif ( + self.name == "cum_sum" + and column.obj.type().id() == plc.types.TypeId.BOOL8 + ): + plc_col = plc.unary.cast( + plc_col, plc.types.DataType(plc.types.TypeId.UINT32) + ) + if self.name == "cum_sum": + agg = plc.aggregation.sum() + elif self.name == "cum_prod": + agg = plc.aggregation.product() + elif self.name == "cum_min": + agg = plc.aggregation.min() + elif self.name == "cum_max": + agg = plc.aggregation.max() + + return Column(plc.reduce.scan(plc_col, agg, plc.reduce.ScanType.INCLUSIVE)) raise NotImplementedError( f"Unimplemented unary function {self.name=}" ) # pragma: no cover; init trips first def collect_agg(self, *, depth: int) -> AggInfo: """Collect information about aggregations in groupbys.""" + if self.name in {"unique", "drop_nulls"} | self._supported_cum_aggs: + raise NotImplementedError(f"{self.name} in groupby") if depth == 1: # inside aggregation, need to pre-evaluate, groupby # construction has checked that we don't have nested aggs, @@ -1187,11 +1471,7 @@ class Cast(Expr): def __init__(self, dtype: plc.DataType, value: Expr) -> None: super().__init__(dtype) self.children = (value,) - if not ( - plc.traits.is_fixed_width(self.dtype) - and plc.traits.is_fixed_width(value.dtype) - and plc.unary.is_supported_cast(value.dtype, self.dtype) - ): + if not dtypes.can_cast(value.dtype, self.dtype): raise NotImplementedError( f"Can't cast {self.dtype.id().name} to {value.dtype.id().name}" ) @@ -1255,6 +1535,13 @@ def __init__( req = plc.aggregation.variance(ddof=options) elif name == "count": req = plc.aggregation.count(null_handling=plc.types.NullPolicy.EXCLUDE) + elif name == "quantile": + _, quantile = self.children + if not isinstance(quantile, Literal): + raise NotImplementedError("Only support literal quantile values") + req = plc.aggregation.quantile( + quantiles=[quantile.value.as_py()], interp=Agg.interp_mapping[options] + ) else: raise NotImplementedError( f"Unreachable, {name=} is incorrectly listed in _SUPPORTED" @@ -1286,9 +1573,18 @@ def __init__( "count", "std", "var", + "quantile", ] ) + interp_mapping: ClassVar[dict[str, plc.types.Interpolation]] = { + "nearest": plc.types.Interpolation.NEAREST, + "higher": plc.types.Interpolation.HIGHER, + "lower": plc.types.Interpolation.LOWER, + "midpoint": plc.types.Interpolation.MIDPOINT, + "linear": plc.types.Interpolation.LINEAR, + } + def collect_agg(self, *, depth: int) -> AggInfo: """Collect information about aggregations in groupbys.""" if depth >= 1: @@ -1299,7 +1595,19 @@ def collect_agg(self, *, depth: int) -> AggInfo: raise NotImplementedError("Nan propagation in groupby for min/max") (child,) = self.children ((expr, _, _),) = child.collect_agg(depth=depth + 1).requests - if self.request is None: + request = self.request + # These are handled specially here because we don't set up the + # request for the whole-frame agg because we can avoid a + # reduce for these. 
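+        # "first" and "last" are translated to nth_element(0) and
+        # nth_element(-1) on the grouped values, with nulls included.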
+ if self.name == "first": + request = plc.aggregation.nth_element( + 0, null_handling=plc.types.NullPolicy.INCLUDE + ) + elif self.name == "last": + request = plc.aggregation.nth_element( + -1, null_handling=plc.types.NullPolicy.INCLUDE + ) + if request is None: raise NotImplementedError( f"Aggregation {self.name} in groupby" ) # pragma: no cover; __init__ trips first @@ -1308,7 +1616,7 @@ def collect_agg(self, *, depth: int) -> AggInfo: # Ignore nans in these groupby aggs, do this by masking # nans in the input expr = UnaryFunction(self.dtype, "mask_nans", (), expr) - return AggInfo([(expr, self.request, self)]) + return AggInfo([(expr, request, self)]) def _reduce( self, column: Column, *, request: plc.aggregation.Aggregation @@ -1380,7 +1688,10 @@ def do_evaluate( raise NotImplementedError( f"Agg in context {context}" ) # pragma: no cover; unreachable - (child,) = self.children + + # Aggregations like quantiles may have additional children that were + # preprocessed into pylibcudf requests. + child = self.children[0] return self.op(child.evaluate(df, context=context, mapping=mapping)) @@ -1425,6 +1736,11 @@ def __init__( right: Expr, ) -> None: super().__init__(dtype) + if plc.traits.is_boolean(self.dtype): + # For boolean output types, bitand and bitor implement + # boolean logic, so translate. bitxor also does, but the + # default behaviour is correct. + op = BinOp._BOOL_KLEENE_MAPPING.get(op, op) self.op = op self.children = (left, right) if not plc.binaryop.is_supported_operation( @@ -1436,6 +1752,15 @@ def __init__( f"with output type {self.dtype.id().name}" ) + _BOOL_KLEENE_MAPPING: ClassVar[ + dict[plc.binaryop.BinaryOperator, plc.binaryop.BinaryOperator] + ] = { + plc.binaryop.BinaryOperator.BITWISE_AND: plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, + plc.binaryop.BinaryOperator.BITWISE_OR: plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, + plc.binaryop.BinaryOperator.LOGICAL_AND: plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, + plc.binaryop.BinaryOperator.LOGICAL_OR: plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, + } + _MAPPING: ClassVar[dict[pl_expr.Operator, plc.binaryop.BinaryOperator]] = { pl_expr.Operator.Eq: plc.binaryop.BinaryOperator.EQUAL, pl_expr.Operator.EqValidity: plc.binaryop.BinaryOperator.NULL_EQUALS, diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index e334e6f5cc5..8cd56c8ee3a 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -15,7 +15,6 @@ import dataclasses import itertools -import types from functools import cache from pathlib import Path from typing import TYPE_CHECKING, Any, ClassVar @@ -28,7 +27,7 @@ import cudf_polars.dsl.expr as expr from cudf_polars.containers import DataFrame, NamedColumn -from cudf_polars.utils import sorting +from cudf_polars.utils import dtypes, sorting if TYPE_CHECKING: from collections.abc import Callable, MutableMapping @@ -133,8 +132,7 @@ class IR: def __post_init__(self): """Validate preconditions.""" - if any(dtype.id() == plc.TypeId.EMPTY for dtype in self.schema.values()): - raise NotImplementedError("Cannot make empty columns.") + pass # noqa: PIE790 def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """ @@ -189,32 +187,42 @@ class Scan(IR): """Cloud-related authentication options, currently ignored.""" paths: list[str] """List of paths to read from.""" - file_options: Any - """Options for reading the file. - - Attributes are: - - ``with_columns: list[str]`` of projected columns to return. 
- - ``n_rows: int``: Number of rows to read. - - ``row_index: tuple[name, offset] | None``: Add an integer index - column with given name. - """ + with_columns: list[str] + """Projected columns to return.""" + skip_rows: int + """Rows to skip at the start when reading.""" + n_rows: int + """Number of rows to read after skipping.""" + row_index: tuple[str, int] | None + """If not None add an integer index column of the given name.""" predicate: expr.NamedExpr | None """Mask to apply to the read dataframe.""" def __post_init__(self) -> None: """Validate preconditions.""" + super().__post_init__() if self.typ not in ("csv", "parquet", "ndjson"): # pragma: no cover # This line is unhittable ATM since IPC/Anonymous scan raise # on the polars side raise NotImplementedError(f"Unhandled scan type: {self.typ}") - if self.typ == "ndjson" and self.file_options.n_rows is not None: - raise NotImplementedError("row limit in scan") + if self.typ == "ndjson" and (self.n_rows != -1 or self.skip_rows != 0): + raise NotImplementedError("row limit in scan for json reader") + if self.skip_rows < 0: + # TODO: polars has this implemented for parquet, + # maybe we can do this too? + raise NotImplementedError("slice pushdown for negative slices") + if self.typ == "csv" and self.skip_rows != 0: # pragma: no cover + # This comes from slice pushdown, but that + # optimization doesn't happen right now + raise NotImplementedError("skipping rows in CSV reader") if self.cloud_options is not None and any( self.cloud_options.get(k) is not None for k in ("aws", "azure", "gcp") ): raise NotImplementedError( "Read from cloud storage" ) # pragma: no cover; no test yet + if any(p.startswith("https://") for p in self.paths): + raise NotImplementedError("Read from https") if self.typ == "csv": if self.reader_options["skip_rows_after_header"] != 0: raise NotImplementedError("Skipping rows after header in CSV reader") @@ -242,13 +250,21 @@ def __post_init__(self) -> None: raise NotImplementedError( "ignore_errors is not supported in the JSON reader" ) + elif ( + self.typ == "parquet" + and self.row_index is not None + and self.with_columns is not None + and len(self.with_columns) == 0 + ): + raise NotImplementedError( + "Reading only parquet metadata to produce row index." 
+ ) def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" - options = self.file_options - with_columns = options.with_columns - row_index = options.row_index - nrows = self.file_options.n_rows if self.file_options.n_rows is not None else -1 + with_columns = self.with_columns + row_index = self.row_index + n_rows = self.n_rows if self.typ == "csv": parse_options = self.reader_options["parse_options"] sep = chr(parse_options["separator"]) @@ -256,7 +272,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: eol = chr(parse_options["eol_char"]) if self.reader_options["schema"] is not None: # Reader schema provides names - column_names = list(self.reader_options["schema"]["inner"].keys()) + column_names = list(self.reader_options["schema"]["fields"].keys()) else: # file provides column names column_names = None @@ -282,6 +298,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: # polars skips blank lines at the beginning of the file pieces = [] + read_partial = n_rows != -1 for p in self.paths: skiprows = self.reader_options["skip_rows"] path = Path(p) @@ -303,9 +320,13 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: comment=comment, decimal=decimal, dtypes=self.schema, - nrows=nrows, + nrows=n_rows, ) pieces.append(tbl_w_meta) + if read_partial: + n_rows -= tbl_w_meta.tbl.num_rows() + if n_rows <= 0: + break tables, colnames = zip( *( (piece.tbl, piece.column_names(include_children=False)) @@ -321,7 +342,8 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: tbl_w_meta = plc.io.parquet.read_parquet( plc.io.SourceInfo(self.paths), columns=with_columns, - nrows=nrows, + nrows=n_rows, + skip_rows=self.skip_rows, ) df = DataFrame.from_table( tbl_w_meta.tbl, @@ -354,12 +376,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: raise NotImplementedError( f"Unhandled scan type: {self.typ}" ) # pragma: no cover; post init trips first - if ( - row_index is not None - # TODO: remove condition when dropping support for polars 1.0 - # https://github.com/pola-rs/polars/pull/17363 - and row_index[0] in self.schema - ): + if row_index is not None: name, offset = row_index dtype = self.schema[name] step = plc.interop.from_arrow( @@ -481,36 +498,6 @@ def evaluate( return DataFrame(columns) -def placeholder_column(n: int) -> plc.Column: - """ - Produce a placeholder pylibcudf column with NO BACKING DATA. - - Parameters - ---------- - n - Number of rows the column will advertise - - Returns - ------- - pylibcudf Column that is almost unusable. DO NOT ACCESS THE DATA BUFFER. - - Notes - ----- - This is used to avoid allocating data for count aggregations. 
- """ - return plc.Column( - plc.DataType(plc.TypeId.INT8), - n, - plc.gpumemoryview( - types.SimpleNamespace(__cuda_array_interface__={"data": (1, True)}) - ), - None, - 0, - 0, - [], - ) - - @dataclasses.dataclass class GroupBy(IR): """Perform a groupby.""" @@ -557,8 +544,7 @@ def check_agg(agg: expr.Expr) -> int: def __post_init__(self) -> None: """Check whether all the aggregations are implemented.""" - if self.options.rolling is None and self.maintain_order: - raise NotImplementedError("Maintaining order in groupby") + super().__post_init__() if self.options.rolling: raise NotImplementedError( "rolling window/groupby" @@ -566,6 +552,8 @@ def __post_init__(self) -> None: if any(GroupBy.check_agg(a.value) > 1 for a in self.agg_requests): raise NotImplementedError("Nested aggregations in groupby") self.agg_infos = [req.collect_agg(depth=0) for req in self.agg_requests] + if len(self.keys) == 0: + raise NotImplementedError("dynamic groupby") def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" @@ -591,7 +579,10 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: for info in self.agg_infos: for pre_eval, req, rep in info.requests: if pre_eval is None: - col = placeholder_column(df.num_rows) + # A count aggregation, doesn't touch the column, + # but we need to have one. Rather than evaluating + # one, just use one of the key columns. + col = keys[0].obj else: col = pre_eval.evaluate(df).obj requests.append(plc.groupby.GroupByRequest(col, [req])) @@ -611,7 +602,34 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: results = [ req.evaluate(result_subs, mapping=mapping) for req in self.agg_requests ] - return DataFrame(broadcast(*result_keys, *results)).slice(self.options.slice) + broadcasted = broadcast(*result_keys, *results) + result_keys = broadcasted[: len(result_keys)] + results = broadcasted[len(result_keys) :] + # Handle order preservation of groups + # like cudf classic does + # https://github.com/rapidsai/cudf/blob/5780c4d8fb5afac2e04988a2ff5531f94c22d3a3/python/cudf/cudf/core/groupby/groupby.py#L723-L743 + if self.maintain_order and not sorted: + left = plc.stream_compaction.stable_distinct( + plc.Table([k.obj for k in keys]), + list(range(group_keys.num_columns())), + plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST, + plc.types.NullEquality.EQUAL, + plc.types.NanEquality.ALL_EQUAL, + ) + right = plc.Table([key.obj for key in result_keys]) + _, indices = plc.join.left_join(left, right, plc.types.NullEquality.EQUAL) + ordered_table = plc.copying.gather( + plc.Table([col.obj for col in broadcasted]), + indices, + plc.copying.OutOfBoundsPolicy.DONT_CHECK, + ) + broadcasted = [ + NamedColumn(reordered, b.name) + for reordered, b in zip( + ordered_table.columns(), broadcasted, strict=True + ) + ] + return DataFrame(broadcasted).slice(self.options.slice) @dataclasses.dataclass @@ -627,7 +645,7 @@ class Join(IR): right_on: list[expr.NamedExpr] """List of expressions used as keys in the right frame.""" options: tuple[ - Literal["inner", "left", "full", "leftsemi", "leftanti", "cross"], + Literal["inner", "left", "right", "full", "leftsemi", "leftanti", "cross"], bool, tuple[int, int] | None, str | None, @@ -644,6 +662,7 @@ class Join(IR): def __post_init__(self) -> None: """Validate preconditions.""" + super().__post_init__() if any( isinstance(e.value, expr.Literal) for e in itertools.chain(self.left_on, self.right_on) @@ -653,7 +672,7 @@ def __post_init__(self) -> 
None: @staticmethod @cache def _joiners( - how: Literal["inner", "left", "full", "leftsemi", "leftanti"], + how: Literal["inner", "left", "right", "full", "leftsemi", "leftanti"], ) -> tuple[ Callable, plc.copying.OutOfBoundsPolicy, plc.copying.OutOfBoundsPolicy | None ]: @@ -663,7 +682,7 @@ def _joiners( plc.copying.OutOfBoundsPolicy.DONT_CHECK, plc.copying.OutOfBoundsPolicy.DONT_CHECK, ) - elif how == "left": + elif how == "left" or how == "right": return ( plc.join.left_join, plc.copying.OutOfBoundsPolicy.DONT_CHECK, @@ -687,8 +706,7 @@ def _joiners( plc.copying.OutOfBoundsPolicy.DONT_CHECK, None, ) - else: - assert_never(how) + assert_never(how) def _reorder_maps( self, @@ -786,8 +804,12 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: table = plc.copying.gather(left.table, lg, left_policy) result = DataFrame.from_table(table, left.column_names) else: + if how == "right": + # Right join is a left join with the tables swapped + left, right = right, left + left_on, right_on = right_on, left_on lg, rg = join_fn(left_on.table, right_on.table, null_equality) - if how == "left": + if how == "left" or how == "right": # Order of left table is preserved lg, rg = self._reorder_maps( left.num_rows, lg, left_policy, right.num_rows, rg, right_policy @@ -815,6 +837,9 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: ) ) right = right.discard_columns(right_on.column_names_set) + if how == "right": + # Undo the swap for right join before gluing together. + left, right = right, left right = right.rename_columns( { name: f"{name}{suffix}" @@ -1065,11 +1090,13 @@ class MapFunction(IR): # "merge_sorted", "rename", "explode", + "unpivot", ] ) def __post_init__(self) -> None: """Validate preconditions.""" + super().__post_init__() if self.name not in MapFunction._NAMES: raise NotImplementedError(f"Unhandled map function {self.name}") if self.name == "explode": @@ -1086,6 +1113,22 @@ def __post_init__(self) -> None: set(new) & (set(self.df.schema.keys() - set(old))) ): raise NotImplementedError("Duplicate new names in rename.") + elif self.name == "unpivot": + indices, pivotees, variable_name, value_name = self.options + value_name = "value" if value_name is None else value_name + variable_name = "variable" if variable_name is None else variable_name + if len(pivotees) == 0: + index = frozenset(indices) + pivotees = [name for name in self.df.schema if name not in index] + if not all( + dtypes.can_cast(self.df.schema[p], self.schema[value_name]) + for p in pivotees + ): + raise NotImplementedError( + "Unpivot cannot cast all input columns to " + f"{self.schema[value_name].id()}" + ) + self.options = (indices, pivotees, variable_name, value_name) def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" @@ -1107,6 +1150,41 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return DataFrame.from_table( plc.lists.explode_outer(df.table, index), df.column_names ).sorted_like(df, subset=subset) + elif self.name == "unpivot": + indices, pivotees, variable_name, value_name = self.options + npiv = len(pivotees) + df = self.df.evaluate(cache=cache) + index_columns = [ + NamedColumn(col, name) + for col, name in zip( + plc.reshape.tile(df.select(indices).table, npiv).columns(), + indices, + strict=True, + ) + ] + (variable_column,) = plc.filling.repeat( + plc.Table( + [ + plc.interop.from_arrow( + pa.array( + pivotees, + type=plc.interop.to_arrow(self.schema[variable_name]), + ), + 
) + ] + ), + df.num_rows, + ).columns() + value_column = plc.concatenate.concatenate( + [c.astype(self.schema[value_name]) for c in df.select(pivotees).columns] + ) + return DataFrame( + [ + *index_columns, + NamedColumn(variable_column, variable_name), + NamedColumn(value_column, value_name), + ] + ) else: raise AssertionError("Should never be reached") # pragma: no cover @@ -1122,6 +1200,7 @@ class Union(IR): def __post_init__(self) -> None: """Validate preconditions.""" + super().__post_init__() schema = self.dfs[0].schema if not all(s.schema == schema for s in self.dfs[1:]): raise NotImplementedError("Schema mismatch") diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 6dc97c7cb51..a0291037f01 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -75,13 +75,12 @@ def _translate_ir( def _( node: pl_ir.PythonScan, visitor: NodeTraverser, schema: dict[str, plc.DataType] ) -> ir.IR: - return ir.PythonScan( - schema, - node.options, - translate_named_expr(visitor, n=node.predicate) - if node.predicate is not None - else None, + scan_fn, with_columns, source_type, predicate, nrows = node.options + options = (scan_fn, with_columns, source_type, nrows) + predicate = ( + translate_named_expr(visitor, n=predicate) if predicate is not None else None ) + return ir.PythonScan(schema, options, predicate) @_translate_ir.register @@ -94,13 +93,27 @@ def _( cloud_options = None else: reader_options, cloud_options = map(json.loads, options) + file_options = node.file_options + with_columns = file_options.with_columns + n_rows = file_options.n_rows + if n_rows is None: + n_rows = -1 # All rows + skip_rows = 0 # Don't skip + else: + # TODO: with versioning, rename on the rust side + skip_rows, n_rows = n_rows + + row_index = file_options.row_index return ir.Scan( schema, typ, reader_options, cloud_options, node.paths, - node.file_options, + with_columns, + skip_rows, + n_rows, + row_index, translate_named_expr(visitor, n=node.predicate) if node.predicate is not None else None, @@ -293,10 +306,28 @@ def translate_ir(visitor: NodeTraverser, *, n: int | None = None) -> ir.IR: ctx: AbstractContextManager[None] = ( set_node(visitor, n) if n is not None else noop_context ) + # IR is versioned with major.minor, minor is bumped for backwards + # compatible changes (e.g. adding new nodes), major is bumped for + # incompatible changes (e.g. renaming nodes). + # Polars 1.7 changes definition of the CSV reader options schema name. + if (version := visitor.version()) >= (3, 0): + raise NotImplementedError( + f"No support for polars IR {version=}" + ) # pragma: no cover; no such version for now. + with ctx: + polars_schema = visitor.get_schema() node = visitor.view_current_node() - schema = {k: dtypes.from_polars(v) for k, v in visitor.get_schema().items()} - return _translate_ir(node, visitor, schema) + schema = {k: dtypes.from_polars(v) for k, v in polars_schema.items()} + result = _translate_ir(node, visitor, schema) + if any( + isinstance(dtype, pl.Null) + for dtype in pl.datatypes.unpack_dtypes(*polars_schema.values()) + ): + raise NotImplementedError( + f"No GPU support for {result} with Null column dtype." 
+ ) + return result def translate_named_expr( @@ -345,6 +376,24 @@ def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> ex name, *options = node.function_data options = tuple(options) if isinstance(name, pl_expr.StringFunction): + if name in { + pl_expr.StringFunction.StripChars, + pl_expr.StringFunction.StripCharsStart, + pl_expr.StringFunction.StripCharsEnd, + }: + column, chars = (translate_expr(visitor, n=n) for n in node.input) + if isinstance(chars, expr.Literal): + if chars.value == pa.scalar(""): + # No-op in polars, but libcudf uses empty string + # as signifier to remove whitespace. + return column + elif chars.value == pa.scalar(None): + # Polars uses None to mean "strip all whitespace" + chars = expr.Literal( + column.dtype, + pa.scalar("", type=plc.interop.to_arrow(column.dtype)), + ) + return expr.StringFunction(dtype, name, options, column, chars) return expr.StringFunction( dtype, name, @@ -369,19 +418,43 @@ def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> ex *(translate_expr(visitor, n=n) for n in node.input), ) elif isinstance(name, pl_expr.TemporalFunction): - return expr.TemporalFunction( + # functions for which evaluation of the expression may not return + # the same dtype as polars, either due to libcudf returning a different + # dtype, or due to our internal processing affecting what libcudf returns + needs_cast = { + pl_expr.TemporalFunction.Year, + pl_expr.TemporalFunction.Month, + pl_expr.TemporalFunction.Day, + pl_expr.TemporalFunction.WeekDay, + pl_expr.TemporalFunction.Hour, + pl_expr.TemporalFunction.Minute, + pl_expr.TemporalFunction.Second, + pl_expr.TemporalFunction.Millisecond, + } + result_expr = expr.TemporalFunction( dtype, name, options, *(translate_expr(visitor, n=n) for n in node.input), ) + if name in needs_cast: + return expr.Cast(dtype, result_expr) + return result_expr + elif isinstance(name, str): - return expr.UnaryFunction( - dtype, - name, - options, - *(translate_expr(visitor, n=n) for n in node.input), - ) + children = (translate_expr(visitor, n=n) for n in node.input) + if name == "log": + (base,) = options + (child,) = children + return expr.BinOp( + dtype, + plc.binaryop.BinaryOperator.LOG_BASE, + child, + expr.Literal(dtype, pa.scalar(base, type=plc.interop.to_arrow(dtype))), + ) + elif name == "pow": + return expr.BinOp(dtype, plc.binaryop.BinaryOperator.POW, *children) + return expr.UnaryFunction(dtype, name, options, *children) raise NotImplementedError( f"No handler for Expr function node with {name=}" ) # pragma: no cover; polars raises on the rust side for now diff --git a/python/cudf_polars/cudf_polars/testing/asserts.py b/python/cudf_polars/cudf_polars/testing/asserts.py index d37c96a15de..7b6f3848fc4 100644 --- a/python/cudf_polars/cudf_polars/testing/asserts.py +++ b/python/cudf_polars/cudf_polars/testing/asserts.py @@ -5,12 +5,11 @@ from __future__ import annotations -from functools import partial from typing import TYPE_CHECKING +from polars import GPUEngine from polars.testing.asserts import assert_frame_equal -from cudf_polars.callback import execute_with_cudf from cudf_polars.dsl.translate import translate_ir if TYPE_CHECKING: @@ -77,21 +76,13 @@ def assert_gpu_result_equal( NotImplementedError If GPU collection failed in some way. 
""" - if collect_kwargs is None: - collect_kwargs = {} - final_polars_collect_kwargs = collect_kwargs.copy() - final_cudf_collect_kwargs = collect_kwargs.copy() - if polars_collect_kwargs is not None: - final_polars_collect_kwargs.update(polars_collect_kwargs) - if cudf_collect_kwargs is not None: # pragma: no cover - # exclude from coverage since not used ATM - # but this is probably still useful - final_cudf_collect_kwargs.update(cudf_collect_kwargs) - expect = lazydf.collect(**final_polars_collect_kwargs) - got = lazydf.collect( - **final_cudf_collect_kwargs, - post_opt_callback=partial(execute_with_cudf, raise_on_fail=True), + final_polars_collect_kwargs, final_cudf_collect_kwargs = _process_kwargs( + collect_kwargs, polars_collect_kwargs, cudf_collect_kwargs ) + + expect = lazydf.collect(**final_polars_collect_kwargs) + engine = GPUEngine(raise_on_fail=True) + got = lazydf.collect(**final_cudf_collect_kwargs, engine=engine) assert_frame_equal( expect, got, @@ -134,3 +125,98 @@ def assert_ir_translation_raises(q: pl.LazyFrame, *exceptions: type[Exception]) raise AssertionError(f"Translation DID NOT RAISE {exceptions}") from e else: raise AssertionError(f"Translation DID NOT RAISE {exceptions}") + + +def _process_kwargs( + collect_kwargs: dict[OptimizationArgs, bool] | None, + polars_collect_kwargs: dict[OptimizationArgs, bool] | None, + cudf_collect_kwargs: dict[OptimizationArgs, bool] | None, +) -> tuple[dict[OptimizationArgs, bool], dict[OptimizationArgs, bool]]: + if collect_kwargs is None: + collect_kwargs = {} + final_polars_collect_kwargs = collect_kwargs.copy() + final_cudf_collect_kwargs = collect_kwargs.copy() + if polars_collect_kwargs is not None: # pragma: no cover; not currently used + final_polars_collect_kwargs.update(polars_collect_kwargs) + if cudf_collect_kwargs is not None: # pragma: no cover; not currently used + final_cudf_collect_kwargs.update(cudf_collect_kwargs) + return final_polars_collect_kwargs, final_cudf_collect_kwargs + + +def assert_collect_raises( + lazydf: pl.LazyFrame, + *, + polars_except: type[Exception] | tuple[type[Exception], ...], + cudf_except: type[Exception] | tuple[type[Exception], ...], + collect_kwargs: dict[OptimizationArgs, bool] | None = None, + polars_collect_kwargs: dict[OptimizationArgs, bool] | None = None, + cudf_collect_kwargs: dict[OptimizationArgs, bool] | None = None, +): + """ + Assert that collecting the result of a query raises the expected exceptions. + + Parameters + ---------- + lazydf + frame to collect. + collect_kwargs + Common keyword arguments to pass to collect for both polars CPU and + cudf-polars. + Useful for controlling optimization settings. + polars_except + Exception or exceptions polars CPU is expected to raise. If + None, CPU is not expected to raise an exception. + cudf_except + Exception or exceptions polars GPU is expected to raise. If + None, GPU is not expected to raise an exception. + collect_kwargs + Common keyword arguments to pass to collect for both polars CPU and + cudf-polars. + Useful for controlling optimization settings. + polars_collect_kwargs + Keyword arguments to pass to collect for execution on polars CPU. + Overrides kwargs in collect_kwargs. + Useful for controlling optimization settings. + cudf_collect_kwargs + Keyword arguments to pass to collect for execution on cudf-polars. + Overrides kwargs in collect_kwargs. + Useful for controlling optimization settings. + + Returns + ------- + None + If both sides raise the expected exceptions. 
+ + Raises + ------ + AssertionError + If either side did not raise the expected exceptions. + """ + final_polars_collect_kwargs, final_cudf_collect_kwargs = _process_kwargs( + collect_kwargs, polars_collect_kwargs, cudf_collect_kwargs + ) + + try: + lazydf.collect(**final_polars_collect_kwargs) + except polars_except: + pass + except Exception as e: + raise AssertionError( + f"CPU execution RAISED {type(e)}, EXPECTED {polars_except}" + ) from e + else: + if polars_except != (): + raise AssertionError(f"CPU execution DID NOT RAISE {polars_except}") + + engine = GPUEngine(raise_on_fail=True) + try: + lazydf.collect(**final_cudf_collect_kwargs, engine=engine) + except cudf_except: + pass + except Exception as e: + raise AssertionError( + f"GPU execution RAISED {type(e)}, EXPECTED {cudf_except}" + ) from e + else: + if cudf_except != (): + raise AssertionError(f"GPU execution DID NOT RAISE {cudf_except}") diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py new file mode 100644 index 00000000000..05b76d76808 --- /dev/null +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -0,0 +1,158 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""Plugin for running polars test suite setting GPU engine as default.""" + +from __future__ import annotations + +from functools import partialmethod +from typing import TYPE_CHECKING + +import pytest + +import polars + +if TYPE_CHECKING: + from collections.abc import Mapping + + +def pytest_addoption(parser: pytest.Parser): + """Add plugin-specific options.""" + group = parser.getgroup( + "cudf-polars", "Plugin to set GPU as default engine for polars tests" + ) + group.addoption( + "--cudf-polars-no-fallback", + action="store_true", + help="Turn off fallback to CPU when running tests (default use fallback)", + ) + + +def pytest_configure(config: pytest.Config): + """Enable use of this module as a pytest plugin to enable GPU collection.""" + no_fallback = config.getoption("--cudf-polars-no-fallback") + collect = polars.LazyFrame.collect + engine = polars.GPUEngine(raise_on_fail=no_fallback) + polars.LazyFrame.collect = partialmethod(collect, engine=engine) + config.addinivalue_line( + "filterwarnings", + "ignore:.*GPU engine does not support streaming or background collection", + ) + config.addinivalue_line( + "filterwarnings", + "ignore:.*Query execution with GPU not supported", + ) + + +EXPECTED_FAILURES: Mapping[str, str] = { + "tests/unit/io/test_csv.py::test_compressed_csv": "Need to determine if file is compressed", + "tests/unit/io/test_csv.py::test_read_csv_only_loads_selected_columns": "Memory usage won't be correct due to GPU", + "tests/unit/io/test_lazy_count_star.py::test_count_compressed_csv_18057": "Need to determine if file is compressed", + "tests/unit/io/test_lazy_csv.py::test_scan_csv_slice_offset_zero": "Integer overflow in sliced read", + "tests/unit/io/test_lazy_parquet.py::test_dsl2ir_cached_metadata[False]": "cudf-polars doesn't use metadata read by rust preprocessing", + "tests/unit/io/test_lazy_parquet.py::test_parquet_is_in_statistics": "Debug output on stderr doesn't match", + "tests/unit/io/test_lazy_parquet.py::test_parquet_statistics": "Debug output on stderr doesn't match", + "tests/unit/io/test_lazy_parquet.py::test_parquet_different_schema[False]": "Needs cudf#16394", + "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_mismatch_panic_17067[False]": "Needs cudf#16394", + 
"tests/unit/io/test_lazy_parquet.py::test_parquet_slice_pushdown_non_zero_offset[False]": "Thrift data not handled correctly/slice pushdown wrong?", + "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read[False]": "Incomplete handling of projected reads with mismatching schemas, cudf#16394", + "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read_dtype_mismatch[False]": "Different exception raised, but correctly raises an exception", + "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read_missing_cols_from_first[False]": "Different exception raised, but correctly raises an exception", + "tests/unit/io/test_parquet.py::test_read_parquet_only_loads_selected_columns_15098": "Memory usage won't be correct due to GPU", + "tests/unit/io/test_scan.py::test_scan[single-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_limit[single-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_filter[single-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_filter_and_limit[single-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_limit_and_filter[single-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_and_limit[single-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_and_filter[single-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_limit_and_filter[single-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan[glob-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_limit[glob-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_filter[glob-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_filter_and_limit[glob-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_limit_and_filter[glob-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_and_limit[glob-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_and_filter[glob-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_limit_and_filter[glob-csv-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_limit[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_filter[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_filter_and_limit[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_limit_and_filter[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_and_limit[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_and_filter[glob-parquet-async]": "Debug output on stderr doesn't match", + 
"tests/unit/io/test_scan.py::test_scan_with_row_index_limit_and_filter[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_projected_out[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_filter_and_limit[glob-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_limit[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_filter[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_filter_and_limit[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_limit_and_filter[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_and_limit[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_and_filter[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_limit_and_filter[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_projected_out[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_with_row_index_filter_and_limit[single-parquet-async]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_parquet-write_parquet]": "Need to add include_file_path to IR", + "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_csv-write_csv]": "Need to add include_file_path to IR", + "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_ndjson-write_ndjson]": "Need to add include_file_path to IR", + "tests/unit/lazyframe/test_engine_selection.py::test_engine_import_error_raises[gpu]": "Expect this to pass because cudf-polars is installed", + "tests/unit/lazyframe/test_engine_selection.py::test_engine_import_error_raises[engine1]": "Expect this to pass because cudf-polars is installed", + "tests/unit/lazyframe/test_lazyframe.py::test_round[dtype1-123.55-1-123.6]": "Rounding midpoints is handled incorrectly", + "tests/unit/lazyframe/test_lazyframe.py::test_cast_frame": "Casting that raises not supported on GPU", + "tests/unit/lazyframe/test_lazyframe.py::test_lazy_cache_hit": "Debug output on stderr doesn't match", + "tests/unit/operations/aggregation/test_aggregations.py::test_duration_function_literal": "Broadcasting inside groupby-agg not supported", + "tests/unit/operations/aggregation/test_aggregations.py::test_sum_empty_and_null_set": "libcudf sums column of all nulls to null, not zero", + "tests/unit/operations/aggregation/test_aggregations.py::test_binary_op_agg_context_no_simplify_expr_12423": "groupby-agg of just literals should not produce collect_list", + "tests/unit/operations/aggregation/test_aggregations.py::test_nan_inf_aggregation": "treatment of nans and nulls together is different in libcudf and polars in groupby-agg context", + "tests/unit/operations/test_abs.py::test_abs_duration": "Need to raise for unsupported uops on timelike values", + "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input7-expected7-Float32-Float32]": 
"Mismatching dtypes, needs cudf#15852", + "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input10-expected10-Date-output_dtype10]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input11-expected11-input_dtype11-output_dtype11]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input12-expected12-input_dtype12-output_dtype12]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input13-expected13-input_dtype13-output_dtype13]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input7-expected7-Float32-Float32]": "Mismatching dtypes, needs cudf#15852", + "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input10-expected10-Date-output_dtype10]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input11-expected11-input_dtype11-output_dtype11]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input12-expected12-input_dtype12-output_dtype12]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input13-expected13-input_dtype13-output_dtype13]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input14-expected14-input_dtype14-output_dtype14]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input15-expected15-input_dtype15-output_dtype15]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input16-expected16-input_dtype16-output_dtype16]": "Unsupported groupby-agg for a particular dtype", + "tests/unit/operations/test_group_by.py::test_group_by_binary_agg_with_literal": "Incorrect broadcasting of literals in groupby-agg", + "tests/unit/operations/test_group_by.py::test_aggregated_scalar_elementwise_15602": "Unsupported boolean function/dtype combination in groupby-agg", + "tests/unit/operations/test_group_by.py::test_schemas[data1-expr1-expected_select1-expected_gb1]": "Mismatching dtypes, needs cudf#15852", + "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_by_monday_and_offset_5444": "IR needs to expose groupby-dynamic information", + "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_label[left-expected0]": "IR needs to expose groupby-dynamic information", + "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_label[right-expected1]": "IR needs to expose groupby-dynamic information", + "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_label[datapoint-expected2]": "IR needs to expose groupby-dynamic information", + "tests/unit/operations/test_group_by_dynamic.py::test_rolling_dynamic_sortedness_check": "IR needs to expose groupby-dynamic information", + "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_validation": "IR needs to expose groupby-dynamic information", + "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_15225": "IR needs to expose groupby-dynamic information", + 
"tests/unit/operations/test_join.py::test_cross_join_slice_pushdown": "Need to implement slice pushdown for cross joins", + "tests/unit/sql/test_cast.py::test_cast_errors[values0-values::uint8-conversion from `f64` to `u64` failed]": "Casting that raises not supported on GPU", + "tests/unit/sql/test_cast.py::test_cast_errors[values1-values::uint4-conversion from `i64` to `u32` failed]": "Casting that raises not supported on GPU", + "tests/unit/sql/test_cast.py::test_cast_errors[values2-values::int1-conversion from `i64` to `i8` failed]": "Casting that raises not supported on GPU", + "tests/unit/sql/test_miscellaneous.py::test_read_csv": "Incorrect handling of missing_is_null in read_csv", + "tests/unit/sql/test_wildcard_opts.py::test_select_wildcard_errors": "Raises correctly but with different exception", + "tests/unit/streaming/test_streaming_io.py::test_parquet_eq_statistics": "Debug output on stderr doesn't match", + "tests/unit/test_cse.py::test_cse_predicate_self_join": "Debug output on stderr doesn't match", + "tests/unit/test_empty.py::test_empty_9137": "Mismatching dtypes, needs cudf#15852", + # Maybe flaky, order-dependent? + "tests/unit/test_projections.py::test_schema_full_outer_join_projection_pd_13287": "Order-specific result check, query is correct but in different order", + "tests/unit/test_queries.py::test_group_by_agg_equals_zero_3535": "libcudf sums all nulls to null, not zero", +} + + +def pytest_collection_modifyitems( + session: pytest.Session, config: pytest.Config, items: list[pytest.Item] +): + """Mark known failing tests.""" + if config.getoption("--cudf-polars-no-fallback"): + # Don't xfail tests if running without fallback + return + for item in items: + if item.nodeid in EXPECTED_FAILURES: + item.add_marker(pytest.mark.xfail(reason=EXPECTED_FAILURES[item.nodeid])) diff --git a/python/cudf_polars/cudf_polars/typing/__init__.py b/python/cudf_polars/cudf_polars/typing/__init__.py index adab10bdded..240b11bdf59 100644 --- a/python/cudf_polars/cudf_polars/typing/__init__.py +++ b/python/cudf_polars/cudf_polars/typing/__init__.py @@ -84,6 +84,10 @@ def view_expression(self, n: int) -> Expr: """Convert the given expression to python rep.""" ... + def version(self) -> tuple[int, int]: + """The IR version as `(major, minor)`.""" + ... + def set_udf( self, callback: Callable[[list[str] | None, str | None, int | None], pl.DataFrame], diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index 7f6ea1edfd9..4154a404e98 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -13,7 +13,7 @@ import polars as pl -__all__ = ["from_polars", "downcast_arrow_lists"] +__all__ = ["from_polars", "downcast_arrow_lists", "can_cast"] def downcast_arrow_lists(typ: pa.DataType) -> pa.DataType: @@ -45,6 +45,28 @@ def downcast_arrow_lists(typ: pa.DataType) -> pa.DataType: return typ +def can_cast(from_: plc.DataType, to: plc.DataType) -> bool: + """ + Can we cast (via :func:`~.pylibcudf.unary.cast`) between two datatypes. 
+
+    Parameters
+    ----------
+    from_
+        Source datatype
+    to
+        Target datatype
+
+    Returns
+    -------
+    True if casting is supported, False otherwise
+    """
+    return (
+        plc.traits.is_fixed_width(to)
+        and plc.traits.is_fixed_width(from_)
+        and plc.unary.is_supported_cast(from_, to)
+    )
+
+
@cache
def from_polars(dtype: pl.DataType) -> plc.DataType:
    """
diff --git a/python/cudf_polars/cudf_polars/utils/versions.py b/python/cudf_polars/cudf_polars/utils/versions.py
index 9807cffb384..4a7ad6b3cf2 100644
--- a/python/cudf_polars/cudf_polars/utils/versions.py
+++ b/python/cudf_polars/cudf_polars/utils/versions.py
@@ -12,18 +12,11 @@ POLARS_VERSION = parse(__version__)
-POLARS_VERSION_GE_10 = POLARS_VERSION >= parse("1.0")
-POLARS_VERSION_GE_11 = POLARS_VERSION >= parse("1.1")
-POLARS_VERSION_GE_12 = POLARS_VERSION >= parse("1.2")
-POLARS_VERSION_GE_121 = POLARS_VERSION >= parse("1.2.1")
-POLARS_VERSION_GT_10 = POLARS_VERSION > parse("1.0")
-POLARS_VERSION_GT_11 = POLARS_VERSION > parse("1.1")
-POLARS_VERSION_GT_12 = POLARS_VERSION > parse("1.2")
-
-POLARS_VERSION_LE_12 = POLARS_VERSION <= parse("1.2")
-POLARS_VERSION_LE_11 = POLARS_VERSION <= parse("1.1")
-POLARS_VERSION_LT_12 = POLARS_VERSION < parse("1.2")
-POLARS_VERSION_LT_11 = POLARS_VERSION < parse("1.1")
-
-if POLARS_VERSION < parse("1.0"):  # pragma: no cover
-    raise ImportError("cudf_polars requires py-polars v1.0 or greater.")
+POLARS_VERSION_LT_18 = POLARS_VERSION < parse("1.8")
+
+
+def _ensure_polars_version():
+    if POLARS_VERSION_LT_18:
+        raise ImportError(
+            "cudf_polars requires py-polars v1.8 or greater."
+        )  # pragma: no cover
diff --git a/python/cudf_polars/docs/overview.md b/python/cudf_polars/docs/overview.md
index 6cd36136bf8..103ac1a674e 100644
--- a/python/cudf_polars/docs/overview.md
+++ b/python/cudf_polars/docs/overview.md
@@ -15,8 +15,10 @@ You will need:
## Installing polars
-We will need to build polars from source. Until things settle down,
-live at `HEAD`.
+`cudf-polars` works with polars >= 1.8, as long as the internal IR
+version doesn't get a major version bump. So `pip install polars>=1.8`
+should work. For development, if we're adding things on the polars
+side, we will need to build polars from source:
```sh
git clone https://github.com/pola-rs/polars
@@ -59,7 +61,7 @@ The executor for the polars logical plan lives in the cudf repo, in
```sh
cd cudf/python/cudf_polars
-uv pip install --no-build-isolation --no-deps -e .
+pip install --no-build-isolation --no-deps -e .
```
You should now be able to run the tests in the `cudf_polars` package:
@@ -69,16 +71,18 @@ pytest -v tests
# Executor design
-The polars `LazyFrame.collect` functionality offers a
-"post-optimization" callback that may be used by a third party library
-to replace a node (or more, though we only replace a single node) in the
-optimized logical plan with a Python callback that is to deliver the
-result of evaluating the plan. This splits the execution of the plan
-into two phases. First, a symbolic phase which translates to our
-internal representation (IR). Second, an execution phase which executes
-using our IR.
-
-The translation phase receives the a low-level Rust `NodeTraverse`
+The polars `LazyFrame.collect` functionality offers configuration of
+the engine to use for collection through the `engine` argument.
At a
+low level, this provides for configuration of a "post-optimization"
+callback that may be used by a third party library to replace a node
+(or more, though we only replace a single node) in the optimized
+logical plan with a Python callback that is to deliver the result of
+evaluating the plan. This splits the execution of the plan into two
+phases. First, a symbolic phase which translates to our internal
+representation (IR). Second, an execution phase which executes using
+our IR.
+
+The translation phase receives a low-level Rust `NodeTraverser`
object which delivers Python representations of the plan nodes (and
expressions) one at a time. During translation, we endeavour to raise
`NotImplementedError` for any unsupported functionality. This way, if
@@ -86,33 +90,60 @@ we can't execute something, we just don't modify the logical plan at
all: if we can translate the IR, it is assumed that evaluation will
later succeed.
-The usage of the cudf-based executor is therefore, at present:
+The usage of the cudf-based executor is therefore selected with the
+gpu engine:
```python
-from cudf_polars.callback import execute_with_cudf
+import polars as pl
-result = q.collect(post_opt_callback=execute_with_cudf)
+result = q.collect(engine="gpu")
```
This should either transparently run on the GPU and deliver a polars
dataframe, or else fail (but be handled) and just run the normal CPU
-execution.
+execution. If `POLARS_VERBOSE` is true, then fallback is logged with a
+`PerformanceWarning`.
-If you want to fail during translation, set the keyword argument
-`raise_on_fail` to `True`:
+As well as a string argument, the engine can also be specified with a
+polars `GPUEngine` object. This allows passing more configuration in.
+Currently, the public properties are `device`, to select the device,
+and `memory_resource`, to select the RMM memory resource used for
+allocations during the collection phase.
+For example:
```python
-from functools import partial
-from cudf_polars.callback import execute_with_cudf
+import polars as pl
-result = q.collect(
-    post_opt_callback=partial(execute_with_cudf, raise_on_fail=True)
-)
+result = q.collect(engine=pl.GPUEngine(device=1, memory_resource=mr))
+```
+
+This uses device 1 and the given memory resource. Note that the memory
+resource provided _must_ be valid for allocations on the specified
+device; no checking is performed.
+
+For debugging purposes, we can also pass undocumented keyword
+arguments. At the moment, `raise_on_fail` is supported, which
+raises during translation rather than falling back:
+
+```python
+
+result = q.collect(engine=pl.GPUEngine(raise_on_fail=True))
```
This is mostly useful when writing tests, since in that case we want
any failures to propagate, rather than falling back to the CPU mode.
+## IR versioning
+
+On the polars side, the `NodeTraverser` object advertises an internal
+version (via `NodeTraverser.version()` as a `(major, minor)` tuple).
+`minor` version bumps are for backwards compatible changes (e.g.
+exposing new nodes), whereas `major` bumps are for incompatible
+changes. We can therefore attempt to detect the IR version
+(independently of the polars version) and dispatch, or error
+appropriately. This should be done during IR translation in
+`translate.py`.
+
## Adding a handler for a new plan node
Plan node definitions live in `cudf_polars/dsl/ir.py`, these are
@@ -175,7 +206,7 @@ around their pylibcudf counterparts. We have four (in
1. `Scalar` (a wrapper around a pylibcudf `Scalar`)
2.
`Column` (a wrapper around a pylibcudf `Column`) -3. `NamedColumn` a `Column` with an additional name +3. `NamedColumn` (a `Column` with an additional name) 4. `DataFrame` (a wrapper around a pylibcudf `Table`) The interfaces offered by these are somewhat in flux, but broadly diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index 984b5487b98..268364f72a7 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -19,7 +19,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.10" dependencies = [ - "polars>=1.0,<1.3", + "polars>=1.8,<1.9", "pylibcudf==24.10.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ @@ -58,6 +58,9 @@ exclude_also = [ "class .*\\bProtocol\\):", "assert_never\\(" ] +# The cudf_polars test suite doesn't exercise the plugin, so we omit +# it from coverage checks. +omit = ["cudf_polars/testing/plugin.py"] [tool.ruff] line-length = 88 diff --git a/python/cudf_polars/tests/containers/test_dataframe.py b/python/cudf_polars/tests/containers/test_dataframe.py index 6b470268084..39fb44d55a5 100644 --- a/python/cudf_polars/tests/containers/test_dataframe.py +++ b/python/cudf_polars/tests/containers/test_dataframe.py @@ -9,6 +9,7 @@ import polars as pl from cudf_polars.containers import DataFrame, NamedColumn +from cudf_polars.testing.asserts import assert_gpu_result_equal def test_select_missing_raises(): @@ -140,3 +141,13 @@ def test_sorted_flags_preserved(with_nulls, nulls_last): assert b.null_order == b_null_order assert c.is_sorted == plc.types.Sorted.NO assert df.flags == gf.to_polars().flags + + +def test_empty_name_roundtrips_overlap(): + df = pl.LazyFrame({"": [1, 2, 3], "column_0": [4, 5, 6]}) + assert_gpu_result_equal(df) + + +def test_empty_name_roundtrips_no_overlap(): + df = pl.LazyFrame({"": [1, 2, 3], "b": [4, 5, 6]}) + assert_gpu_result_equal(df) diff --git a/python/cudf_polars/tests/expressions/test_agg.py b/python/cudf_polars/tests/expressions/test_agg.py index 245bde3acab..56055f4c6c2 100644 --- a/python/cudf_polars/tests/expressions/test_agg.py +++ b/python/cudf_polars/tests/expressions/test_agg.py @@ -7,15 +7,38 @@ import polars as pl from cudf_polars.dsl import expr -from cudf_polars.testing.asserts import assert_gpu_result_equal +from cudf_polars.testing.asserts import ( + assert_gpu_result_equal, + assert_ir_translation_raises, +) -@pytest.fixture(params=sorted(expr.Agg._SUPPORTED)) +@pytest.fixture( + params=[ + # regular aggs from Agg + "min", + "max", + "median", + "n_unique", + "first", + "last", + "mean", + "sum", + "count", + "std", + "var", + # scan aggs from UnaryFunction + "cum_min", + "cum_max", + "cum_prod", + "cum_sum", + ] +) def agg(request): return request.param -@pytest.fixture(params=[pl.Int32, pl.Float32, pl.Int16]) +@pytest.fixture(params=[pl.Int32, pl.Float32, pl.Int16, pl.Int8, pl.UInt16]) def dtype(request): return request.param @@ -34,6 +57,11 @@ def df(dtype, with_nulls, is_sorted): if is_sorted: values = sorted(values, key=lambda x: -1000 if x is None else x) + if dtype.is_unsigned_integer(): + values = pl.Series(values).abs() + if is_sorted: + values = values.sort() + df = pl.LazyFrame({"a": values}, schema={"a": dtype}) if is_sorted: return df.set_sorted("a") @@ -52,6 +80,51 @@ def test_agg(df, agg): assert_gpu_result_equal(q, check_dtypes=check_dtypes, check_exact=False) +def test_bool_agg(agg, request): + if 
agg == "cum_min" or agg == "cum_max": + pytest.skip("Does not apply") + request.applymarker( + pytest.mark.xfail( + condition=agg == "n_unique", + reason="Wrong dtype we get Int32, polars gets UInt32", + ) + ) + df = pl.LazyFrame({"a": [True, False, None, True]}) + expr = getattr(pl.col("a"), agg)() + q = df.select(expr) + + assert_gpu_result_equal(q) + + +@pytest.mark.parametrize("cum_agg", expr.UnaryFunction._supported_cum_aggs) +def test_cum_agg_reverse_unsupported(cum_agg): + df = pl.LazyFrame({"a": [1, 2, 3]}) + expr = getattr(pl.col("a"), cum_agg)(reverse=True) + q = df.select(expr) + + assert_ir_translation_raises(q, NotImplementedError) + + +@pytest.mark.parametrize("q", [0.5, pl.lit(0.5)]) +@pytest.mark.parametrize("interp", ["nearest", "higher", "lower", "midpoint", "linear"]) +def test_quantile(df, q, interp): + expr = pl.col("a").quantile(q, interp) + q = df.select(expr) + + # https://github.com/rapidsai/cudf/issues/15852 + check_dtypes = q.collect_schema()["a"] == pl.Float64 + if not check_dtypes: + with pytest.raises(AssertionError): + assert_gpu_result_equal(q) + assert_gpu_result_equal(q, check_dtypes=check_dtypes, check_exact=False) + + +def test_quantile_invalid_q(df): + expr = pl.col("a").quantile(pl.col("a")) + q = df.select(expr) + assert_ir_translation_raises(q, NotImplementedError) + + @pytest.mark.parametrize( "op", [pl.Expr.min, pl.Expr.nan_min, pl.Expr.max, pl.Expr.nan_max] ) diff --git a/python/cudf_polars/tests/expressions/test_booleanfunction.py b/python/cudf_polars/tests/expressions/test_booleanfunction.py index 97421008669..2347021c40e 100644 --- a/python/cudf_polars/tests/expressions/test_booleanfunction.py +++ b/python/cudf_polars/tests/expressions/test_booleanfunction.py @@ -17,15 +17,11 @@ def has_nulls(request): return request.param -@pytest.mark.parametrize( - "ignore_nulls", - [ - pytest.param( - False, marks=pytest.mark.xfail(reason="No support for Kleene logic") - ), - True, - ], -) +@pytest.fixture(params=[False, True], ids=["include_nulls", "ignore_nulls"]) +def ignore_nulls(request): + return request.param + + def test_booleanfunction_reduction(ignore_nulls): ldf = pl.LazyFrame( { @@ -43,6 +39,25 @@ def test_booleanfunction_reduction(ignore_nulls): assert_gpu_result_equal(query) +@pytest.mark.parametrize("expr", [pl.Expr.any, pl.Expr.all]) +def test_booleanfunction_all_any_kleene(expr, ignore_nulls): + ldf = pl.LazyFrame( + { + "a": [False, None], + "b": [False, False], + "c": [False, True], + "d": [None, False], + "e": pl.Series([None, None], dtype=pl.Boolean()), + "f": [None, True], + "g": [True, False], + "h": [True, None], + "i": [True, True], + } + ) + q = ldf.select(expr(pl.col("*"), ignore_nulls=ignore_nulls)) + assert_gpu_result_equal(q) + + @pytest.mark.parametrize( "expr", [ @@ -54,14 +69,7 @@ def test_booleanfunction_reduction(ignore_nulls): ids=lambda f: f"{f.__name__}()", ) @pytest.mark.parametrize("has_nans", [False, True], ids=["no_nans", "nans"]) -def test_boolean_function_unary(request, expr, has_nans, has_nulls): - if has_nulls and expr in (pl.Expr.is_nan, pl.Expr.is_not_nan): - request.applymarker( - pytest.mark.xfail( - reason="Need to copy null mask since is_{not_}nan(null) => null" - ) - ) - +def test_boolean_function_unary(expr, has_nans, has_nulls): values: list[float | None] = [1, 2, 3, 4, 5] if has_nans: values[3] = float("nan") @@ -119,9 +127,7 @@ def test_boolean_isbetween(closed, bounds): "expr", [pl.any_horizontal("*"), pl.all_horizontal("*")], ids=["any", "all"] ) @pytest.mark.parametrize("wide", [False, 
True], ids=["narrow", "wide"]) -def test_boolean_horizontal(request, expr, has_nulls, wide): - if has_nulls: - request.applymarker(pytest.mark.xfail(reason="No support for Kleene logic")) +def test_boolean_horizontal(expr, has_nulls, wide): ldf = pl.LazyFrame( { "a": [False, False, False, False, False, True], @@ -164,6 +170,18 @@ def test_boolean_is_in(expr): assert_gpu_result_equal(q) +@pytest.mark.parametrize("expr", [pl.Expr.and_, pl.Expr.or_, pl.Expr.xor]) +def test_boolean_kleene_logic(expr): + ldf = pl.LazyFrame( + { + "a": [False, False, False, None, None, None, True, True, True], + "b": [False, None, True, False, None, True, False, None, True], + } + ) + q = ldf.select(expr(pl.col("a"), pl.col("b"))) + assert_gpu_result_equal(q) + + def test_boolean_is_in_raises_unsupported(): ldf = pl.LazyFrame({"a": pl.Series([1, 2, 3], dtype=pl.Int64)}) q = ldf.select(pl.col("a").is_in(pl.lit(1, dtype=pl.Int32()))) diff --git a/python/cudf_polars/tests/expressions/test_datetime_basic.py b/python/cudf_polars/tests/expressions/test_datetime_basic.py index 218101bf87c..c6ea29ddd38 100644 --- a/python/cudf_polars/tests/expressions/test_datetime_basic.py +++ b/python/cudf_polars/tests/expressions/test_datetime_basic.py @@ -9,7 +9,11 @@ import polars as pl -from cudf_polars.testing.asserts import assert_gpu_result_equal +from cudf_polars.dsl.expr import TemporalFunction +from cudf_polars.testing.asserts import ( + assert_gpu_result_equal, + assert_ir_translation_raises, +) @pytest.mark.parametrize( @@ -37,26 +41,97 @@ def test_datetime_dataframe_scan(dtype): assert_gpu_result_equal(query) +datetime_extract_fields = [ + "year", + "month", + "day", + "weekday", + "hour", + "minute", + "second", + "millisecond", + "microsecond", + "nanosecond", +] + + +@pytest.fixture( + ids=datetime_extract_fields, + params=[methodcaller(f) for f in datetime_extract_fields], +) +def field(request): + return request.param + + +def test_datetime_extract(field): + ldf = pl.LazyFrame( + { + "datetimes": pl.datetime_range( + datetime.datetime(2020, 1, 1), + datetime.datetime(2021, 12, 30), + "3mo14h15s11ms33us999ns", + eager=True, + ) + } + ) + + q = ldf.select(field(pl.col("datetimes").dt)) + + assert_gpu_result_equal(q) + + +def test_datetime_extra_unsupported(monkeypatch): + ldf = pl.LazyFrame( + { + "datetimes": pl.datetime_range( + datetime.datetime(2020, 1, 1), + datetime.datetime(2021, 12, 30), + "3mo14h15s11ms33us999ns", + eager=True, + ) + } + ) + + def unsupported_name_setter(self, value): + pass + + def unsupported_name_getter(self): + return "unsupported" + + monkeypatch.setattr( + TemporalFunction, + "name", + property(unsupported_name_getter, unsupported_name_setter), + ) + + q = ldf.select(pl.col("datetimes").dt.nanosecond()) + + assert_ir_translation_raises(q, NotImplementedError) + + @pytest.mark.parametrize( "field", [ methodcaller("year"), - pytest.param( - methodcaller("day"), - marks=pytest.mark.xfail(reason="day extraction not implemented"), - ), + methodcaller("month"), + methodcaller("day"), + methodcaller("weekday"), ], ) -def test_datetime_extract(field): +def test_date_extract(field): + ldf = pl.LazyFrame( + { + "dates": [ + datetime.date(2024, 1, 1), + datetime.date(2024, 10, 11), + ] + } + ) + ldf = pl.LazyFrame( {"dates": [datetime.date(2024, 1, 1), datetime.date(2024, 10, 11)]} ) - q = ldf.select(field(pl.col("dates").dt)) - with pytest.raises(AssertionError): - # polars produces int32, libcudf produces int16 for the year extraction - # libcudf can lose data here. 
- # https://github.com/rapidsai/cudf/issues/16196 - assert_gpu_result_equal(q) + q = ldf.select(field(pl.col("dates").dt)) - assert_gpu_result_equal(q, check_dtypes=False) + assert_gpu_result_equal(q) diff --git a/python/cudf_polars/tests/expressions/test_gather.py b/python/cudf_polars/tests/expressions/test_gather.py index 6bffa3e252c..f7c5d1bf2cd 100644 --- a/python/cudf_polars/tests/expressions/test_gather.py +++ b/python/cudf_polars/tests/expressions/test_gather.py @@ -6,7 +6,6 @@ import polars as pl -from cudf_polars import execute_with_cudf from cudf_polars.testing.asserts import assert_gpu_result_equal @@ -47,4 +46,4 @@ def test_gather_out_of_bounds(negative): query = ldf.select(pl.col("a").gather(pl.col("b"))) with pytest.raises(pl.exceptions.ComputeError): - query.collect(post_opt_callback=execute_with_cudf) + query.collect(engine="gpu") diff --git a/python/cudf_polars/tests/expressions/test_numeric_unaryops.py b/python/cudf_polars/tests/expressions/test_numeric_unaryops.py new file mode 100644 index 00000000000..ac3aecf88e6 --- /dev/null +++ b/python/cudf_polars/tests/expressions/test_numeric_unaryops.py @@ -0,0 +1,91 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import numpy as np +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.fixture( + params=[ + "sin", + "cos", + "tan", + "arcsin", + "arccos", + "arctan", + "sinh", + "cosh", + "tanh", + "arcsinh", + "arccosh", + "arctanh", + "exp", + "sqrt", + "cbrt", + "ceil", + "floor", + "abs", + ] +) +def op(request): + return request.param + + +@pytest.fixture(params=[pl.Int32, pl.Float32]) +def dtype(request): + return request.param + + +@pytest.fixture +def ldf(with_nulls, dtype): + values = [1, 2, 4, 5, -2, -4, 0] + if with_nulls: + values.append(None) + if dtype == pl.Float32: + values.append(-float("inf")) + values.append(float("nan")) + values.append(float("inf")) + elif dtype == pl.Int32: + iinfo = np.iinfo("int32") + values.append(iinfo.min) + values.append(iinfo.max) + return pl.LazyFrame( + { + "a": pl.Series(values, dtype=dtype), + "b": pl.Series([i - 4 for i in range(len(values))], dtype=pl.Float32), + } + ) + + +def test_unary(ldf, op): + expr = getattr(pl.col("a"), op)() + q = ldf.select(expr) + assert_gpu_result_equal(q, check_exact=False) + + +@pytest.mark.parametrize("base_literal", [False, True]) +@pytest.mark.parametrize("exponent_literal", [False, True]) +def test_pow(ldf, base_literal, exponent_literal): + base = pl.lit(2) if base_literal else pl.col("a") + exponent = pl.lit(-3, dtype=pl.Float32) if exponent_literal else pl.col("b") + + q = ldf.select(base.pow(exponent)) + + assert_gpu_result_equal(q, check_exact=False) + + +@pytest.mark.parametrize("natural", [True, False]) +def test_log(ldf, natural): + if natural: + expr = pl.col("a").log() + else: + expr = pl.col("a").log(10) + + q = ldf.select(expr) + + assert_gpu_result_equal(q, check_exact=False) diff --git a/python/cudf_polars/tests/expressions/test_stringfunction.py b/python/cudf_polars/tests/expressions/test_stringfunction.py index df08e15baa4..4f6850ac977 100644 --- a/python/cudf_polars/tests/expressions/test_stringfunction.py +++ b/python/cudf_polars/tests/expressions/test_stringfunction.py @@ -10,6 +10,7 @@ from cudf_polars import execute_with_cudf from cudf_polars.testing.asserts import ( + assert_collect_raises, assert_gpu_result_equal, 
assert_ir_translation_raises, ) @@ -152,3 +153,187 @@ def test_slice_column(slice_column_data): else: query = slice_column_data.select(pl.col("a").str.slice(pl.col("start"))) assert_ir_translation_raises(query, NotImplementedError) + + +@pytest.fixture +def to_datetime_data(): + return pl.LazyFrame( + { + "a": [ + "2021-01-01", + "2021-01-02", + "abcd", + ] + } + ) + + +@pytest.mark.parametrize("cache", [True, False], ids=lambda cache: f"{cache=}") +@pytest.mark.parametrize("strict", [True, False], ids=lambda strict: f"{strict=}") +@pytest.mark.parametrize("exact", [True, False], ids=lambda exact: f"{exact=}") +@pytest.mark.parametrize("format", ["%Y-%m-%d", None], ids=lambda format: f"{format=}") +def test_to_datetime(to_datetime_data, cache, strict, format, exact): + query = to_datetime_data.select( + pl.col("a").str.strptime( + pl.Datetime("ns"), format=format, cache=cache, strict=strict, exact=exact + ) + ) + if cache or format is None or not exact: + assert_ir_translation_raises(query, NotImplementedError) + elif strict: + assert_collect_raises( + query, + polars_except=pl.exceptions.InvalidOperationError, + cudf_except=pl.exceptions.ComputeError, + ) + else: + assert_gpu_result_equal(query) + + +@pytest.mark.parametrize( + "target, repl", + [("a", "a"), ("Wı", "☺"), ("FG", ""), ("doesnotexist", "blahblah")], # noqa: RUF001 +) +@pytest.mark.parametrize("n", [0, 3, -1]) +def test_replace_literal(ldf, target, repl, n): + query = ldf.select(pl.col("a").str.replace(target, repl, literal=True, n=n)) + assert_gpu_result_equal(query) + + +@pytest.mark.parametrize("target, repl", [("", ""), ("a", pl.col("a"))]) +def test_replace_literal_unsupported(ldf, target, repl): + query = ldf.select(pl.col("a").str.replace(target, repl, literal=True)) + assert_ir_translation_raises(query, NotImplementedError) + + +def test_replace_re(ldf): + query = ldf.select(pl.col("a").str.replace("A", "a", literal=False)) + assert_ir_translation_raises(query, NotImplementedError) + + +@pytest.mark.parametrize( + "target,repl", + [ + (["A", "de", "kLm", "awef"], "a"), + (["A", "de", "kLm", "awef"], ""), + (["A", "de", "kLm", "awef"], ["a", "b", "c", "d"]), + (["A", "de", "kLm", "awef"], ["a", "b", "c", ""]), + ( + pl.lit(pl.Series(["A", "de", "kLm", "awef"])), + pl.lit(pl.Series(["a", "b", "c", "d"])), + ), + ], +) +def test_replace_many(ldf, target, repl): + query = ldf.select(pl.col("a").str.replace_many(target, repl)) + + assert_gpu_result_equal(query) + + +@pytest.mark.parametrize( + "target,repl", + [(["A", ""], ["a", "b"]), (pl.col("a").drop_nulls(), pl.col("a").drop_nulls())], +) +def test_replace_many_notimplemented(ldf, target, repl): + query = ldf.select(pl.col("a").str.replace_many(target, repl)) + assert_ir_translation_raises(query, NotImplementedError) + + +def test_replace_many_ascii_case(ldf): + query = ldf.select( + pl.col("a").str.replace_many(["a", "b", "c"], "a", ascii_case_insensitive=True) + ) + + assert_ir_translation_raises(query, NotImplementedError) + + +_strip_data = [ + "AbC", + "123abc", + "", + " ", + None, + "aAaaaAAaa", + " ab c ", + "abc123", + " ", + "\tabc\t", + "\nabc\n", + "\r\nabc\r\n", + "\t\n abc \n\t", + "!@#$%^&*()", + " abc!!! ", + " abc\t\n!!! 
", + "__abc__", + "abc\n\n", + "123abc456", + "abcxyzabc", +] + +strip_chars = [ + "a", + "", + " ", + "\t", + "\n", + "\r\n", + "!", + "@#", + "123", + "xyz", + "abc", + "__", + " \t\n", + "abc123", + None, +] + + +@pytest.fixture +def strip_ldf(): + return pl.DataFrame({"a": _strip_data}).lazy() + + +@pytest.fixture(params=strip_chars) +def to_strip(request): + return request.param + + +def test_strip_chars(strip_ldf, to_strip): + q = strip_ldf.select(pl.col("a").str.strip_chars(to_strip)) + assert_gpu_result_equal(q) + + +def test_strip_chars_start(strip_ldf, to_strip): + q = strip_ldf.select(pl.col("a").str.strip_chars_start(to_strip)) + assert_gpu_result_equal(q) + + +def test_strip_chars_end(strip_ldf, to_strip): + q = strip_ldf.select(pl.col("a").str.strip_chars_end(to_strip)) + assert_gpu_result_equal(q) + + +def test_strip_chars_column(strip_ldf): + q = strip_ldf.select(pl.col("a").str.strip_chars(pl.col("a"))) + assert_ir_translation_raises(q, NotImplementedError) + + +def test_invalid_regex_raises(): + df = pl.LazyFrame({"a": ["abc"]}) + + q = df.select(pl.col("a").str.contains(r"ab)", strict=True)) + + assert_collect_raises( + q, + polars_except=pl.exceptions.ComputeError, + cudf_except=pl.exceptions.ComputeError, + ) + + +@pytest.mark.parametrize("pattern", ["a{1000}", "a(?i:B)"]) +def test_unsupported_regex_raises(pattern): + df = pl.LazyFrame({"a": ["abc"]}) + + q = df.select(pl.col("a").str.contains(pattern, strict=True)) + assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/pytest.ini b/python/cudf_polars/tests/pytest.ini new file mode 100644 index 00000000000..7b0a9f29fb1 --- /dev/null +++ b/python/cudf_polars/tests/pytest.ini @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +[pytest] +addopts = --tb=native diff --git a/python/cudf_polars/tests/test_config.py b/python/cudf_polars/tests/test_config.py index 5b4bba55552..3c3986be19b 100644 --- a/python/cudf_polars/tests/test_config.py +++ b/python/cudf_polars/tests/test_config.py @@ -6,6 +6,9 @@ import pytest import polars as pl +from polars.testing.asserts import assert_frame_equal + +import rmm from cudf_polars.dsl.ir import IR from cudf_polars.testing.asserts import ( @@ -32,3 +35,48 @@ def raise_unimplemented(self): ): # And ensure that collecting issues the correct warning. 
assert_gpu_result_equal(q) + + +def test_unsupported_config_raises(): + q = pl.LazyFrame({}) + + with pytest.raises(pl.exceptions.ComputeError): + q.collect(engine=pl.GPUEngine(unknown_key=True)) + + +@pytest.mark.parametrize("device", [-1, "foo"]) +def test_invalid_device_raises(device): + q = pl.LazyFrame({}) + with pytest.raises(pl.exceptions.ComputeError): + q.collect(engine=pl.GPUEngine(device=device)) + + +@pytest.mark.parametrize("mr", [1, object()]) +def test_invalid_memory_resource_raises(mr): + q = pl.LazyFrame({}) + with pytest.raises(pl.exceptions.ComputeError): + q.collect(engine=pl.GPUEngine(memory_resource=mr)) + + +def test_explicit_device_zero(): + q = pl.LazyFrame({"a": [1, 2, 3]}) + + result = q.collect(engine=pl.GPUEngine(device=0)) + assert_frame_equal(q.collect(), result) + + +def test_explicit_memory_resource(): + upstream = rmm.mr.CudaMemoryResource() + n_allocations = 0 + + def allocate(bytes, stream): + nonlocal n_allocations + n_allocations += 1 + return upstream.allocate(bytes, stream) + + mr = rmm.mr.CallbackMemoryResource(allocate, upstream.deallocate) + + q = pl.LazyFrame({"a": [1, 2, 3]}) + result = q.collect(engine=pl.GPUEngine(memory_resource=mr)) + assert_frame_equal(q.collect(), result) + assert n_allocations > 0 diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py index a75825ef3d3..74bf8b9e4e2 100644 --- a/python/cudf_polars/tests/test_groupby.py +++ b/python/cudf_polars/tests/test_groupby.py @@ -12,7 +12,6 @@ assert_gpu_result_equal, assert_ir_translation_raises, ) -from cudf_polars.utils import versions @pytest.fixture @@ -31,6 +30,7 @@ def df(): params=[ [pl.col("key1")], [pl.col("key2")], + [pl.col("key1"), pl.lit(1)], [pl.col("key1") * pl.col("key2")], [pl.col("key1"), pl.col("key2")], [pl.col("key1") == pl.col("key2")], @@ -52,6 +52,7 @@ def keys(request): [(pl.col("float") - pl.lit(2)).max()], [pl.col("float").sum().round(decimals=1)], [pl.col("float").round(decimals=1).sum()], + [pl.col("int").first(), pl.col("float").last()], ], ids=lambda aggs: "-".join(map(str, aggs)), ) @@ -60,15 +61,7 @@ def exprs(request): @pytest.fixture( - params=[ - False, - pytest.param( - True, - marks=pytest.mark.xfail( - reason="Maintaining order in groupby not implemented" - ), - ), - ], + params=[False, True], ids=["no_maintain_order", "maintain_order"], ) def maintain_order(request): @@ -98,15 +91,10 @@ def test_groupby_sorted_keys(df: pl.LazyFrame, keys, exprs): # Multiple keys don't do sorting qsorted = q.sort(*sort_keys) if len(keys) > 1: - with pytest.raises(AssertionError): - # https://github.com/pola-rs/polars/issues/17556 - assert_gpu_result_equal(q, check_exact=False) - if versions.POLARS_VERSION_LT_12 and schema[sort_keys[1]] == pl.Boolean(): - # https://github.com/pola-rs/polars/issues/17557 - with pytest.raises(AssertionError): - assert_gpu_result_equal(qsorted, check_exact=False) - else: - assert_gpu_result_equal(qsorted, check_exact=False) + # https://github.com/pola-rs/polars/issues/17556 + # Can't assert that the query without post-sorting fails, + # since it _might_ pass. 
+ assert_gpu_result_equal(qsorted, check_exact=False) elif schema[sort_keys[0]] == pl.Boolean(): # Boolean keys don't do sorting, so we get random order assert_gpu_result_equal(qsorted, check_exact=False) @@ -133,6 +121,21 @@ def test_groupby_unsupported(df, expr): assert_ir_translation_raises(q, NotImplementedError) +def test_groupby_null_keys(maintain_order): + df = pl.LazyFrame( + { + "key": pl.Series([1, float("nan"), 2, None, 2, None], dtype=pl.Float64()), + "value": [-1, 2, 1, 2, 3, 4], + } + ) + + q = df.group_by("key", maintain_order=maintain_order).agg(pl.col("value").min()) + if not maintain_order: + q = q.sort("key") + + assert_gpu_result_equal(q) + + @pytest.mark.xfail(reason="https://github.com/pola-rs/polars/issues/17513") def test_groupby_minmax_with_nan(): df = pl.LazyFrame( @@ -159,21 +162,17 @@ def test_groupby_nan_minmax_raises(op): @pytest.mark.parametrize( "key", - [ - pytest.param( - 1, - marks=pytest.mark.xfail( - versions.POLARS_VERSION_GE_121, reason="polars 1.2.1 disallows this" - ), - ), - pl.col("key1"), - ], + [1, pl.col("key1")], ) @pytest.mark.parametrize( "expr", [ pl.lit(1).alias("value"), - pl.lit([[4, 5, 6]]).alias("value"), + pytest.param( + pl.lit([[4, 5, 6]]).alias("value"), + marks=pytest.mark.xfail(reason="Need to expose OtherScalar in rust IR"), + ), + pl.Series("value", [[4, 5, 6]], dtype=pl.List(pl.Int32)), pl.col("float") * (1 - pl.col("int")), [pl.lit(2).alias("value"), pl.col("float") * 2], ], @@ -183,3 +182,12 @@ def test_groupby_literal_in_agg(df, key, expr): # so just sort by the group key q = df.group_by(key).agg(expr).sort(key, maintain_order=True) assert_gpu_result_equal(q) + + +@pytest.mark.parametrize( + "expr", + [pl.col("int").unique(), pl.col("int").drop_nulls(), pl.col("int").cum_max()], +) +def test_groupby_unary_non_pointwise_raises(df, expr): + q = df.group_by("key1").agg(expr) + assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/test_groupby_dynamic.py b/python/cudf_polars/tests/test_groupby_dynamic.py new file mode 100644 index 00000000000..38b3ce74ac5 --- /dev/null +++ b/python/cudf_polars/tests/test_groupby_dynamic.py @@ -0,0 +1,29 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +from datetime import datetime + +import polars as pl + +from cudf_polars.testing.asserts import assert_ir_translation_raises + + +def test_groupby_dynamic_raises(): + df = pl.LazyFrame( + { + "dt": [ + datetime(2021, 12, 31, 0, 0, 0), + datetime(2022, 1, 1, 0, 0, 1), + datetime(2022, 3, 31, 0, 0, 1), + datetime(2022, 4, 1, 0, 0, 1), + ] + } + ) + + q = ( + df.sort("dt") + .group_by_dynamic("dt", every="1q") + .agg(pl.col("dt").count().alias("num_values")) + ) + assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py index 1e880cdc6de..7d9ec98db97 100644 --- a/python/cudf_polars/tests/test_join.py +++ b/python/cudf_polars/tests/test_join.py @@ -17,7 +17,7 @@ def join_nulls(request): return request.param -@pytest.fixture(params=["inner", "left", "semi", "anti", "full"]) +@pytest.fixture(params=["inner", "left", "right", "semi", "anti", "full"]) def how(request): return request.param diff --git a/python/cudf_polars/tests/test_mapfunction.py b/python/cudf_polars/tests/test_mapfunction.py index 77032108e6f..e895f27f637 100644 --- a/python/cudf_polars/tests/test_mapfunction.py +++ b/python/cudf_polars/tests/test_mapfunction.py @@ -61,3 +61,48 @@ def test_rename_columns(mapping): q = df.rename(mapping) assert_gpu_result_equal(q) + + +@pytest.mark.parametrize("index", [None, ["a"], ["d", "a"]]) +@pytest.mark.parametrize("variable_name", [None, "names"]) +@pytest.mark.parametrize("value_name", [None, "unpivoted"]) +def test_unpivot(index, variable_name, value_name): + df = pl.LazyFrame( + { + "a": ["x", "y", "z"], + "b": pl.Series([1, 3, 5], dtype=pl.Int16), + "c": pl.Series([2, 4, 6], dtype=pl.Float32), + "d": ["a", "b", "c"], + } + ) + q = df.unpivot( + ["c", "b"], index=index, variable_name=variable_name, value_name=value_name + ) + + assert_gpu_result_equal(q) + + +def test_unpivot_defaults(): + df = pl.LazyFrame( + { + "a": pl.Series([11, 12, 13], dtype=pl.UInt16), + "b": pl.Series([1, 3, 5], dtype=pl.Int16), + "c": pl.Series([2, 4, 6], dtype=pl.Float32), + "d": ["a", "b", "c"], + } + ) + q = df.unpivot(index="d") + assert_gpu_result_equal(q) + + +def test_unpivot_unsupported_cast_raises(): + df = pl.LazyFrame( + { + "a": ["x", "y", "z"], + "b": pl.Series([1, 3, 5], dtype=pl.Int16), + } + ) + + q = df.unpivot(["a", "b"]) + + assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/test_python_scan.py b/python/cudf_polars/tests/test_python_scan.py index fd8453b77c4..0cda89474a8 100644 --- a/python/cudf_polars/tests/test_python_scan.py +++ b/python/cudf_polars/tests/test_python_scan.py @@ -8,7 +8,9 @@ def test_python_scan(): - def source(with_columns, predicate, nrows): + def source(with_columns, predicate, nrows, *batch_size): + # PythonScan interface changes between 1.3 and 1.4 to add an + # extra batch_size argument return pl.DataFrame({"a": pl.Series([1, 2, 3], dtype=pl.Int8())}) q = pl.LazyFrame._scan_python_function({"a": pl.Int8}, source, pyarrow=False) diff --git a/python/cudf_polars/tests/test_scan.py b/python/cudf_polars/tests/test_scan.py index 64acbb076ed..792b136acd8 100644 --- a/python/cudf_polars/tests/test_scan.py +++ b/python/cudf_polars/tests/test_scan.py @@ -12,7 +12,6 @@ assert_gpu_result_equal, assert_ir_translation_raises, ) -from cudf_polars.utils import versions @pytest.fixture( @@ -58,6 +57,22 @@ def mask(request): return request.param +@pytest.fixture( + params=[ + None, + 
(1, 1), + ], + ids=[ + "no-slice", + "slice-second", + ], +) +def slice(request): + # For use in testing that we handle + # polars slice pushdown correctly + return request.param + + def make_source(df, path, format): """ Writes the passed polars df to a file of @@ -79,7 +94,9 @@ def make_source(df, path, format): ("parquet", pl.scan_parquet), ], ) -def test_scan(tmp_path, df, format, scan_fn, row_index, n_rows, columns, mask, request): +def test_scan( + tmp_path, df, format, scan_fn, row_index, n_rows, columns, mask, slice, request +): name, offset = row_index make_source(df, tmp_path / "file", format) request.applymarker( @@ -94,21 +111,23 @@ def test_scan(tmp_path, df, format, scan_fn, row_index, n_rows, columns, mask, r row_index_offset=offset, n_rows=n_rows, ) + if slice is not None: + q = q.slice(*slice) if mask is not None: q = q.filter(mask) if columns is not None: q = q.select(*columns) - polars_collect_kwargs = {} - if versions.POLARS_VERSION_LT_12: - # https://github.com/pola-rs/polars/issues/17553 - polars_collect_kwargs = {"projection_pushdown": False} - assert_gpu_result_equal( - q, - polars_collect_kwargs=polars_collect_kwargs, - # This doesn't work in polars < 1.2 since the row-index - # is in the wrong order in previous polars releases - check_column_order=versions.POLARS_VERSION_LT_12, - ) + assert_gpu_result_equal(q) + + +def test_negative_slice_pushdown_raises(tmp_path): + df = pl.DataFrame({"a": [1, 2, 3]}) + + df.write_parquet(tmp_path / "df.parquet") + q = pl.scan_parquet(tmp_path / "df.parquet") + # Take the last row + q = q.slice(-1, 1) + assert_ir_translation_raises(q, NotImplementedError) def test_scan_unsupported_raises(tmp_path): @@ -127,10 +146,6 @@ def test_scan_ndjson_nrows_notimplemented(tmp_path, df): assert_ir_translation_raises(q, NotImplementedError) -@pytest.mark.xfail( - versions.POLARS_VERSION_LT_11, - reason="https://github.com/pola-rs/polars/issues/15730", -) def test_scan_row_index_projected_out(tmp_path): df = pl.DataFrame({"a": [1, 2, 3]}) @@ -169,15 +184,25 @@ def test_scan_csv_column_renames_projection_schema(tmp_path): ("test*.csv", False), ], ) -def test_scan_csv_multi(tmp_path, filename, glob): +@pytest.mark.parametrize( + "nrows_skiprows", + [ + (None, 0), + (1, 1), + (3, 0), + (4, 2), + ], +) +def test_scan_csv_multi(tmp_path, filename, glob, nrows_skiprows): + n_rows, skiprows = nrows_skiprows with (tmp_path / "test1.csv").open("w") as f: - f.write("""foo,bar,baz\n1,2\n3,4,5""") + f.write("""foo,bar,baz\n1,2,3\n3,4,5""") with (tmp_path / "test2.csv").open("w") as f: - f.write("""foo,bar,baz\n1,2\n3,4,5""") + f.write("""foo,bar,baz\n1,2,3\n3,4,5""") with (tmp_path / "test*.csv").open("w") as f: - f.write("""foo,bar,baz\n1,2\n3,4,5""") + f.write("""foo,bar,baz\n1,2,3\n3,4,5""") os.chdir(tmp_path) - q = pl.scan_csv(filename, glob=glob) + q = pl.scan_csv(filename, glob=glob, n_rows=n_rows, skip_rows=skiprows) assert_gpu_result_equal(q) @@ -280,3 +305,24 @@ def test_scan_ndjson_unsupported(df, tmp_path): make_source(df, tmp_path / "file", "ndjson") q = pl.scan_ndjson(tmp_path / "file", ignore_errors=True) assert_ir_translation_raises(q, NotImplementedError) + + +def test_scan_parquet_nested_null_raises(tmp_path): + df = pl.DataFrame({"a": pl.Series([None], dtype=pl.List(pl.Null))}) + + df.write_parquet(tmp_path / "file.pq") + + q = pl.scan_parquet(tmp_path / "file.pq") + + assert_ir_translation_raises(q, NotImplementedError) + + +def test_scan_parquet_only_row_index_raises(df, tmp_path): + make_source(df, tmp_path / "file", 
"parquet") + q = pl.scan_parquet(tmp_path / "file", row_index_name="index").select("index") + assert_ir_translation_raises(q, NotImplementedError) + + +def test_scan_hf_url_raises(): + q = pl.scan_csv("hf://datasets/scikit-learn/iris/Iris.csv") + assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/test_sort.py b/python/cudf_polars/tests/test_sort.py index ecc02efd967..cfa8e5ff9b9 100644 --- a/python/cudf_polars/tests/test_sort.py +++ b/python/cudf_polars/tests/test_sort.py @@ -13,10 +13,7 @@ "sort_keys", [ (pl.col("a"),), - pytest.param( - (pl.col("d").abs(),), - marks=pytest.mark.xfail(reason="abs not yet implemented"), - ), + (pl.col("d").abs(),), (pl.col("a"), pl.col("d")), (pl.col("b"),), ], diff --git a/python/cudf_polars/tests/testing/test_asserts.py b/python/cudf_polars/tests/testing/test_asserts.py index 5bc2fe1efb7..ace1c6b8648 100644 --- a/python/cudf_polars/tests/testing/test_asserts.py +++ b/python/cudf_polars/tests/testing/test_asserts.py @@ -8,6 +8,7 @@ import polars as pl from cudf_polars.testing.asserts import ( + assert_collect_raises, assert_gpu_result_equal, assert_ir_translation_raises, ) @@ -26,10 +27,59 @@ def test_translation_assert_raises(): class E(Exception): pass - unsupported = df.group_by("a").agg(pl.col("a").cum_max().alias("b")) + unsupported = df.group_by("a").agg(pl.col("a").upper_bound().alias("b")) # Unsupported query should raise NotImplementedError assert_ir_translation_raises(unsupported, NotImplementedError) with pytest.raises(AssertionError): # This should fail, because we can't translate this query, but it doesn't raise E. assert_ir_translation_raises(unsupported, E) + + +def test_collect_assert_raises(): + df = pl.LazyFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) + + with pytest.raises(AssertionError, match="CPU execution DID NOT RAISE"): + # This should raise, because polars CPU can run this query, + # but we expect an error. + assert_collect_raises( + df, + polars_except=pl.exceptions.InvalidOperationError, + cudf_except=(), + ) + + with pytest.raises(AssertionError, match="GPU execution DID NOT RAISE"): + # This should raise, because polars GPU can run this query, + # but we expect an error. + assert_collect_raises( + df, + polars_except=(), + cudf_except=pl.exceptions.InvalidOperationError, + ) + + # Here's an invalid query that gets caught at IR optimisation time. + q = df.select(pl.col("a") * pl.col("b")) + + # This exception is raised in preprocessing, so is the same for + # both CPU and GPU engines. 
+ assert_collect_raises( + q, + polars_except=pl.exceptions.InvalidOperationError, + cudf_except=pl.exceptions.InvalidOperationError, + ) + + with pytest.raises(AssertionError, match="GPU execution RAISED"): + # This should raise because the expected GPU error is wrong + assert_collect_raises( + q, + polars_except=pl.exceptions.InvalidOperationError, + cudf_except=NotImplementedError, + ) + + with pytest.raises(AssertionError, match="CPU execution RAISED"): + # This should raise because the expected CPU error is wrong + assert_collect_raises( + q, + polars_except=NotImplementedError, + cudf_except=pl.exceptions.InvalidOperationError, + ) diff --git a/python/custreamz/README.md b/python/custreamz/README.md index 1509dac9e61..8da17ef09dc 100644 --- a/python/custreamz/README.md +++ b/python/custreamz/README.md @@ -54,7 +54,7 @@ Please see the [Demo Docker Repository](https://hub.docker.com/r/rapidsai/rapids ### Conda -cuStreamz is installed with conda ([miniconda](https://conda.io/miniconda.html), or the full [Anaconda distribution](https://www.anaconda.com/download)) from the `rapidsai` or `rapidsai-nightly` channel: +cuStreamz can be installed with conda (via [miniforge](https://github.com/conda-forge/miniforge)) from the `rapidsai` channel: Release: ```bash diff --git a/python/custreamz/custreamz/tests/pytest.ini b/python/custreamz/custreamz/tests/pytest.ini new file mode 100644 index 00000000000..7b0a9f29fb1 --- /dev/null +++ b/python/custreamz/custreamz/tests/pytest.ini @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +[pytest] +addopts = --tb=native diff --git a/python/dask_cudf/README.md b/python/dask_cudf/README.md index 6edb9f87d48..69e1524be39 100644 --- a/python/dask_cudf/README.md +++ b/python/dask_cudf/README.md @@ -1,135 +1,64 @@ #
 Dask cuDF - A GPU Backend for Dask DataFrame
-Dask cuDF (a.k.a. dask-cudf or `dask_cudf`) is an extension library for [Dask DataFrame](https://docs.dask.org/en/stable/dataframe.html). When installed, Dask cuDF is automatically registered as the `"cudf"` [dataframe backend](https://docs.dask.org/en/stable/how-to/selecting-the-collection-backend.html) for Dask DataFrame. - -## Using Dask cuDF - -### The Dask DataFrame API (Recommended) - -Simply set the `"dataframe.backend"` [configuration](https://docs.dask.org/en/stable/configuration.html) to `"cudf"` in Dask, and the public Dask DataFrame API will leverage `cudf` automatically: - -```python -import dask -dask.config.set({"dataframe.backend": "cudf"}) - -import dask.dataframe as dd -# This gives us a cuDF-backed dataframe -df = dd.read_parquet("data.parquet", ...) -``` +Dask cuDF (a.k.a. dask-cudf or `dask_cudf`) is an extension library for [Dask DataFrame](https://docs.dask.org/en/stable/dataframe.html) that provides a Pandas-like API for parallel and larger-than-memory DataFrame computing on GPUs. When installed, Dask cuDF is automatically registered as the `"cudf"` [dataframe backend](https://docs.dask.org/en/stable/how-to/selecting-the-collection-backend.html) for Dask DataFrame. > [!IMPORTANT] -> The `"dataframe.backend"` configuration will only be used for collection creation when the following APIs are used: `read_parquet`, `read_json`, `read_csv`, `read_orc`, `read_hdf`, and `from_dict`. For example, if `from_map`, `from_pandas`, `from_delayed`, or `from_array` are used, the backend of the new collection will depend on the input to the function: - -```python -import pandas as pd -import cudf - -# This gives us a Pandas-backed dataframe -dd.from_pandas(pd.DataFrame({"a": range(10)})) - -# This gives us a cuDF-backed dataframe -dd.from_pandas(cudf.DataFrame({"a": range(10)})) -``` - -A cuDF-backed DataFrame collection can be moved to the `"pandas"` backend: - -```python -df = df.to_backend("pandas") -``` - -Similarly, a Pandas-backed DataFrame collection can be moved to the `"cudf"` backend: - -```python -df = df.to_backend("cudf") -``` - -### The Explicit Dask cuDF API - -In addition to providing the `"cudf"` backend for Dask DataFrame, Dask cuDF also provides an explicit `dask_cudf` API: - -```python -import dask_cudf - -# This always gives us a cuDF-backed dataframe -df = dask_cudf.read_parquet("data.parquet", ...) -``` - -> [!NOTE] -> This API is used implicitly by the Dask DataFrame API when the `"cudf"` backend is enabled. Therefore, using it directly will not provide any performance benefit over the CPU/GPU-portable `dask.dataframe` API. Also, using some parts of the explicit API are incompatible with automatic query planning (see the next section). +> Dask cuDF does not provide support for multi-GPU or multi-node execution on its own. You must also deploy a distributed cluster (ideally with [Dask-CUDA](https://docs.rapids.ai/api/dask-cuda/stable/)) to leverage multiple GPUs efficiently. -See the [Dask cuDF's API documentation](https://docs.rapids.ai/api/dask-cudf/stable/) for further information. - -## Query Planning - -Dask cuDF now provides automatic query planning by default (RAPIDS 24.06+). As long as the `"dataframe.query-planning"` configuration is set to `True` (the default) when `dask.dataframe` is first imported, [Dask Expressions](https://github.com/dask/dask-expr) will be used under the hood. - -For example, the following user code will automatically benefit from predicate pushdown when the result is computed. 
- -```python -df = dd.read_parquet("/my/parquet/dataset/") -result = df.sort_values('B')['A'] -``` - -Unoptimized expression graph (`df.pprint()`): -``` -Projection: columns='A' - SortValues: by=['B'] shuffle_method='tasks' options={} - ReadParquetFSSpec: path='/my/parquet/dataset/' ... -``` +## Using Dask cuDF -Simplified expression graph (`df.simplify().pprint()`): -``` -Projection: columns='A' - SortValues: by=['B'] shuffle_method='tasks' options={} - ReadParquetFSSpec: path='/my/parquet/dataset/' columns=['A', 'B'] ... -``` +Please visit [the official documentation page](https://docs.rapids.ai/api/dask-cudf/stable/) for detailed information about using Dask cuDF. -> [!NOTE] -> Dask will automatically simplify the expression graph (within `optimize`) when the result is converted to a task graph (via `compute` or `persist`). The user does not need to call `simplify` themself. +## Installation +See the [RAPIDS install page](https://docs.rapids.ai/install) for the most up-to-date information and commands for installing Dask cuDF and other RAPIDS packages. -## Using Multiple GPUs and Multiple Nodes +## Resources -Whenever possible, Dask cuDF (i.e. Dask DataFrame) will automatically try to partition your data into small-enough tasks to fit comfortably in the memory of a single GPU. This means the necessary compute tasks needed to compute a query can often be streamed to a single GPU process for out-of-core computing. This also means that the compute tasks can be executed in parallel over a multi-GPU cluster. +- [Dask cuDF documentation](https://docs.rapids.ai/api/dask-cudf/stable/) +- [Best practices](https://docs.rapids.ai/api/dask-cudf/stable/best_practices/) +- [cuDF documentation](https://docs.rapids.ai/api/cudf/stable/) +- [10 Minutes to cuDF and Dask cuDF](https://docs.rapids.ai/api/cudf/stable/user_guide/10min/) +- [Dask-CUDA documentation](https://docs.rapids.ai/api/dask-cuda/stable/) +- [Deployment](https://docs.rapids.ai/deployment/stable/) +- [RAPIDS Community](https://rapids.ai/learn-more/#get-involved): Get help, contribute, and collaborate. -> [!IMPORTANT] -> Neither Dask cuDF nor Dask DataFrame provide support for multi-GPU or multi-node execution on their own. You must deploy a distributed cluster (ideally with [Dask CUDA](https://docs.rapids.ai/api/dask-cuda/stable/)) to leverage multiple GPUs. +### Quick-start example -In order to execute your Dask workflow on multiple GPUs, you will typically need to use [Dask CUDA](https://docs.rapids.ai/api/dask-cuda/stable/) to deploy distributed Dask cluster, and [Distributed](https://distributed.dask.org/en/stable/client.html) to define a `client` object. For example: +A very common Dask cuDF use case is single-node multi-GPU data processing. 
These workflows typically use the following pattern: ```python - +import dask +import dask.dataframe as dd from dask_cuda import LocalCUDACluster from distributed import Client -client = Client( +if __name__ == "__main__": + + # Define a GPU-aware cluster to leverage multiple GPUs + client = Client( LocalCUDACluster( - CUDA_VISIBLE_DEVICES="0,1", # Use two workers (on devices 0 and 1) - rmm_pool_size=0.9, # Use 90% of GPU memory as a pool for faster allocations - enable_cudf_spill=True, # Improve device memory stability - local_directory="/fast/scratch/", # Use fast local storage for spilling + CUDA_VISIBLE_DEVICES="0,1", # Use two workers (on devices 0 and 1) + rmm_pool_size=0.9, # Use 90% of GPU memory as a pool for faster allocations + enable_cudf_spill=True, # Improve device memory stability + local_directory="/fast/scratch/", # Use fast local storage for spilling ) -) + ) -df = dd.read_parquet("/my/parquet/dataset/") -agg = df.groupby('B').sum() -agg.compute() # This will use the cluster defined above -``` + # Set the default dataframe backend to "cudf" + dask.config.set({"dataframe.backend": "cudf"}) -> [!NOTE] -> This example uses `compute` to materialize a concrete `cudf.DataFrame` object in local memory. Never call `compute` on a large collection that cannot fit comfortably in the memory of a single GPU! See Dask's [documentation on managing computation](https://distributed.dask.org/en/stable/manage-computation.html) for more details. + # Create your DataFrame collection from on-disk + # or in-memory data + df = dd.read_parquet("/my/parquet/dataset/") -Please see the [Dask CUDA](https://docs.rapids.ai/api/dask-cuda/stable/) documentation for more information about deploying GPU-aware clusters (including [best practices](https://docs.rapids.ai/api/dask-cuda/stable/examples/best-practices/)). + # Use cudf-like syntax to transform and/or query your data + query = df.groupby('item')['price'].mean() -## Install - -See the [RAPIDS install page](https://docs.rapids.ai/install) for the most up-to-date information and commands for installing Dask cuDF and other RAPIDS packages. + # Compute, persist, or write out the result + query.head() +``` -## Resources +If you do not have multiple GPUs available, using `LocalCUDACluster` is optional. However, it is still a good idea to [enable cuDF spilling](https://docs.rapids.ai/api/cudf/stable/developer_guide/library_design/#spilling-to-host-memory). -- [Dask cuDF API documentation](https://docs.rapids.ai/api/dask-cudf/stable/) -- [cuDF API documentation](https://docs.rapids.ai/api/cudf/stable/) -- [10 Minutes to cuDF and Dask cuDF](https://docs.rapids.ai/api/cudf/stable/user_guide/10min/) -- [Dask CUDA documentation](https://docs.rapids.ai/api/dask-cuda/stable/) -- [Deployment](https://docs.rapids.ai/deployment/stable/) -- [RAPIDS Community](https://rapids.ai/learn-more/#get-involved): Get help, contribute, and collaborate. +If you wish to scale across multiple nodes, you will need to use a different mechanism to deploy your Dask-CUDA workers. Please see [the RAPIDS deployment documentation](https://docs.rapids.ai/deployment/stable/) for more instructions. 
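For the single-GPU case mentioned just above, a minimal sketch of the same workflow without `LocalCUDACluster` might look like the following. This is illustrative only: the parquet path and column names are placeholders, and it assumes cuDF's current `"spill"` option name.

```python
import dask
import dask.dataframe as dd

import cudf

# Enable cuDF spilling so device memory can overflow to host memory
# when the working set does not fit on a single GPU
# (setting the CUDF_SPILL=on environment variable is equivalent).
cudf.set_option("spill", True)

# Route the public Dask DataFrame API through the "cudf" backend
dask.config.set({"dataframe.backend": "cudf"})

# Illustrative dataset path; replace with real data
df = dd.read_parquet("/my/parquet/dataset/")

# Same cudf-like syntax as the multi-GPU example above
query = df.groupby("item")["price"].mean()
print(query.head())
```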
diff --git a/python/dask_cudf/dask_cudf/expr/_collection.py b/python/dask_cudf/dask_cudf/expr/_collection.py index f60e4ff81ef..907abaa2bfc 100644 --- a/python/dask_cudf/dask_cudf/expr/_collection.py +++ b/python/dask_cudf/dask_cudf/expr/_collection.py @@ -15,6 +15,7 @@ from dask import config from dask.dataframe.core import is_dataframe_like +from dask.typing import no_default import cudf @@ -49,8 +50,24 @@ def to_dask_dataframe(self, **kwargs): return self.to_backend("pandas", **kwargs) + def _prepare_cov_corr(self, min_periods, numeric_only): + # Upstream version of this method sets min_periods + # to 2 by default (which is not supported by cudf) + # TODO: Remove when cudf supports both min_periods + # and numeric_only + # See: https://github.com/rapidsai/cudf/issues/12626 + # See: https://github.com/rapidsai/cudf/issues/9009 + self._meta.cov(min_periods=min_periods) + + frame = self + if numeric_only: + numerics = self._meta._get_numeric_data() + if len(numerics.columns) != len(self.columns): + frame = frame[list(numerics.columns)] + return frame, min_periods + # var can be removed if cudf#15179 is addressed. - # See: https://github.com/rapidsai/cudf/issues/15179 + # See: https://github.com/rapidsai/cudf/issues/14935 def var( self, axis=0, @@ -74,6 +91,17 @@ def var( ) ) + def rename_axis( + self, mapper=no_default, index=no_default, columns=no_default, axis=0 + ): + from dask_cudf.expr._expr import RenameAxisCudf + + return new_collection( + RenameAxisCudf( + self, mapper=mapper, index=index, columns=columns, axis=axis + ) + ) + class DataFrame(DXDataFrame, CudfFrameBase): @classmethod @@ -186,27 +214,58 @@ class Index(DXIndex, CudfFrameBase): ## -try: - from dask_expr._backends import create_array_collection - - @get_collection_type.register_lazy("cupy") - def _register_cupy(): - import cupy - - @get_collection_type.register(cupy.ndarray) - def get_collection_type_cupy_array(_): - return create_array_collection - - @get_collection_type.register_lazy("cupyx") - def _register_cupyx(): - # Needed for cuml - from cupyx.scipy.sparse import spmatrix - - @get_collection_type.register(spmatrix) - def get_collection_type_csr_matrix(_): - return create_array_collection - -except ImportError: - # Older version of dask-expr. - # Implicit conversion to array wont work. - pass +def _create_array_collection_with_meta(expr): + # NOTE: This is the GPU compatible version of + # `new_dd_object` for DataFrame -> Array conversion. 
+ # This can be removed if dask#11017 is resolved + # (See: https://github.com/dask/dask/issues/11017) + import numpy as np + + import dask.array as da + from dask.blockwise import Blockwise + from dask.highlevelgraph import HighLevelGraph + + result = expr.optimize() + dsk = result.__dask_graph__() + name = result._name + meta = result._meta + divisions = result.divisions + chunks = ((np.nan,) * (len(divisions) - 1),) + tuple( + (d,) for d in meta.shape[1:] + ) + if len(chunks) > 1: + if isinstance(dsk, HighLevelGraph): + layer = dsk.layers[name] + else: + # dask-expr provides a dict only + layer = dsk + if isinstance(layer, Blockwise): + layer.new_axes["j"] = chunks[1][0] + layer.output_indices = layer.output_indices + ("j",) + else: + suffix = (0,) * (len(chunks) - 1) + for i in range(len(chunks[0])): + layer[(name, i) + suffix] = layer.pop((name, i)) + + return da.Array(dsk, name=name, chunks=chunks, meta=meta) + + +@get_collection_type.register_lazy("cupy") +def _register_cupy(): + import cupy + + get_collection_type.register( + cupy.ndarray, + lambda _: _create_array_collection_with_meta, + ) + + +@get_collection_type.register_lazy("cupyx") +def _register_cupyx(): + # Needed for cuml + from cupyx.scipy.sparse import spmatrix + + get_collection_type.register( + spmatrix, + lambda _: _create_array_collection_with_meta, + ) diff --git a/python/dask_cudf/dask_cudf/expr/_expr.py b/python/dask_cudf/dask_cudf/expr/_expr.py index 8a2c50d3fe7..b284ab3774d 100644 --- a/python/dask_cudf/dask_cudf/expr/_expr.py +++ b/python/dask_cudf/dask_cudf/expr/_expr.py @@ -4,11 +4,12 @@ import dask_expr._shuffle as _shuffle_module from dask_expr import new_collection from dask_expr._cumulative import CumulativeBlockwise -from dask_expr._expr import Elemwise, Expr, VarColumns +from dask_expr._expr import Elemwise, Expr, RenameAxis, VarColumns from dask_expr._reductions import Reduction, Var from dask.dataframe.core import is_dataframe_like, make_meta, meta_nonempty from dask.dataframe.dispatch import is_categorical_dtype +from dask.typing import no_default import cudf @@ -17,6 +18,19 @@ ## +class RenameAxisCudf(RenameAxis): + # TODO: Remove this after rename_axis is supported in cudf + # (See: https://github.com/rapidsai/cudf/issues/16895) + @staticmethod + def operation(df, index=no_default, **kwargs): + if index != no_default: + df.index.name = index + return df + raise NotImplementedError( + "Only `index` is supported for the cudf backend" + ) + + class ToCudfBackend(Elemwise): # TODO: Inherit from ToBackend when rapids-dask-dependency # is pinned to dask>=2024.8.1 diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index e793d4381d1..a781b8242fe 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -23,7 +23,6 @@ from cudf.io import write_to_dataset from cudf.io.parquet import _apply_post_filters, _normalize_filters from cudf.utils.dtypes import cudf_dtype_from_pa_type -from cudf.utils.ioutils import _ROW_GROUP_SIZE_BYTES_DEFAULT class CudfEngine(ArrowDatasetEngine): @@ -341,9 +340,7 @@ def write_partition( return_metadata=return_metadata, statistics=kwargs.get("statistics", "ROWGROUP"), int96_timestamps=kwargs.get("int96_timestamps", False), - row_group_size_bytes=kwargs.get( - "row_group_size_bytes", _ROW_GROUP_SIZE_BYTES_DEFAULT - ), + row_group_size_bytes=kwargs.get("row_group_size_bytes", None), row_group_size_rows=kwargs.get("row_group_size_rows", None), 
max_page_size_bytes=kwargs.get("max_page_size_bytes", None), max_page_size_rows=kwargs.get("max_page_size_rows", None), @@ -365,7 +362,7 @@ def write_partition( statistics=kwargs.get("statistics", "ROWGROUP"), int96_timestamps=kwargs.get("int96_timestamps", False), row_group_size_bytes=kwargs.get( - "row_group_size_bytes", _ROW_GROUP_SIZE_BYTES_DEFAULT + "row_group_size_bytes", None ), row_group_size_rows=kwargs.get( "row_group_size_rows", None diff --git a/python/dask_cudf/dask_cudf/tests/pytest.ini b/python/dask_cudf/dask_cudf/tests/pytest.ini new file mode 100644 index 00000000000..7b0a9f29fb1 --- /dev/null +++ b/python/dask_cudf/dask_cudf/tests/pytest.ini @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +[pytest] +addopts = --tb=native diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index 905d8c08135..5f0fae86691 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -16,6 +16,7 @@ import dask_cudf from dask_cudf.tests.utils import ( + QUERY_PLANNING_ON, require_dask_expr, skip_dask_expr, xfail_dask_expr, @@ -950,12 +951,16 @@ def test_implicit_array_conversion_cupy(): def func(x): return x.values - # Need to compute the dask collection for now. - # See: https://github.com/dask/dask/issues/11017 - result = ds.map_partitions(func, meta=s.values).compute() - expect = func(s) + result = ds.map_partitions(func, meta=s.values) - dask.array.assert_eq(result, expect) + if QUERY_PLANNING_ON: + # Check Array and round-tripped DataFrame + dask.array.assert_eq(result, func(s)) + dd.assert_eq(result.to_dask_dataframe(), s, check_index=False) + else: + # Legacy version still carries numpy metadata + # See: https://github.com/dask/dask/issues/11017 + dask.array.assert_eq(result.compute(), func(s)) def test_implicit_array_conversion_cupy_sparse(): @@ -967,8 +972,6 @@ def test_implicit_array_conversion_cupy_sparse(): def func(x): return cupyx.scipy.sparse.csr_matrix(x.values) - # Need to compute the dask collection for now. 
- # See: https://github.com/dask/dask/issues/11017 result = ds.map_partitions(func, meta=s.values).compute() expect = func(s) @@ -1007,3 +1010,32 @@ def test_to_backend_simplify(): df2 = df.to_backend("cudf")[["y"]].simplify() df3 = df[["y"]].to_backend("cudf").to_backend("cudf").simplify() assert df2._name == df3._name + + +@pytest.mark.parametrize("numeric_only", [True, False]) +@pytest.mark.parametrize("op", ["corr", "cov"]) +def test_cov_corr(op, numeric_only): + df = cudf.DataFrame.from_dict( + { + "x": np.random.randint(0, 5, size=10), + "y": np.random.normal(size=10), + } + ) + ddf = dd.from_pandas(df, npartitions=2) + res = getattr(ddf, op)(numeric_only=numeric_only) + # Use to_pandas until cudf supports numeric_only + # (See: https://github.com/rapidsai/cudf/issues/12626) + expect = getattr(df.to_pandas(), op)(numeric_only=numeric_only) + dd.assert_eq(res, expect) + + +def test_rename_axis_after_join(): + df1 = cudf.DataFrame(index=["a", "b", "c"], data=dict(a=[1, 2, 3])) + df1.index.name = "test" + ddf1 = dd.from_pandas(df1, 2) + + df2 = cudf.DataFrame(index=["a", "b", "d"], data=dict(b=[1, 2, 3])) + ddf2 = dd.from_pandas(df2, 2) + result = ddf1.join(ddf2, how="outer") + expected = df1.join(df2, how="outer") + dd.assert_eq(result, expected, check_index=False) diff --git a/python/pylibcudf/pylibcudf/binaryop.pyx b/python/pylibcudf/pylibcudf/binaryop.pyx index 5a67f4d6cdb..5f9d145139a 100644 --- a/python/pylibcudf/pylibcudf/binaryop.pyx +++ b/python/pylibcudf/pylibcudf/binaryop.pyx @@ -94,7 +94,7 @@ cpdef bool is_supported_operation( ): """Check if an operation is supported for the given data types. - For details, see :cpp:func::is_supported_operation`. + For details, see :cpp:func::`is_supported_operation`. Parameters ---------- diff --git a/python/pylibcudf/pylibcudf/column_factories.pyx b/python/pylibcudf/pylibcudf/column_factories.pyx index 4601cba515a..e9085e3ea02 100644 --- a/python/pylibcudf/pylibcudf/column_factories.pyx +++ b/python/pylibcudf/pylibcudf/column_factories.pyx @@ -18,6 +18,20 @@ from .types import MaskState, TypeId cpdef Column make_empty_column(MakeEmptyColumnOperand type_or_id): + """Creates an empty column of the specified type. + + For details, see :cpp:func::`make_empty_column`. + + Parameters + ---------- + type_or_id : Union[DataType, type_id, object] + The column data type. + + Returns + ------- + Column + An empty Column + """ cdef unique_ptr[column] result cdef type_id id @@ -60,7 +74,11 @@ cpdef Column make_numeric_column( size_type size, MaskArg mstate ): + """Creates an empty numeric column. + + For details, see :cpp:func::`make_numeric_column`. 
+ """ cdef unique_ptr[column] result cdef mask_state state diff --git a/python/pylibcudf/pylibcudf/datetime.pyx b/python/pylibcudf/pylibcudf/datetime.pyx index 0ddc68bcb9d..e8e0caaf42d 100644 --- a/python/pylibcudf/pylibcudf/datetime.pyx +++ b/python/pylibcudf/pylibcudf/datetime.pyx @@ -2,7 +2,19 @@ from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.datetime cimport extract_year as cpp_extract_year +from pylibcudf.libcudf.datetime cimport ( + day_of_year as cpp_day_of_year, + extract_day as cpp_extract_day, + extract_hour as cpp_extract_hour, + extract_microsecond_fraction as cpp_extract_microsecond_fraction, + extract_millisecond_fraction as cpp_extract_millisecond_fraction, + extract_minute as cpp_extract_minute, + extract_month as cpp_extract_month, + extract_nanosecond_fraction as cpp_extract_nanosecond_fraction, + extract_second as cpp_extract_second, + extract_weekday as cpp_extract_weekday, + extract_year as cpp_extract_year, +) from .column cimport Column @@ -28,3 +40,42 @@ cpdef Column extract_year( with nogil: result = move(cpp_extract_year(values.view())) return Column.from_libcudf(move(result)) + + +def extract_datetime_component(Column col, str field): + + cdef unique_ptr[column] c_result + + with nogil: + if field == "year": + c_result = move(cpp_extract_year(col.view())) + elif field == "month": + c_result = move(cpp_extract_month(col.view())) + elif field == "day": + c_result = move(cpp_extract_day(col.view())) + elif field == "weekday": + c_result = move(cpp_extract_weekday(col.view())) + elif field == "hour": + c_result = move(cpp_extract_hour(col.view())) + elif field == "minute": + c_result = move(cpp_extract_minute(col.view())) + elif field == "second": + c_result = move(cpp_extract_second(col.view())) + elif field == "millisecond": + c_result = move( + cpp_extract_millisecond_fraction(col.view()) + ) + elif field == "microsecond": + c_result = move( + cpp_extract_microsecond_fraction(col.view()) + ) + elif field == "nanosecond": + c_result = move( + cpp_extract_nanosecond_fraction(col.view()) + ) + elif field == "day_of_year": + c_result = move(cpp_day_of_year(col.view())) + else: + raise ValueError(f"Invalid datetime field: '{field}'") + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/groupby.pyx b/python/pylibcudf/pylibcudf/groupby.pyx index ae5d33aaa46..afb95dba5b3 100644 --- a/python/pylibcudf/pylibcudf/groupby.pyx +++ b/python/pylibcudf/pylibcudf/groupby.pyx @@ -286,7 +286,7 @@ cdef class GroupBy: Returns ------- - Tuple[List[int], Table, Table]] + Tuple[List[int], Table, Table] A tuple of tables containing three items: - A list of integer offsets into the group keys/values - A table of group keys diff --git a/python/pylibcudf/pylibcudf/io/avro.pyx b/python/pylibcudf/pylibcudf/io/avro.pyx index 667c67f4c36..438b0ff1634 100644 --- a/python/pylibcudf/pylibcudf/io/avro.pyx +++ b/python/pylibcudf/pylibcudf/io/avro.pyx @@ -20,6 +20,8 @@ cpdef TableWithMetadata read_avro( """ Reads an Avro dataset into a :py:class:`~.types.TableWithMetadata`. + For details, see :cpp:func:`read_avro`. 
+ Parameters ---------- source_info: SourceInfo diff --git a/python/pylibcudf/pylibcudf/io/parquet.pyx b/python/pylibcudf/pylibcudf/io/parquet.pyx index df1f1b14247..981ca7b8159 100644 --- a/python/pylibcudf/pylibcudf/io/parquet.pyx +++ b/python/pylibcudf/pylibcudf/io/parquet.pyx @@ -59,6 +59,8 @@ cdef class ChunkedParquetReader: """ Reads chunks of a Parquet file into a :py:class:`~.types.TableWithMetadata`. + For details, see :cpp:class:`chunked_parquet_reader`. + Parameters ---------- source_info : SourceInfo @@ -167,6 +169,8 @@ cpdef read_parquet( ): """Reads an Parquet file into a :py:class:`~.types.TableWithMetadata`. + For details, see :cpp:func:`read_parquet`. + Parameters ---------- source_info : SourceInfo diff --git a/python/pylibcudf/pylibcudf/labeling.pyx b/python/pylibcudf/pylibcudf/labeling.pyx index b5a7445df36..b3f6a92d85c 100644 --- a/python/pylibcudf/pylibcudf/labeling.pyx +++ b/python/pylibcudf/pylibcudf/labeling.pyx @@ -20,6 +20,8 @@ cpdef Column label_bins( ): """Labels elements based on membership in the specified bins. + For details see :cpp:func:`label_bins`. + Parameters ---------- input : Column diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd index 0c352a5068b..f2dd22f43aa 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd @@ -19,3 +19,13 @@ cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: const column_view &seeds, const size_type width, ) except + + + cdef unique_ptr[column] word_minhash( + const column_view &input, + const column_view &seeds + ) except + + + cdef unique_ptr[column] word_minhash64( + const column_view &input, + const column_view &seeds + ) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt index bd6e2e0af02..abf4357f862 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. 
# ============================================================================= -set(cython_sources char_types.pyx regex_flags.pyx) +set(cython_sources char_types.pyx regex_flags.pyx side_type.pyx) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/contains.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/contains.pxd index c2fb5f0dce4..eac0f748257 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/contains.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/contains.pxd @@ -24,4 +24,9 @@ cdef extern from "cudf/strings/contains.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] like( column_view source_strings, string_scalar pattern, - string_scalar escape) except + + string_scalar escape_character) except + + + cdef unique_ptr[column] like( + column_view source_strings, + column_view patterns, + string_scalar escape_character) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/extract.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/extract.pxd index 12cd628fc1f..b7166167cfd 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/extract.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/extract.pxd @@ -10,5 +10,9 @@ from pylibcudf.libcudf.table.table cimport table cdef extern from "cudf/strings/extract.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[table] extract( - column_view source_strings, - regex_program) except + + column_view input, + regex_program prog) except + + + cdef unique_ptr[column] extract_all_record( + column_view input, + regex_program prog) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/findall.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/findall.pxd index b25724586e1..e0a8b776465 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/findall.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/findall.pxd @@ -9,5 +9,5 @@ from pylibcudf.libcudf.strings.regex_program cimport regex_program cdef extern from "cudf/strings/findall.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] findall( - column_view source_strings, - regex_program) except + + column_view input, + regex_program prog) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/repeat.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/repeat.pxd index 410ff58f299..59262820411 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/repeat.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/repeat.pxd @@ -10,9 +10,9 @@ cdef extern from "cudf/strings/repeat_strings.hpp" namespace "cudf::strings" \ nogil: cdef unique_ptr[column] repeat_strings( - column_view strings, - size_type repeat) except + + column_view input, + size_type repeat_times) except + cdef unique_ptr[column] repeat_strings( - column_view strings, - column_view repeats) except + + column_view input, + column_view repeat_times) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pxd index 3a89299f11a..019ff3f17ba 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pxd @@ -1,10 +1,10 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. 
from libc.stdint cimport int32_t cdef extern from "cudf/strings/side_type.hpp" namespace "cudf::strings" nogil: - ctypedef enum side_type: + cpdef enum class side_type(int32_t): LEFT 'cudf::strings::side_type::LEFT' RIGHT 'cudf::strings::side_type::RIGHT' BOTH 'cudf::strings::side_type::BOTH' diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pyx b/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/pylibcudf/pylibcudf/lists.pyx b/python/pylibcudf/pylibcudf/lists.pyx index 947caddc485..6f82124d06e 100644 --- a/python/pylibcudf/pylibcudf/lists.pyx +++ b/python/pylibcudf/pylibcudf/lists.pyx @@ -52,6 +52,8 @@ cpdef Table explode_outer(Table input, size_type explode_column_idx): All other columns will be duplicated for each element in the list. + For details, see :cpp:func:`explode_outer`. + Parameters ---------- input : Table @@ -75,6 +77,8 @@ cpdef Table explode_outer(Table input, size_type explode_column_idx): cpdef Column concatenate_rows(Table input): """Concatenate multiple lists columns into a single lists column row-wise. + For details, see :cpp:func:`concatenate_list_elements`. + Parameters ---------- input : Table @@ -96,6 +100,8 @@ cpdef Column concatenate_rows(Table input): cpdef Column concatenate_list_elements(Column input, bool dropna): """Concatenate multiple lists on the same row into a single list. + For details, see :cpp:func:`concatenate_list_elements`. + Parameters ---------- input : Column @@ -168,6 +174,8 @@ cpdef Column contains_nulls(Column input): """Create a column of bool values indicating whether each row in the lists column contains a null value. + For details, see :cpp:func:`contains_nulls`. + Parameters ---------- input : Column @@ -290,6 +298,8 @@ cpdef Column segmented_gather(Column input, Column gather_map_list): cpdef Column extract_list_element(Column input, ColumnOrSizeType index): """Create a column of extracted list elements. + For details, see :cpp:func:`extract_list_element`. + Parameters ---------- input : Column @@ -318,6 +328,8 @@ cpdef Column count_elements(Column input): list element in the given lists column. For details, see :cpp:func:`count_elements`. + For details, see :cpp:func:`count_elements`. + Parameters ---------- input : Column diff --git a/python/pylibcudf/pylibcudf/merge.pyx b/python/pylibcudf/pylibcudf/merge.pyx index a7d43c9d158..6d707b67449 100644 --- a/python/pylibcudf/pylibcudf/merge.pyx +++ b/python/pylibcudf/pylibcudf/merge.pyx @@ -19,6 +19,8 @@ cpdef Table merge ( ): """Merge a set of sorted tables. + For details see :cpp:func:`merge`. + Parameters ---------- tables_to_merge : list diff --git a/python/pylibcudf/pylibcudf/quantiles.pyx b/python/pylibcudf/pylibcudf/quantiles.pyx index b847ade774d..3a771fbe7ef 100644 --- a/python/pylibcudf/pylibcudf/quantiles.pyx +++ b/python/pylibcudf/pylibcudf/quantiles.pyx @@ -30,6 +30,8 @@ cpdef Column quantile( Computes the specified quantiles by interpolating values between which they lie, using the interpolation strategy specified in interp. + For details see :cpp:func:`quantile`. + Parameters ---------- input: Column @@ -91,6 +93,8 @@ cpdef Table quantiles( specified quantiles. In the event a quantile lies in between rows, the specified interpolation strategy is used to pick between the rows. + For details see :cpp:func:`quantiles`. 
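To make the list APIs cross-referenced above concrete, a small usage sketch of `extract_list_element`; the column values are illustrative and assume pylibcudf and pyarrow are importable:

```python
import pyarrow as pa
import pylibcudf as plc

lists_col = plc.interop.from_arrow(pa.array([[1, 2, 3], [4, 5]]))

# Take the first element of every list row -> [1, 4]
first = plc.lists.extract_list_element(lists_col, 0)
print(plc.interop.to_arrow(first))
```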
+ Parameters ---------- input: Table diff --git a/python/pylibcudf/pylibcudf/reshape.pyx b/python/pylibcudf/pylibcudf/reshape.pyx index a99145be900..eb1499ebbea 100644 --- a/python/pylibcudf/pylibcudf/reshape.pyx +++ b/python/pylibcudf/pylibcudf/reshape.pyx @@ -23,6 +23,8 @@ cpdef Column interleave_columns(Table source_table): in = [[A1, A2, A3], [B1, B2, B3]] return = [A1, B1, A2, B2, A3, B3] + For details, see :cpp:func:`interleave_columns`. + Parameters ---------- source_table: Table @@ -44,6 +46,8 @@ cpdef Column interleave_columns(Table source_table): cpdef Table tile(Table source_table, size_type count): """Repeats the rows from input table count times to form a new table. + For details, see :cpp:func:`tile`. + Parameters ---------- source_table: Table diff --git a/python/pylibcudf/pylibcudf/search.pyx b/python/pylibcudf/pylibcudf/search.pyx index ff2468f3f9c..814bc6553d8 100644 --- a/python/pylibcudf/pylibcudf/search.pyx +++ b/python/pylibcudf/pylibcudf/search.pyx @@ -19,6 +19,8 @@ cpdef Column lower_bound( ): """Find smallest indices in haystack where needles may be inserted to retain order. + For details, see :cpp:func:`lower_bound`. + Parameters ---------- haystack : Table @@ -58,6 +60,8 @@ cpdef Column upper_bound( ): """Find largest indices in haystack where needles may be inserted to retain order. + For details, see :cpp:func:`upper_bound`. + Parameters ---------- haystack : Table @@ -92,6 +96,8 @@ cpdef Column upper_bound( cpdef Column contains(Column haystack, Column needles): """Check whether needles are present in haystack. + For details, see :cpp:func:`contains`. + Parameters ---------- haystack : Table diff --git a/python/pylibcudf/pylibcudf/sorting.pyx b/python/pylibcudf/pylibcudf/sorting.pyx index bd173eebacb..42289d54bca 100644 --- a/python/pylibcudf/pylibcudf/sorting.pyx +++ b/python/pylibcudf/pylibcudf/sorting.pyx @@ -16,6 +16,8 @@ from .table cimport Table cpdef Column sorted_order(Table source_table, list column_order, list null_precedence): """Computes the row indices required to sort the table. + For details, see :cpp:func:`sorted_order`. + Parameters ---------- source_table : Table @@ -52,6 +54,8 @@ cpdef Column stable_sorted_order( """Computes the row indices required to sort the table, preserving order of equal elements. + For details, see :cpp:func:`stable_sorted_order`. + Parameters ---------- source_table : Table @@ -90,6 +94,8 @@ cpdef Column rank( ): """Computes the rank of each element in the column. + For details, see :cpp:func:`rank`. + Parameters ---------- input_view : Column @@ -128,6 +134,8 @@ cpdef Column rank( cpdef bool is_sorted(Table tbl, list column_order, list null_precedence): """Checks if the table is sorted. + For details, see :cpp:func:`is_sorted`. + Parameters ---------- tbl : Table @@ -165,6 +173,8 @@ cpdef Table segmented_sort_by_key( ): """Sorts the table by key, within segments. + For details, see :cpp:func:`segmented_sort_by_key`. + Parameters ---------- values : Table @@ -209,6 +219,8 @@ cpdef Table stable_segmented_sort_by_key( """Sorts the table by key preserving order of equal elements, within segments. + For details, see :cpp:func:`stable_segmented_sort_by_key`. + Parameters ---------- values : Table @@ -251,6 +263,8 @@ cpdef Table sort_by_key( ): """Sorts the table by key. + For details, see :cpp:func:`sort_by_key`. + Parameters ---------- values : Table @@ -290,6 +304,8 @@ cpdef Table stable_sort_by_key( ): """Sorts the table by key preserving order of equal elements. + For details, see :cpp:func:`stable_sort_by_key`. 
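A brief sketch of the `search.contains` signature shown above (illustrative values; not part of the change itself):

```python
import pyarrow as pa
import pylibcudf as plc

haystack = plc.interop.from_arrow(pa.array([1, 2, 3, 4]))
needles = plc.interop.from_arrow(pa.array([0, 2, 4]))

# One boolean per needle: [False, True, True]
present = plc.search.contains(haystack, needles)
```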
+ Parameters ---------- values : Table @@ -324,6 +340,8 @@ cpdef Table stable_sort_by_key( cpdef Table sort(Table source_table, list column_order, list null_precedence): """Sorts the table. + For details, see :cpp:func:`sort`. + Parameters ---------- source_table : Table @@ -355,6 +373,8 @@ cpdef Table sort(Table source_table, list column_order, list null_precedence): cpdef Table stable_sort(Table source_table, list column_order, list null_precedence): """Sorts the table preserving order of equal elements. + For details, see :cpp:func:`stable_sort`. + Parameters ---------- source_table : Table diff --git a/python/pylibcudf/pylibcudf/stream_compaction.pyx b/python/pylibcudf/pylibcudf/stream_compaction.pyx index b574bfa9fa2..d5475ea79d5 100644 --- a/python/pylibcudf/pylibcudf/stream_compaction.pyx +++ b/python/pylibcudf/pylibcudf/stream_compaction.pyx @@ -25,6 +25,8 @@ from .table cimport Table cpdef Table drop_nulls(Table source_table, list keys, size_type keep_threshold): """Filters out rows from the input table based on the presence of nulls. + For details, see :cpp:func:`drop_nulls`. + Parameters ---------- source_table : Table @@ -53,6 +55,8 @@ cpdef Table drop_nulls(Table source_table, list keys, size_type keep_threshold): cpdef Table drop_nans(Table source_table, list keys, size_type keep_threshold): """Filters out rows from the input table based on the presence of NaNs. + For details, see :cpp:func:`drop_nans`. + Parameters ---------- source_table : Table @@ -81,6 +85,8 @@ cpdef Table drop_nans(Table source_table, list keys, size_type keep_threshold): cpdef Table apply_boolean_mask(Table source_table, Column boolean_mask): """Filters out rows from the input table based on a boolean mask. + For details, see :cpp:func:`apply_boolean_mask`. + Parameters ---------- source_table : Table @@ -111,6 +117,8 @@ cpdef Table unique( ): """Filter duplicate consecutive rows from the input table. + For details, see :cpp:func:`unique`. + Parameters ---------- input : Table @@ -153,6 +161,8 @@ cpdef Table distinct( ): """Get the distinct rows from the input table. + For details, see :cpp:func:`distinct`. + Parameters ---------- input : Table @@ -191,6 +201,8 @@ cpdef Column distinct_indices( ): """Get the indices of the distinct rows from the input table. + For details, see :cpp:func:`distinct_indices`. + Parameters ---------- input : Table @@ -226,6 +238,8 @@ cpdef Table stable_distinct( ): """Get the distinct rows from the input table, preserving input order. + For details, see :cpp:func:`stable_distinct`. + Parameters ---------- input : Table @@ -263,6 +277,8 @@ cpdef size_type unique_count( ): """Returns the number of unique consecutive elements in the input column. + For details, see :cpp:func:`unique_count`. + Parameters ---------- source : Column @@ -294,6 +310,8 @@ cpdef size_type distinct_count( ): """Returns the number of distinct elements in the input column. + For details, see :cpp:func:`distinct_count`. + Parameters ---------- source : Column diff --git a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt index b499a127541..77f20b0b917 100644 --- a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt @@ -12,8 +12,9 @@ # the License. 
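For the stream-compaction entry points referenced above, a minimal `apply_boolean_mask` sketch with made-up data:

```python
import pyarrow as pa
import pylibcudf as plc

tbl = plc.interop.from_arrow(pa.table({"a": [1, 2, 2], "b": ["x", "y", "z"]}))
mask = plc.interop.from_arrow(pa.array([True, False, True]))

# Keep only rows 0 and 2.
filtered = plc.stream_compaction.apply_boolean_mask(tbl, mask)
```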
# ============================================================================= -set(cython_sources capitalize.pyx case.pyx char_types.pyx contains.pyx find.pyx regex_flags.pyx - regex_program.pyx replace.pyx slice.pyx +set(cython_sources + capitalize.pyx case.pyx char_types.pyx contains.pyx extract.pyx find.pyx findall.pyx + regex_flags.pyx regex_program.pyx repeat.pyx replace.pyx side_type.pyx slice.pyx strip.pyx ) set(linked_libraries cudf::cudf) @@ -22,3 +23,5 @@ rapids_cython_create_modules( SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_strings_ ASSOCIATED_TARGETS cudf ) + +add_subdirectory(convert) diff --git a/python/pylibcudf/pylibcudf/strings/__init__.pxd b/python/pylibcudf/pylibcudf/strings/__init__.pxd index d1f632d6d8e..91d884b294b 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/__init__.pxd @@ -5,9 +5,14 @@ from . cimport ( case, char_types, contains, + convert, + extract, find, + findall, regex_flags, regex_program, replace, slice, + strip, ) +from .side_type cimport side_type diff --git a/python/pylibcudf/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py index ef102aff2af..b4856784390 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/__init__.py @@ -5,9 +5,15 @@ case, char_types, contains, + convert, + extract, find, + findall, regex_flags, regex_program, + repeat, replace, slice, + strip, ) +from .side_type import SideType diff --git a/python/pylibcudf/pylibcudf/strings/contains.pxd b/python/pylibcudf/pylibcudf/strings/contains.pxd index 2cd4891a0ea..6146a1119d6 100644 --- a/python/pylibcudf/pylibcudf/strings/contains.pxd +++ b/python/pylibcudf/pylibcudf/strings/contains.pxd @@ -1,7 +1,21 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from pylibcudf.column cimport Column +from pylibcudf.scalar cimport Scalar from pylibcudf.strings.regex_program cimport RegexProgram +ctypedef fused ColumnOrScalar: + Column + Scalar cpdef Column contains_re(Column input, RegexProgram prog) + +cpdef Column count_re(Column input, RegexProgram prog) + +cpdef Column matches_re(Column input, RegexProgram prog) + +cpdef Column like( + Column input, + ColumnOrScalar pattern, + Scalar escape_character = * +) diff --git a/python/pylibcudf/pylibcudf/strings/contains.pyx b/python/pylibcudf/pylibcudf/strings/contains.pyx index 1a2446f6e2c..82bd1fbea32 100644 --- a/python/pylibcudf/pylibcudf/strings/contains.pyx +++ b/python/pylibcudf/pylibcudf/strings/contains.pyx @@ -1,8 +1,14 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.utility cimport move +from cython.operator import dereference + from pylibcudf.column cimport Column from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.scalar.scalar_factories cimport ( + make_string_scalar as cpp_make_string_scalar, +) from pylibcudf.libcudf.strings cimport contains as cpp_contains from pylibcudf.strings.regex_program cimport RegexProgram @@ -32,9 +38,131 @@ cpdef Column contains_re( cdef unique_ptr[column] result with nogil: - result = cpp_contains.contains_re( + result = move(cpp_contains.contains_re( + input.view(), + prog.c_obj.get()[0] + )) + + return Column.from_libcudf(move(result)) + + +cpdef Column count_re( + Column input, + RegexProgram prog +): + """Returns the number of times the given regex_program's pattern + matches in each string. 
+ + For details, see :cpp:func:`cudf::strings::count_re`. + + Parameters + ---------- + input : Column + The input strings + prog : RegexProgram + Regex program instance + + Returns + ------- + pylibcudf.Column + New column of match counts for each string + """ + + cdef unique_ptr[column] result + + with nogil: + result = move(cpp_contains.count_re( input.view(), prog.c_obj.get()[0] + )) + + return Column.from_libcudf(move(result)) + + +cpdef Column matches_re( + Column input, + RegexProgram prog +): + """Returns a boolean column identifying rows which + matching the given regex_program object but only at + the beginning the string. + + For details, see :cpp:func:`cudf::strings::matches_re`. + + Parameters + ---------- + input : Column + The input strings + prog : RegexProgram + Regex program instance + + Returns + ------- + pylibcudf.Column + New column of boolean results for each string + """ + + cdef unique_ptr[column] result + + with nogil: + result = move(cpp_contains.matches_re( + input.view(), + prog.c_obj.get()[0] + )) + + return Column.from_libcudf(move(result)) + + +cpdef Column like(Column input, ColumnOrScalar pattern, Scalar escape_character=None): + """ + Returns a boolean column identifying rows which + match the given like pattern. + + For details, see :cpp:func:`cudf::strings::like`. + + Parameters + ---------- + input : Column + The input strings + pattern : Column or Scalar + Like patterns to match within each string + escape_character : Scalar + Optional character specifies the escape prefix. + Default is no escape character. + + Returns + ------- + pylibcudf.Column + New column of boolean results for each string + """ + cdef unique_ptr[column] result + + if escape_character is None: + escape_character = Scalar.from_libcudf( + cpp_make_string_scalar("".encode()) ) + cdef const string_scalar* c_escape_character = ( + escape_character.c_obj.get() + ) + cdef const string_scalar* c_pattern + + if ColumnOrScalar is Column: + with nogil: + result = move(cpp_contains.like( + input.view(), + pattern.view(), + dereference(c_escape_character) + )) + elif ColumnOrScalar is Scalar: + c_pattern = (pattern.c_obj.get()) + with nogil: + result = move(cpp_contains.like( + input.view(), + dereference(c_pattern), + dereference(c_escape_character) + )) + else: + raise ValueError("pattern must be a Column or a Scalar") + return Column.from_libcudf(move(result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt new file mode 100644 index 00000000000..175c9b3738e --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt @@ -0,0 +1,22 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. 
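The new `count_re`, `matches_re`, and `like` bindings above can be driven as follows; this mirrors the call pattern used by the tests later in this diff, with illustrative data:

```python
import pyarrow as pa
import pylibcudf as plc

arr = plc.interop.from_arrow(pa.array(["A1a2A3a4", "A1A2A3", None]))
prog = plc.strings.regex_program.RegexProgram.create(
    "[1-9][a-z]", plc.strings.regex_flags.RegexFlags.DEFAULT
)

counts = plc.strings.contains.count_re(arr, prog)      # matches per row
anchored = plc.strings.contains.matches_re(arr, prog)  # match only at the start of each row
liked = plc.strings.contains.like(
    arr, plc.interop.from_arrow(pa.scalar("%a%"))      # SQL LIKE-style scalar pattern
)
```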
+# ============================================================================= + +set(cython_sources convert_durations.pyx convert_datetime.pyx) + +set(linked_libraries cudf::cudf) +rapids_cython_create_modules( + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_strings_ ASSOCIATED_TARGETS cudf +) diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd new file mode 100644 index 00000000000..05324cb49df --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd @@ -0,0 +1,2 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from . cimport convert_datetime, convert_durations diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.py b/python/pylibcudf/pylibcudf/strings/convert/__init__.py new file mode 100644 index 00000000000..d803399d53c --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from . import convert_datetime, convert_durations diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd new file mode 100644 index 00000000000..07c84d263d6 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd @@ -0,0 +1,18 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.string cimport string +from pylibcudf.column cimport Column +from pylibcudf.types cimport DataType + + +cpdef Column to_timestamps( + Column input, + DataType timestamp_type, + const string& format +) + +cpdef Column from_timestamps( + Column input, + const string& format, + Column input_strings_names +) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx new file mode 100644 index 00000000000..fcacb096f87 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx @@ -0,0 +1,56 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from libcpp.memory cimport unique_ptr +from libcpp.string cimport string +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings.convert cimport ( + convert_datetime as cpp_convert_datetime, +) + +from pylibcudf.types import DataType + + +cpdef Column to_timestamps( + Column input, + DataType timestamp_type, + const string& format +): + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_convert_datetime.to_timestamps( + input.view(), + timestamp_type.c_obj, + format + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column from_timestamps( + Column input, + const string& format, + Column input_strings_names +): + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_convert_datetime.from_timestamps( + input.view(), + format, + input_strings_names.view() + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column is_timestamp( + Column input, + const string& format +): + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_convert_datetime.is_timestamp( + input.view(), + format + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pxd new file mode 100644 index 00000000000..ac11b8959ed --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pxd @@ -0,0 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.string cimport string +from pylibcudf.column cimport Column +from pylibcudf.types cimport DataType + + +cpdef Column to_durations( + Column input, + DataType duration_type, + const string& format +) + +cpdef Column from_durations( + Column input, + const string& format +) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx new file mode 100644 index 00000000000..f3e0b7c9c8e --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx @@ -0,0 +1,41 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.string cimport string +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings.convert cimport ( + convert_durations as cpp_convert_durations, +) + +from pylibcudf.types import DataType + + +cpdef Column to_durations( + Column input, + DataType duration_type, + const string& format +): + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_convert_durations.to_durations( + input.view(), + duration_type.c_obj, + format + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column from_durations( + Column input, + const string& format +): + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_convert_durations.from_durations( + input.view(), + format + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/extract.pxd b/python/pylibcudf/pylibcudf/strings/extract.pxd new file mode 100644 index 00000000000..3871f5a0e4e --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/extract.pxd @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
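A short sketch of the new datetime conversion binding above (`to_durations` follows the same pattern); data and format are illustrative:

```python
import pyarrow as pa
import pylibcudf as plc

dates = plc.interop.from_arrow(pa.array(["2011-01-01", "2011-01-02"]))

# Parse strings into second-resolution timestamps; the format is passed as bytes.
ts = plc.strings.convert.convert_datetime.to_timestamps(
    dates,
    plc.interop.from_arrow(pa.timestamp("s")),
    "%Y-%m-%d".encode(),
)
```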
+ +from pylibcudf.column cimport Column +from pylibcudf.strings.regex_program cimport RegexProgram +from pylibcudf.table cimport Table + + +cpdef Table extract(Column input, RegexProgram prog) + +cpdef Column extract_all_record(Column input, RegexProgram prog) diff --git a/python/pylibcudf/pylibcudf/strings/extract.pyx b/python/pylibcudf/pylibcudf/strings/extract.pyx new file mode 100644 index 00000000000..dcb11ca10ce --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/extract.pyx @@ -0,0 +1,76 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings cimport extract as cpp_extract +from pylibcudf.libcudf.table.table cimport table +from pylibcudf.strings.regex_program cimport RegexProgram +from pylibcudf.table cimport Table + + +cpdef Table extract(Column input, RegexProgram prog): + """ + Returns a table of strings columns where each column + corresponds to the matching group specified in the given + egex_program object. + + For details, see :cpp:func:`cudf::strings::extract`. + + Parameters + ---------- + input : Column + Strings instance for this operation + prog : RegexProgram + Regex program instance + + Returns + ------- + Table + Columns of strings extracted from the input column. + """ + cdef unique_ptr[table] c_result + + with nogil: + c_result = move( + cpp_extract.extract( + input.view(), + prog.c_obj.get()[0] + ) + ) + + return Table.from_libcudf(move(c_result)) + + +cpdef Column extract_all_record(Column input, RegexProgram prog): + """ + Returns a lists column of strings where each string column + row corresponds to the matching group specified in the given + regex_program object. + + For details, see :cpp:func:`cudf::strings::extract_all_record`. + + Parameters + ---------- + input : Column + Strings instance for this operation + prog : RegexProgram + Regex program instance + + Returns + ------- + Column + Lists column containing strings extracted from the input column + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_extract.extract_all_record( + input.view(), + prog.c_obj.get()[0] + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/findall.pxd b/python/pylibcudf/pylibcudf/strings/findall.pxd new file mode 100644 index 00000000000..54afa088141 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/findall.pxd @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.strings.regex_program cimport RegexProgram + + +cpdef Column findall(Column input, RegexProgram pattern) diff --git a/python/pylibcudf/pylibcudf/strings/findall.pyx b/python/pylibcudf/pylibcudf/strings/findall.pyx new file mode 100644 index 00000000000..3a6b87504b3 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/findall.pyx @@ -0,0 +1,40 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings cimport findall as cpp_findall +from pylibcudf.strings.regex_program cimport RegexProgram + + +cpdef Column findall(Column input, RegexProgram pattern): + """ + Returns a lists column of strings for each matching occurrence using + the regex_program pattern within each string. 
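The two extraction bindings above can be exercised like this (illustrative data, same call pattern as the tests added later in this diff):

```python
import pyarrow as pa
import pylibcudf as plc

arr = plc.interop.from_arrow(pa.array(["a1", "b2", "c3"]))
prog = plc.strings.regex_program.RegexProgram.create(
    "([ab])(\\d)", plc.strings.regex_flags.RegexFlags.DEFAULT
)

groups = plc.strings.extract.extract(arr, prog)              # Table: one column per capture group
per_row = plc.strings.extract.extract_all_record(arr, prog)  # lists column of matched groups per row
```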
+ + For details, see :cpp:func:`cudf::strings::findall`. + + Parameters + ---------- + input : Column + Strings instance for this operation + pattern : RegexProgram + Regex pattern + + Returns + ------- + Column + New lists column of strings + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_findall.findall( + input.view(), + pattern.c_obj.get()[0] + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/repeat.pxd b/python/pylibcudf/pylibcudf/strings/repeat.pxd new file mode 100644 index 00000000000..bc70926b6fa --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/repeat.pxd @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.libcudf.types cimport size_type + +ctypedef fused ColumnorSizeType: + Column + size_type + +cpdef Column repeat_strings(Column input, ColumnorSizeType repeat_times) diff --git a/python/pylibcudf/pylibcudf/strings/repeat.pyx b/python/pylibcudf/pylibcudf/strings/repeat.pyx new file mode 100644 index 00000000000..5f627218f6e --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/repeat.pyx @@ -0,0 +1,51 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings cimport repeat as cpp_repeat +from pylibcudf.libcudf.types cimport size_type + + +cpdef Column repeat_strings(Column input, ColumnorSizeType repeat_times): + """ + Repeat each string in the given strings column by the numbers + of times given in another numeric column. + + For details, see :cpp:func:`cudf::strings::repeat`. + + Parameters + ---------- + input : Column + The column containing strings to repeat. + repeat_times : Column or int + Number(s) of times that the corresponding input strings + for each row are repeated. + + Returns + ------- + Column + New column containing the repeated strings. + """ + cdef unique_ptr[column] c_result + + if ColumnorSizeType is Column: + with nogil: + c_result = move( + cpp_repeat.repeat_strings( + input.view(), + repeat_times.view() + ) + ) + elif ColumnorSizeType is size_type: + with nogil: + c_result = move( + cpp_repeat.repeat_strings( + input.view(), + repeat_times + ) + ) + else: + raise ValueError("repeat_times must be size_type or integer") + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/side_type.pxd b/python/pylibcudf/pylibcudf/strings/side_type.pxd new file mode 100644 index 00000000000..34b7a580380 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/side_type.pxd @@ -0,0 +1,3 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.libcudf.strings.side_type cimport side_type diff --git a/python/pylibcudf/pylibcudf/strings/side_type.pyx b/python/pylibcudf/pylibcudf/strings/side_type.pyx new file mode 100644 index 00000000000..acdc7d6ff1f --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/side_type.pyx @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.libcudf.strings.side_type import \ + side_type as SideType # no-cython-lint diff --git a/python/pylibcudf/pylibcudf/strings/strip.pxd b/python/pylibcudf/pylibcudf/strings/strip.pxd new file mode 100644 index 00000000000..8bbe4753edd --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/strip.pxd @@ -0,0 +1,12 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
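A combined sketch of the new `findall` and `repeat_strings` bindings (illustrative data; the integer form of `repeat_times` relies on the fused `ColumnorSizeType` shown above):

```python
import pyarrow as pa
import pylibcudf as plc

arr = plc.interop.from_arrow(pa.array(["bunny", "rabbit", "hare"]))
prog = plc.strings.regex_program.RegexProgram.create(
    "[ab]", plc.strings.regex_flags.RegexFlags.DEFAULT
)

matches = plc.strings.findall.findall(arr, prog)     # lists column of per-row regex matches
doubled = plc.strings.repeat.repeat_strings(arr, 2)  # each string repeated twice
```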
+ +from pylibcudf.column cimport Column +from pylibcudf.scalar cimport Scalar +from pylibcudf.strings.side_type cimport side_type + + +cpdef Column strip( + Column input, + side_type side=*, + Scalar to_strip=* +) diff --git a/python/pylibcudf/pylibcudf/strings/strip.pyx b/python/pylibcudf/pylibcudf/strings/strip.pyx new file mode 100644 index 00000000000..429a23c3cdf --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/strip.pyx @@ -0,0 +1,60 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from cython.operator cimport dereference +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.scalar.scalar_factories cimport ( + make_string_scalar as cpp_make_string_scalar, +) +from pylibcudf.libcudf.strings cimport strip as cpp_strip +from pylibcudf.scalar cimport Scalar +from pylibcudf.strings.side_type cimport side_type + + +cpdef Column strip( + Column input, + side_type side=side_type.BOTH, + Scalar to_strip=None +): + """Removes the specified characters from the beginning + or end (or both) of each string. + + For details, see :cpp:func:`cudf::strings::strip`. + + Parameters + ---------- + input : Column + Strings column for this operation + side : SideType, default SideType.BOTH + Indicates characters are to be stripped from the beginning, + end, or both of each string; Default is both + to_strip : Scalar + UTF-8 encoded characters to strip from each string; + Default is empty string which indicates strip whitespace characters + + Returns + ------- + pylibcudf.Column + New strings column. + """ + + if to_strip is None: + to_strip = Scalar.from_libcudf( + cpp_make_string_scalar("".encode()) + ) + + cdef unique_ptr[column] c_result + cdef string_scalar* cpp_to_strip + cpp_to_strip = (to_strip.c_obj.get()) + + with nogil: + c_result = cpp_strip.strip( + input.view(), + side, + dereference(cpp_to_strip) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/pytest.ini b/python/pylibcudf/pylibcudf/tests/pytest.ini index 1761c0f011c..f572f85ca49 100644 --- a/python/pylibcudf/pylibcudf/tests/pytest.ini +++ b/python/pylibcudf/pylibcudf/tests/pytest.ini @@ -6,3 +6,4 @@ filterwarnings = error ignore:::.*xdist.* ignore:::.*pytest.* +addopts = --tb=native diff --git a/python/pylibcudf/pylibcudf/tests/test_datetime.py b/python/pylibcudf/pylibcudf/tests/test_datetime.py index d3aa6101e2d..89c96829e71 100644 --- a/python/pylibcudf/pylibcudf/tests/test_datetime.py +++ b/python/pylibcudf/pylibcudf/tests/test_datetime.py @@ -1,6 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
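The new `strip` binding above accepts the defaults shown in its signature; a minimal sketch with made-up strings:

```python
import pyarrow as pa
import pylibcudf as plc

arr = plc.interop.from_arrow(pa.array(["  hello  ", "\tworld\n"]))

both = plc.strings.strip.strip(arr)  # defaults: SideType.BOTH, strip whitespace
left = plc.strings.strip.strip(
    arr, plc.strings.SideType.LEFT, plc.interop.from_arrow(pa.scalar(" "))
)
```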
import datetime +import functools import pyarrow as pa import pyarrow.compute as pc @@ -10,7 +11,7 @@ @pytest.fixture -def column(has_nulls): +def date_column(has_nulls): values = [ datetime.date(1999, 1, 1), datetime.date(2024, 10, 12), @@ -22,9 +23,41 @@ def column(has_nulls): return plc.interop.from_arrow(pa.array(values, type=pa.date32())) -def test_extract_year(column): - got = plc.datetime.extract_year(column) +@pytest.fixture(scope="module", params=["s", "ms", "us", "ns"]) +def datetime_column(has_nulls, request): + values = [ + datetime.datetime(1999, 1, 1), + datetime.datetime(2024, 10, 12), + datetime.datetime(1970, 1, 1), + datetime.datetime(2260, 1, 1), + datetime.datetime(2024, 2, 29, 3, 14, 15), + datetime.datetime(2024, 2, 29, 3, 14, 15, 999), + ] + if has_nulls: + values[2] = None + return plc.interop.from_arrow( + pa.array(values, type=pa.timestamp(request.param)) + ) + + +@pytest.mark.parametrize( + "component, pc_fun", + [ + ("year", pc.year), + ("month", pc.month), + ("day", pc.day), + ("weekday", functools.partial(pc.day_of_week, count_from_zero=False)), + ("hour", pc.hour), + ("minute", pc.minute), + ("second", pc.second), + ("millisecond", pc.millisecond), + ("microsecond", pc.microsecond), + ("nanosecond", pc.nanosecond), + ], +) +def test_extraction(datetime_column, component, pc_fun): + got = plc.datetime.extract_datetime_component(datetime_column, component) # libcudf produces an int16, arrow produces an int64 - expect = pc.year(plc.interop.to_arrow(column)).cast(pa.int16()) + expect = pc_fun(plc.interop.to_arrow(datetime_column)).cast(pa.int16()) assert_column_eq(expect, got) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_contains.py b/python/pylibcudf/pylibcudf/tests/test_string_contains.py index 4f88e09183f..4e4dd7cbb00 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_contains.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_contains.py @@ -48,3 +48,40 @@ def test_contains_re(target_col, pa_target_scalar, plc_target_pat): pa_target_col, pa_target_scalar.as_py() ) assert_column_eq(got, expected) + + +def test_count_re(): + pattern = "[1-9][a-z]" + arr = pa.array(["A1a2A3a4", "A1A2A3", None]) + result = plc.strings.contains.count_re( + plc.interop.from_arrow(arr), + plc.strings.regex_program.RegexProgram.create( + pattern, plc.strings.regex_flags.RegexFlags.DEFAULT + ), + ) + expected = pc.count_substring_regex(arr, pattern) + assert_column_eq(result, expected) + + +def test_match_re(): + pattern = "[1-9][a-z]" + arr = pa.array(["1a2b", "b1a2", None]) + result = plc.strings.contains.matches_re( + plc.interop.from_arrow(arr), + plc.strings.regex_program.RegexProgram.create( + pattern, plc.strings.regex_flags.RegexFlags.DEFAULT + ), + ) + expected = pc.match_substring_regex(arr, f"^{pattern}") + assert_column_eq(result, expected) + + +def test_like(): + pattern = "%a" + arr = pa.array(["1a2aa3aaa"]) + result = plc.strings.contains.like( + plc.interop.from_arrow(arr), + plc.interop.from_arrow(pa.array([pattern])), + ) + expected = pc.match_like(arr, pattern) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert.py b/python/pylibcudf/pylibcudf/tests/test_string_convert.py new file mode 100644 index 00000000000..e9e95459d0e --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert.py @@ -0,0 +1,85 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
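The parametrized test above drives `extract_datetime_component` with plain component names; the same call pattern as a standalone sketch (illustrative values):

```python
import datetime

import pyarrow as pa
import pylibcudf as plc

col = plc.interop.from_arrow(
    pa.array([datetime.datetime(2024, 2, 29, 3, 14, 15)], type=pa.timestamp("us"))
)

years = plc.datetime.extract_datetime_component(col, "year")      # int16 column: [2024]
minutes = plc.datetime.extract_datetime_component(col, "minute")  # int16 column: [14]
```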
+ +from datetime import datetime + +import pyarrow as pa +import pylibcudf as plc +import pytest +from utils import assert_column_eq + + +@pytest.fixture( + scope="module", + params=[ + pa.timestamp("ns"), + pa.timestamp("us"), + pa.timestamp("ms"), + pa.timestamp("s"), + ], +) +def timestamp_type(request): + return request.param + + +@pytest.fixture( + scope="module", + params=[ + pa.duration("ns"), + pa.duration("us"), + pa.duration("ms"), + pa.duration("s"), + ], +) +def duration_type(request): + return request.param + + +@pytest.fixture(scope="module") +def pa_timestamp_col(): + return pa.array(["2011-01-01", "2011-01-02", "2011-01-03"]) + + +@pytest.fixture(scope="module") +def pa_duration_col(): + return pa.array(["05:20:25"]) + + +@pytest.fixture(scope="module") +def plc_timestamp_col(pa_timestamp_col): + return plc.interop.from_arrow(pa_timestamp_col) + + +@pytest.fixture(scope="module") +def plc_duration_col(pa_duration_col): + return plc.interop.from_arrow(pa_duration_col) + + +@pytest.mark.parametrize("format", ["%Y-%m-%d"]) +def test_to_datetime( + pa_timestamp_col, plc_timestamp_col, timestamp_type, format +): + expect = pa.compute.strptime(pa_timestamp_col, format, timestamp_type.unit) + got = plc.strings.convert.convert_datetime.to_timestamps( + plc_timestamp_col, + plc.interop.from_arrow(timestamp_type), + format.encode(), + ) + assert_column_eq(expect, got) + + +@pytest.mark.parametrize("format", ["%H:%M:%S"]) +def test_to_duration(pa_duration_col, plc_duration_col, duration_type, format): + def to_timedelta(duration_str): + date = datetime.strptime(duration_str, format) + return date - datetime(1900, 1, 1) # "%H:%M:%S" zero date + + expect = pa.array([to_timedelta(d.as_py()) for d in pa_duration_col]).cast( + duration_type + ) + + got = plc.strings.convert.convert_durations.to_durations( + plc_duration_col, + plc.interop.from_arrow(duration_type), + format.encode(), + ) + assert_column_eq(expect, got) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_extract.py b/python/pylibcudf/pylibcudf/tests/test_string_extract.py new file mode 100644 index 00000000000..788b86423c4 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_extract.py @@ -0,0 +1,38 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import pyarrow as pa +import pyarrow.compute as pc +import pylibcudf as plc + + +def test_extract(): + pattern = "([ab])(\\d)" + pa_pattern = "(?P[ab])(?P\\d)" + arr = pa.array(["a1", "b2", "c3"]) + plc_result = plc.strings.extract.extract( + plc.interop.from_arrow(arr), + plc.strings.regex_program.RegexProgram.create( + pattern, plc.strings.regex_flags.RegexFlags.DEFAULT + ), + ) + result = plc.interop.to_arrow(plc_result) + expected = pc.extract_regex(arr, pa_pattern) + for i, result_col in enumerate(result.itercolumns()): + expected_col = pa.chunked_array(expected.field(i)) + assert result_col.fill_null("").equals(expected_col) + + +def test_extract_all_record(): + pattern = "([ab])(\\d)" + arr = pa.array(["a1", "b2", "c3"]) + plc_result = plc.strings.extract.extract_all_record( + plc.interop.from_arrow(arr), + plc.strings.regex_program.RegexProgram.create( + pattern, plc.strings.regex_flags.RegexFlags.DEFAULT + ), + ) + result = plc.interop.to_arrow(plc_result) + expected = pa.chunked_array( + [pa.array([["a", "1"], ["b", "2"], None], type=result.type)] + ) + assert result.equals(expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_findall.py b/python/pylibcudf/pylibcudf/tests/test_string_findall.py new file mode 100644 index 00000000000..994552fa276 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_findall.py @@ -0,0 +1,23 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +import re + +import pyarrow as pa +import pylibcudf as plc +from utils import assert_column_eq + + +def test_findall(): + arr = pa.array(["bunny", "rabbit", "hare", "dog"]) + pattern = "[ab]" + result = plc.strings.findall.findall( + plc.interop.from_arrow(arr), + plc.strings.regex_program.RegexProgram.create( + pattern, plc.strings.regex_flags.RegexFlags.DEFAULT + ), + ) + pa_result = plc.interop.to_arrow(result) + expected = pa.array( + [re.findall(pattern, elem) for elem in arr.to_pylist()], + type=pa_result.type, + ) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_repeat.py b/python/pylibcudf/pylibcudf/tests/test_string_repeat.py new file mode 100644 index 00000000000..18b5d8bf4d0 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_repeat.py @@ -0,0 +1,20 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pyarrow.compute as pc +import pylibcudf as plc +import pytest + + +@pytest.mark.parametrize("repeats", [pa.array([2, 2]), 2]) +def test_repeat_strings(repeats): + arr = pa.array(["1", None]) + plc_result = plc.strings.repeat.repeat_strings( + plc.interop.from_arrow(arr), + plc.interop.from_arrow(repeats) + if not isinstance(repeats, int) + else repeats, + ) + result = plc.interop.to_arrow(plc_result) + expected = pa.chunked_array(pc.binary_repeat(arr, repeats)) + assert result.equals(expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_strip.py b/python/pylibcudf/pylibcudf/tests/test_string_strip.py new file mode 100644 index 00000000000..005e5e4a405 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_strip.py @@ -0,0 +1,122 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pylibcudf as plc +import pytest +from utils import assert_column_eq + +data_strings = [ + "AbC", + "123abc", + "", + " ", + None, + "aAaaaAAaa", + " ab c ", + "abc123", + " ", + "\tabc\t", + "\nabc\n", + "\r\nabc\r\n", + "\t\n abc \n\t", + "!@#$%^&*()", + " abc!!! ", + " abc\t\n!!! 
", + "__abc__", + "abc\n\n", + "123abc456", + "abcxyzabc", +] + +strip_chars = [ + "a", + "", + " ", + "\t", + "\n", + "\r\n", + "!", + "@#", + "123", + "xyz", + "abc", + "__", + " \t\n", + "abc123", +] + + +@pytest.fixture +def pa_col(): + return pa.array(data_strings, type=pa.string()) + + +@pytest.fixture +def plc_col(pa_col): + return plc.interop.from_arrow(pa_col) + + +@pytest.fixture(params=strip_chars) +def pa_char(request): + return pa.scalar(request.param, type=pa.string()) + + +@pytest.fixture +def plc_char(pa_char): + return plc.interop.from_arrow(pa_char) + + +def test_strip(pa_col, plc_col, pa_char, plc_char): + def strip_string(st, char): + if st is None: + return None + + elif char == "": + return st.strip() + return st.strip(char) + + expected = pa.array( + [strip_string(x, pa_char.as_py()) for x in pa_col.to_pylist()], + type=pa.string(), + ) + + got = plc.strings.strip.strip(plc_col, plc.strings.SideType.BOTH, plc_char) + assert_column_eq(expected, got) + + +def test_strip_right(pa_col, plc_col, pa_char, plc_char): + def strip_string(st, char): + if st is None: + return None + + elif char == "": + return st.rstrip() + return st.rstrip(char) + + expected = pa.array( + [strip_string(x, pa_char.as_py()) for x in pa_col.to_pylist()], + type=pa.string(), + ) + + got = plc.strings.strip.strip( + plc_col, plc.strings.SideType.RIGHT, plc_char + ) + assert_column_eq(expected, got) + + +def test_strip_left(pa_col, plc_col, pa_char, plc_char): + def strip_string(st, char): + if st is None: + return None + + elif char == "": + return st.lstrip() + return st.lstrip(char) + + expected = pa.array( + [strip_string(x, pa_char.as_py()) for x in pa_col.to_pylist()], + type=pa.string(), + ) + + got = plc.strings.strip.strip(plc_col, plc.strings.SideType.LEFT, plc_char) + assert_column_eq(expected, got) diff --git a/python/pylibcudf/pylibcudf/tests/test_transform.py b/python/pylibcudf/pylibcudf/tests/test_transform.py index 06fc35d8835..d5c618f07e4 100644 --- a/python/pylibcudf/pylibcudf/tests/test_transform.py +++ b/python/pylibcudf/pylibcudf/tests/test_transform.py @@ -29,3 +29,54 @@ def test_nans_to_nulls(has_nans): got = input.with_mask(mask, null_count) assert_column_eq(expect, got) + + +def test_bools_to_mask_roundtrip(): + pa_array = pa.array([True, None, False]) + plc_input = plc.interop.from_arrow(pa_array) + mask, result_null_count = plc.transform.bools_to_mask(plc_input) + + assert result_null_count == 2 + result = plc_input.with_mask(mask, result_null_count) + assert_column_eq(pa.array([True, None, None]), result) + + plc_output = plc.transform.mask_to_bools(mask.ptr, 0, len(pa_array)) + result_pa = plc.interop.to_arrow(plc_output) + expected_pa = pa.chunked_array([[True, False, False]]) + assert result_pa.equals(expected_pa) + + +def test_encode(): + pa_table = pa.table({"a": [1, 3, 4], "b": [1, 2, 4]}) + plc_input = plc.interop.from_arrow(pa_table) + result_table, result_column = plc.transform.encode(plc_input) + pa_table_result = plc.interop.to_arrow(result_table) + pa_column_result = plc.interop.to_arrow(result_column) + + pa_table_expected = pa.table( + [[1, 3, 4], [1, 2, 4]], + schema=pa.schema( + [ + pa.field("", pa.int64(), nullable=False), + pa.field("", pa.int64(), nullable=False), + ] + ), + ) + assert pa_table_result.equals(pa_table_expected) + + pa_column_expected = pa.chunked_array([[0, 1, 2]], type=pa.int32()) + assert pa_column_result.equals(pa_column_expected) + + +def test_one_hot_encode(): + pa_column = pa.array([1, 2, 3]) + pa_categories = pa.array([0, 0, 0]) + 
plc_input = plc.interop.from_arrow(pa_column) + plc_categories = plc.interop.from_arrow(pa_categories) + plc_table = plc.transform.one_hot_encode(plc_input, plc_categories) + result = plc.interop.to_arrow(plc_table) + expected = pa.table( + [[False] * 3] * 3, + schema=pa.schema([pa.field("", pa.bool_(), nullable=False)] * 3), + ) + assert result.equals(expected) diff --git a/python/pylibcudf/pylibcudf/transform.pxd b/python/pylibcudf/pylibcudf/transform.pxd index 4b21feffe25..b530f433c97 100644 --- a/python/pylibcudf/pylibcudf/transform.pxd +++ b/python/pylibcudf/pylibcudf/transform.pxd @@ -1,7 +1,21 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp cimport bool +from pylibcudf.libcudf.types cimport bitmask_type, data_type from .column cimport Column from .gpumemoryview cimport gpumemoryview +from .table cimport Table +from .types cimport DataType cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input) + +cpdef tuple[gpumemoryview, int] bools_to_mask(Column input) + +cpdef Column mask_to_bools(Py_ssize_t bitmask, int begin_bit, int end_bit) + +cpdef Column transform(Column input, str unary_udf, DataType output_type, bool is_ptx) + +cpdef tuple[Table, Column] encode(Table input) + +cpdef Table one_hot_encode(Column input_column, Column categories) diff --git a/python/pylibcudf/pylibcudf/transform.pyx b/python/pylibcudf/pylibcudf/transform.pyx index 100ccb580ce..de425a27c15 100644 --- a/python/pylibcudf/pylibcudf/transform.pyx +++ b/python/pylibcudf/pylibcudf/transform.pyx @@ -1,19 +1,27 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr +from libcpp.string cimport string from libcpp.utility cimport move, pair from pylibcudf.libcudf cimport transform as cpp_transform -from pylibcudf.libcudf.types cimport size_type +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.table.table cimport table +from pylibcudf.libcudf.table.table_view cimport table_view +from pylibcudf.libcudf.types cimport bitmask_type, size_type from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer from .column cimport Column from .gpumemoryview cimport gpumemoryview +from .types cimport DataType +from .utils cimport int_to_bitmask_ptr cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input): """Create a null mask preserving existing nulls and converting nans to null. + For details, see :cpp:func:`nans_to_nulls`. + Parameters ---------- input : Column @@ -32,3 +40,141 @@ cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input): gpumemoryview(DeviceBuffer.c_from_unique_ptr(move(c_result.first))), c_result.second ) + + +cpdef tuple[gpumemoryview, int] bools_to_mask(Column input): + """Create a bitmask from a column of boolean elements + + Parameters + ---------- + input : Column + Column to produce new mask from. + + Returns + ------- + tuple[gpumemoryview, int] + Two-tuple of a gpumemoryview wrapping the bitmask and the null count. + """ + cdef pair[unique_ptr[device_buffer], size_type] c_result + + with nogil: + c_result = move(cpp_transform.bools_to_mask(input.view())) + + return ( + gpumemoryview(DeviceBuffer.c_from_unique_ptr(move(c_result.first))), + c_result.second + ) + + +cpdef Column mask_to_bools(Py_ssize_t bitmask, int begin_bit, int end_bit): + """Creates a boolean column from given bitmask. 
+ + Parameters + ---------- + bitmask : int + Pointer to the bitmask which needs to be converted + begin_bit : int + Position of the bit from which the conversion should start + end_bit : int + Position of the bit before which the conversion should stop + + Returns + ------- + Column + Boolean column of the bitmask from [begin_bit, end_bit] + """ + cdef unique_ptr[column] c_result + cdef bitmask_type * bitmask_ptr = int_to_bitmask_ptr(bitmask) + + with nogil: + c_result = move(cpp_transform.mask_to_bools(bitmask_ptr, begin_bit, end_bit)) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column transform(Column input, str unary_udf, DataType output_type, bool is_ptx): + """Create a new column by applying a unary function against every + element of an input column. + + Parameters + ---------- + input : Column + Column to transform. + unary_udf : str + The PTX/CUDA string of the unary function to apply. + output_type : DataType + The output type that is compatible with the output type in the unary_udf. + is_ptx : bool + If `True`, the UDF is treated as PTX code. + If `False`, the UDF is treated as CUDA code. + + Returns + ------- + Column + The transformed column having the UDF applied to each element. + """ + cdef unique_ptr[column] c_result + cdef string c_unary_udf = unary_udf.encode() + cdef bool c_is_ptx = is_ptx + + with nogil: + c_result = move( + cpp_transform.transform( + input.view(), c_unary_udf, output_type.c_obj, c_is_ptx + ) + ) + + return Column.from_libcudf(move(c_result)) + +cpdef tuple[Table, Column] encode(Table input): + """Encode the rows of the given table as integers. + + Parameters + ---------- + input : Table + Table containing values to be encoded + + Returns + ------- + tuple[Table, Column] + The distinct rows of the input table in sorted order, + and a column of integer indices representing the encoded rows. + """ + cdef pair[unique_ptr[table], unique_ptr[column]] c_result + + with nogil: + c_result = move(cpp_transform.encode(input.view())) + + return ( + Table.from_libcudf(move(c_result.first)), + Column.from_libcudf(move(c_result.second)) + ) + +cpdef Table one_hot_encode(Column input, Column categories): + """Encodes `input` by generating a new column + for each value in `categories` indicating the presence + of that value in `input`. + + Parameters + ---------- + input : Column + Column containing values to be encoded. + categories : Column + Column containing categories. + + Returns + ------- + Table + A table of the encoded values. + """ + cdef pair[unique_ptr[column], table_view] c_result + cdef Table owner_table + + with nogil: + c_result = move(cpp_transform.one_hot_encode(input.view(), categories.view())) + + owner_table = Table( + [Column.from_libcudf(move(c_result.first))] * c_result.second.num_columns() + ) + + return Table.from_table_view(c_result.second, owner_table)
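Finally, a small sketch of the new `encode` and `one_hot_encode` bindings, mirroring the tests earlier in this diff with illustrative data:

```python
import pyarrow as pa
import pylibcudf as plc

tbl = plc.interop.from_arrow(pa.table({"a": [1, 3, 4], "b": [1, 2, 4]}))
keys, codes = plc.transform.encode(tbl)  # distinct rows plus an integer code per input row

col = plc.interop.from_arrow(pa.array([1, 2, 3]))
cats = plc.interop.from_arrow(pa.array([1, 3]))
onehot = plc.transform.one_hot_encode(col, cats)  # Table: one boolean column per category
```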