diff --git a/.github/labeler.yml b/.github/labeler.yml index 41fd3802d55..3a868ac7d45 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -3,14 +3,14 @@ cuDF (Python): - 'python/**' - 'notebooks/**' - + libcudf: - 'cpp/**' CMake: - '**/CMakeLists.txt' - '**/cmake/**' - + cuDF (Java): - 'java/**' diff --git a/.github/workflows/add_to_project.yml b/.github/workflows/add_to_project.yml new file mode 100644 index 00000000000..b301c56a999 --- /dev/null +++ b/.github/workflows/add_to_project.yml @@ -0,0 +1,20 @@ +name: Add new issue/PR to project + +on: + issues: + types: + - opened + + pull_request_target: + types: + - opened + +jobs: + add-to-project: + name: Add issue or PR to project + runs-on: ubuntu-latest + steps: + - uses: actions/add-to-project@v0.3.0 + with: + project-url: https://github.com/orgs/rapidsai/projects/51 + github-token: ${{ secrets.ADD_TO_PROJECT_GITHUB_TOKEN }} diff --git a/.github/workflows/dependency-files.yml b/.github/workflows/dependency-files.yml new file mode 100644 index 00000000000..2ae939292d7 --- /dev/null +++ b/.github/workflows/dependency-files.yml @@ -0,0 +1,12 @@ +name: pr + +on: + pull_request: + +jobs: + checks: + secrets: inherit + uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@main + with: + enable_check_size: false + enable_check_style: false diff --git a/.github/workflows/stale.yaml b/.github/workflows/stale.yaml deleted file mode 100644 index 741e159fbd8..00000000000 --- a/.github/workflows/stale.yaml +++ /dev/null @@ -1,57 +0,0 @@ -name: Mark inactive issues and pull requests - -on: - schedule: - - cron: "0 * * * *" - -jobs: - mark-inactive-30d: - runs-on: ubuntu-latest - steps: - - name: Mark 30 day inactive issues - uses: actions/stale@v3 - with: - repo-token: ${{ secrets.GITHUB_TOKEN }} - stale-issue-message: > - This issue has been labeled `inactive-30d` due to no recent activity in the past 30 days. - Please close this issue if no further response or action is needed. 
- Otherwise, please respond with a comment indicating any updates or changes to the original issue and/or confirm this issue still needs to be addressed. - This issue will be labeled `inactive-90d` if there is no activity in the next 60 days. - stale-issue-label: "inactive-30d" - exempt-issue-labels: "0 - Blocked,0 - Backlog,good first issue" - days-before-issue-stale: 30 - days-before-issue-close: -1 - stale-pr-message: > - This PR has been labeled `inactive-30d` due to no recent activity in the past 30 days. - Please close this PR if it is no longer required. - Otherwise, please respond with a comment indicating any updates. - This PR will be labeled `inactive-90d` if there is no activity in the next 60 days. - stale-pr-label: "inactive-30d" - exempt-pr-labels: "0 - Blocked,0 - Backlog,good first issue" - days-before-pr-stale: 30 - days-before-pr-close: -1 - operations-per-run: 50 - mark-inactive-90d: - runs-on: ubuntu-latest - steps: - - name: Mark 90 day inactive issues - uses: actions/stale@v3 - with: - repo-token: ${{ secrets.GITHUB_TOKEN }} - stale-issue-message: > - This issue has been labeled `inactive-90d` due to no recent activity in the past 90 days. - Please close this issue if no further response or action is needed. - Otherwise, please respond with a comment indicating any updates or changes to the original issue and/or confirm this issue still needs to be addressed. - stale-issue-label: "inactive-90d" - exempt-issue-labels: "0 - Blocked,0 - Backlog,good first issue" - days-before-issue-stale: 90 - days-before-issue-close: -1 - stale-pr-message: > - This PR has been labeled `inactive-90d` due to no recent activity in the past 90 days. - Please close this PR if it is no longer required. - Otherwise, please respond with a comment indicating any updates. 
- stale-pr-label: "inactive-90d" - exempt-pr-labels: "0 - Blocked,0 - Backlog,good first issue" - days-before-pr-stale: 90 - days-before-pr-close: -1 - operations-per-run: 50 diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml new file mode 100644 index 00000000000..7f1c708c9a7 --- /dev/null +++ b/.github/workflows/wheels.yml @@ -0,0 +1,77 @@ +name: cuDF wheels + +on: + workflow_call: + inputs: + versioneer-override: + type: string + default: '' + build-tag: + type: string + default: '' + branch: + required: true + type: string + date: + required: true + type: string + sha: + required: true + type: string + build-type: + type: string + default: nightly + +concurrency: + group: "cudf-${{ github.workflow }}-${{ github.ref }}" + cancel-in-progress: true + +jobs: + cudf-wheels: + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux.yml@main + with: + repo: rapidsai/cudf + + build-type: ${{ inputs.build-type }} + branch: ${{ inputs.branch }} + sha: ${{ inputs.sha }} + date: ${{ inputs.date }} + + package-dir: python/cudf + package-name: cudf + + python-package-versioneer-override: ${{ inputs.versioneer-override }} + python-package-build-tag: ${{ inputs.build-tag }} + + skbuild-configure-options: "-DCUDF_BUILD_WHEELS=ON -DDETECT_CONDA_ENV=OFF" + + test-extras: test + + # Have to manually specify the cupy install location on arm. 
+ # Have to also manually install tokenizers==0.10.2, which is the last tokenizers + # to have a binary aarch64 wheel available on PyPI + # Otherwise, the tokenizers sdist is used, which needs a Rust compiler + test-before-arm64: "pip install tokenizers==0.10.2 cupy-cuda11x -f https://pip.cupy.dev/aarch64" + + test-unittest: "pytest -v -n 8 ./python/cudf/cudf/tests" + secrets: inherit + dask_cudf-wheel: + needs: cudf-wheels + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure.yml@main + with: + repo: rapidsai/cudf + + build-type: ${{ inputs.build-type }} + branch: ${{ inputs.branch }} + sha: ${{ inputs.sha }} + date: ${{ inputs.date }} + + package-dir: python/dask_cudf + package-name: dask_cudf + + python-package-versioneer-override: ${{ inputs.versioneer-override }} + python-package-build-tag: ${{ inputs.build-tag }} + + test-extras: test + test-unittest: "pytest -v -n 8 ./python/dask_cudf/dask_cudf/tests" + secrets: inherit diff --git a/.gitignore b/.gitignore index 0d63c76bf9f..1867e65b7be 100644 --- a/.gitignore +++ b/.gitignore @@ -70,7 +70,6 @@ junit-cudf.xml test-results ## Patching -*.diff *.orig *.rej @@ -166,3 +165,8 @@ dask-worker-space/ # Sphinx docs & build artifacts docs/cudf/source/api_docs/generated/* docs/cudf/source/api_docs/api/* +docs/cudf/source/user_guide/example_output/* +docs/cudf/source/user_guide/cudf.*Dtype.*.rst + +# cibuildwheel +/wheelhouse diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1046f4ebe6f..75d285f4f54 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,19 @@ # Copyright (c) 2019-2022, NVIDIA CORPORATION. 
repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.3.0 + hooks: + - id: trailing-whitespace + exclude: | + (?x)^( + ^python/cudf/cudf/tests/data/subword_tokenizer_data/.* + ) + - id: end-of-file-fixer + exclude: | + (?x)^( + ^python/cudf/cudf/tests/data/subword_tokenizer_data/.* + ) - repo: https://github.com/PyCQA/isort rev: 5.10.1 hooks: @@ -18,12 +31,18 @@ repos: # Explicitly specify the pyproject.toml at the repo root, not per-project. args: ["--config", "pyproject.toml"] - repo: https://github.com/PyCQA/flake8 - rev: 3.8.3 + rev: 5.0.4 hooks: - id: flake8 args: ["--config=setup.cfg"] - files: python/.*\.(py|pyx|pxd)$ + files: python/.*$ types: [file] + types_or: [python, cython] + additional_dependencies: ["flake8-force"] + - repo: https://github.com/MarcoGorelli/cython-lint + rev: v0.1.10 + hooks: + - id: cython-lint - repo: https://github.com/pre-commit/mirrors-mypy rev: 'v0.971' hooks: @@ -46,6 +65,16 @@ repos: - id: clang-format types_or: [c, c++, cuda] args: ["-fallback-style=none", "-style=file", "-i"] + - repo: https://github.com/sirosen/texthooks + rev: 0.4.0 + hooks: + - id: fix-smartquotes + exclude: | + (?x)^( + ^cpp/include/cudf_test/cxxopts.hpp| + ^python/cudf/cudf/tests/data/subword_tokenizer_data/.*| + ^python/cudf/cudf/tests/test_text.py + ) - repo: local hooks: - id: no-deprecationwarning diff --git a/CHANGELOG.md b/CHANGELOG.md index be350329cf5..b872e954d87 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,233 @@ +# cuDF 22.12.00 (8 Dec 2022) + +## 🚨 Breaking Changes + +- Add JNI for `substring` without 'end' parameter. 
([#12113](https://github.com/rapidsai/cudf/pull/12113)) [@firestarman](https://github.com/firestarman) +- Refactor `purge_nonempty_nulls` ([#12111](https://github.com/rapidsai/cudf/pull/12111)) [@ttnghia](https://github.com/ttnghia) +- Create an `int8` column in `read_csv` when all elements are missing ([#12110](https://github.com/rapidsai/cudf/pull/12110)) [@vuule](https://github.com/vuule) +- Throw an error when libcudf is built without cuFile and `LIBCUDF_CUFILE_POLICY` is set to `"ALWAYS"` ([#12080](https://github.com/rapidsai/cudf/pull/12080)) [@vuule](https://github.com/vuule) +- Fix type promotion edge cases in numerical binops ([#12074](https://github.com/rapidsai/cudf/pull/12074)) [@wence-](https://github.com/wence-) +- Reduce/Remove reliance on `**kwargs` and `*args` in `IO` readers & writers ([#12025](https://github.com/rapidsai/cudf/pull/12025)) [@galipremsagar](https://github.com/galipremsagar) +- Rollback of `DeviceBufferLike` ([#12009](https://github.com/rapidsai/cudf/pull/12009)) [@madsbk](https://github.com/madsbk) +- Remove unused `managed_allocator` ([#12005](https://github.com/rapidsai/cudf/pull/12005)) [@vyasr](https://github.com/vyasr) +- Pass column names to `write_csv` instead of `table_metadata` pointer ([#11972](https://github.com/rapidsai/cudf/pull/11972)) [@vuule](https://github.com/vuule) +- Accept const refs instead of const unique_ptr refs in reduce and scan APIs. ([#11960](https://github.com/rapidsai/cudf/pull/11960)) [@vyasr](https://github.com/vyasr) +- Default to equal NaNs in make_merge_sets_aggregation. 
([#11952](https://github.com/rapidsai/cudf/pull/11952)) [@bdice](https://github.com/bdice) +- Remove validation that requires introspection ([#11938](https://github.com/rapidsai/cudf/pull/11938)) [@vyasr](https://github.com/vyasr) +- Trim quotes for non-string values in nested json parsing ([#11898](https://github.com/rapidsai/cudf/pull/11898)) [@karthikeyann](https://github.com/karthikeyann) +- Add tests ensuring that cudf's default stream is always used ([#11875](https://github.com/rapidsai/cudf/pull/11875)) [@vyasr](https://github.com/vyasr) +- Support nested types as groupby keys in libcudf ([#11792](https://github.com/rapidsai/cudf/pull/11792)) [@PointKernel](https://github.com/PointKernel) +- Default to equal NaNs in make_collect_set_aggregation. ([#11621](https://github.com/rapidsai/cudf/pull/11621)) [@bdice](https://github.com/bdice) +- Removing int8 column option from parquet byte_array writing ([#11539](https://github.com/rapidsai/cudf/pull/11539)) [@hyperbolic2346](https://github.com/hyperbolic2346) +- part1: Simplify BaseIndex to an abstract class ([#10389](https://github.com/rapidsai/cudf/pull/10389)) [@skirui-source](https://github.com/skirui-source) + +## 🐛 Bug Fixes + +- Fix include line for IO Cython modules ([#12250](https://github.com/rapidsai/cudf/pull/12250)) [@vyasr](https://github.com/vyasr) +- Make dask pinning looser ([#12231](https://github.com/rapidsai/cudf/pull/12231)) [@vyasr](https://github.com/vyasr) +- Workaround for CUB segmented-sort bug with boolean keys ([#12217](https://github.com/rapidsai/cudf/pull/12217)) [@davidwendt](https://github.com/davidwendt) +- Fix `from_dict` backend dispatch to match upstream `dask` ([#12203](https://github.com/rapidsai/cudf/pull/12203)) [@galipremsagar](https://github.com/galipremsagar) +- Merge branch-22.10 into branch-22.12 ([#12198](https://github.com/rapidsai/cudf/pull/12198)) [@davidwendt](https://github.com/davidwendt) +- Fix compression in ORC writer 
([#12194](https://github.com/rapidsai/cudf/pull/12194)) [@vuule](https://github.com/vuule) +- Don't use CMake 3.25.0 as it has a show stopping FindCUDAToolkit bug ([#12188](https://github.com/rapidsai/cudf/pull/12188)) [@robertmaynard](https://github.com/robertmaynard) +- Fix data corruption when reading ORC files with empty stripes ([#12160](https://github.com/rapidsai/cudf/pull/12160)) [@vuule](https://github.com/vuule) +- Fix decimal binary operations ([#12142](https://github.com/rapidsai/cudf/pull/12142)) [@galipremsagar](https://github.com/galipremsagar) +- Ensure dlpack include is provided to cudf interop lib ([#12139](https://github.com/rapidsai/cudf/pull/12139)) [@robertmaynard](https://github.com/robertmaynard) +- Safely allocate `udf_string` pointers in `strings_udf` ([#12138](https://github.com/rapidsai/cudf/pull/12138)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Fix/disable jitify lto ([#12122](https://github.com/rapidsai/cudf/pull/12122)) [@robertmaynard](https://github.com/robertmaynard) +- Fix conditional_full_join benchmark ([#12121](https://github.com/rapidsai/cudf/pull/12121)) [@GregoryKimball](https://github.com/GregoryKimball) +- Fix regex working-memory-size refactor error ([#12119](https://github.com/rapidsai/cudf/pull/12119)) [@davidwendt](https://github.com/davidwendt) +- Add in negative size checks for columns ([#12118](https://github.com/rapidsai/cudf/pull/12118)) [@revans2](https://github.com/revans2) +- Add JNI for `substring` without 'end' parameter. 
([#12113](https://github.com/rapidsai/cudf/pull/12113)) [@firestarman](https://github.com/firestarman) +- Fix reading of CSV files with blank second row ([#12098](https://github.com/rapidsai/cudf/pull/12098)) [@vuule](https://github.com/vuule) +- Fix an error in IO with `GzipFile` type ([#12085](https://github.com/rapidsai/cudf/pull/12085)) [@galipremsagar](https://github.com/galipremsagar) +- Workaround groupby aggregate thrust::copy_if overflow ([#12079](https://github.com/rapidsai/cudf/pull/12079)) [@davidwendt](https://github.com/davidwendt) +- Fix alignment of compressed blocks in ORC writer ([#12077](https://github.com/rapidsai/cudf/pull/12077)) [@vuule](https://github.com/vuule) +- Fix singleton-range `__setitem__` edge case ([#12075](https://github.com/rapidsai/cudf/pull/12075)) [@wence-](https://github.com/wence-) +- Fix type promotion edge cases in numerical binops ([#12074](https://github.com/rapidsai/cudf/pull/12074)) [@wence-](https://github.com/wence-) +- Force using old fmt in nvbench. ([#12067](https://github.com/rapidsai/cudf/pull/12067)) [@vyasr](https://github.com/vyasr) +- Fixes List offset bug in Nested JSON reader ([#12060](https://github.com/rapidsai/cudf/pull/12060)) [@karthikeyann](https://github.com/karthikeyann) +- Allow falling back to `shim_60.ptx` by default in `strings_udf` ([#12056](https://github.com/rapidsai/cudf/pull/12056)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Force black exclusions for pre-commit. 
([#12036](https://github.com/rapidsai/cudf/pull/12036)) [@bdice](https://github.com/bdice) +- Add `memory_usage` & `items` implementation for `Struct` column & dtype ([#12033](https://github.com/rapidsai/cudf/pull/12033)) [@galipremsagar](https://github.com/galipremsagar) +- Reduce/Remove reliance on `**kwargs` and `*args` in `IO` readers & writers ([#12025](https://github.com/rapidsai/cudf/pull/12025)) [@galipremsagar](https://github.com/galipremsagar) +- Fixes bug in csv_reader_options construction in cython ([#12021](https://github.com/rapidsai/cudf/pull/12021)) [@karthikeyann](https://github.com/karthikeyann) +- Fix issues when both `usecols` and `names` options are used in `read_csv` ([#12018](https://github.com/rapidsai/cudf/pull/12018)) [@vuule](https://github.com/vuule) +- Port thrust's pinned_allocator to cudf, since Thrust 1.17 removes the type ([#12004](https://github.com/rapidsai/cudf/pull/12004)) [@robertmaynard](https://github.com/robertmaynard) +- Revert "Replace most of preprocessor usage in nvcomp adapter with `constexpr`" ([#11999](https://github.com/rapidsai/cudf/pull/11999)) [@vuule](https://github.com/vuule) +- Fix bug where `df.loc` resulting in single row could give wrong index ([#11998](https://github.com/rapidsai/cudf/pull/11998)) [@eriknw](https://github.com/eriknw) +- Switch to DISABLE_DEPRECATION_WARNINGS to match other RAPIDS projects ([#11989](https://github.com/rapidsai/cudf/pull/11989)) [@robertmaynard](https://github.com/robertmaynard) +- Fix maximum page size estimate in Parquet writer ([#11962](https://github.com/rapidsai/cudf/pull/11962)) [@vuule](https://github.com/vuule) +- Fix local offset handling in bgzip reader ([#11918](https://github.com/rapidsai/cudf/pull/11918)) [@upsj](https://github.com/upsj) +- Fix an issue reading struct-of-list types in Parquet. 
([#11910](https://github.com/rapidsai/cudf/pull/11910)) [@nvdbaranec](https://github.com/nvdbaranec) +- Fix memcheck error in TypeInference.Timestamp gtest ([#11905](https://github.com/rapidsai/cudf/pull/11905)) [@davidwendt](https://github.com/davidwendt) +- Fix type casting in Series.__setitem__ ([#11904](https://github.com/rapidsai/cudf/pull/11904)) [@wence-](https://github.com/wence-) +- Fix memcheck error in get_dremel_data ([#11903](https://github.com/rapidsai/cudf/pull/11903)) [@davidwendt](https://github.com/davidwendt) +- Fixes Unsupported column type error due to empty list columns in Nested JSON reader ([#11897](https://github.com/rapidsai/cudf/pull/11897)) [@karthikeyann](https://github.com/karthikeyann) +- Fix segmented-sort to ignore indices outside the offsets ([#11888](https://github.com/rapidsai/cudf/pull/11888)) [@davidwendt](https://github.com/davidwendt) +- Fix cudf::stable_sorted_order for NaN and -NaN in FLOAT64 columns ([#11874](https://github.com/rapidsai/cudf/pull/11874)) [@davidwendt](https://github.com/davidwendt) +- Fix writing of Parquet files with many fragments ([#11869](https://github.com/rapidsai/cudf/pull/11869)) [@etseidl](https://github.com/etseidl) +- Fix RangeIndex unary operators. 
([#11868](https://github.com/rapidsai/cudf/pull/11868)) [@vyasr](https://github.com/vyasr) +- JNI Avoid NPE for reading host binary data ([#11865](https://github.com/rapidsai/cudf/pull/11865)) [@revans2](https://github.com/revans2) +- Fix decimal benchmark input data generation ([#11863](https://github.com/rapidsai/cudf/pull/11863)) [@karthikeyann](https://github.com/karthikeyann) +- Fix pre-commit copyright check ([#11860](https://github.com/rapidsai/cudf/pull/11860)) [@galipremsagar](https://github.com/galipremsagar) +- Fix Parquet support for seconds and milliseconds duration types ([#11854](https://github.com/rapidsai/cudf/pull/11854)) [@vuule](https://github.com/vuule) +- Ensure better compiler cache results between cudf cal-ver branches ([#11835](https://github.com/rapidsai/cudf/pull/11835)) [@robertmaynard](https://github.com/robertmaynard) +- Fix make_column_from_scalar for all-null strings column ([#11807](https://github.com/rapidsai/cudf/pull/11807)) [@davidwendt](https://github.com/davidwendt) +- Tell jitify_preprocess where to search for libnvrtc ([#11787](https://github.com/rapidsai/cudf/pull/11787)) [@robertmaynard](https://github.com/robertmaynard) +- add V2 page header support to parquet reader ([#11778](https://github.com/rapidsai/cudf/pull/11778)) [@etseidl](https://github.com/etseidl) +- Parquet reader: bug fix for a num_rows/skip_rows corner case, w/optimization for nested preprocessing ([#11752](https://github.com/rapidsai/cudf/pull/11752)) [@nvdbaranec](https://github.com/nvdbaranec) +- Determine if Arrow has S3 support at runtime in unit test. ([#11560](https://github.com/rapidsai/cudf/pull/11560)) [@bdice](https://github.com/bdice) + +## 📖 Documentation + +- Use rapidsai CODE_OF_CONDUCT.md ([#12166](https://github.com/rapidsai/cudf/pull/12166)) [@bdice](https://github.com/bdice) +- Add symlinks to notebooks. 
([#12128](https://github.com/rapidsai/cudf/pull/12128)) [@bdice](https://github.com/bdice) +- Add `truncate` API to python doc pages ([#12109](https://github.com/rapidsai/cudf/pull/12109)) [@galipremsagar](https://github.com/galipremsagar) +- Update Numba docs links. ([#12107](https://github.com/rapidsai/cudf/pull/12107)) [@bdice](https://github.com/bdice) +- Remove "Multi-GPU with Dask-cuDF" notebook. ([#12095](https://github.com/rapidsai/cudf/pull/12095)) [@bdice](https://github.com/bdice) +- Fix link to c++ developer guide from `CONTRIBUTING.md` ([#12084](https://github.com/rapidsai/cudf/pull/12084)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Add pivot_table and crosstab to docs. ([#12014](https://github.com/rapidsai/cudf/pull/12014)) [@bdice](https://github.com/bdice) +- Fix doxygen text for cudf::dictionary::encode ([#11991](https://github.com/rapidsai/cudf/pull/11991)) [@davidwendt](https://github.com/davidwendt) +- Replace default_stream_value with get_default_stream in docs. ([#11985](https://github.com/rapidsai/cudf/pull/11985)) [@vyasr](https://github.com/vyasr) +- Add dtype docs pages and docstrings for `cudf` specific dtypes ([#11974](https://github.com/rapidsai/cudf/pull/11974)) [@galipremsagar](https://github.com/galipremsagar) +- Update Unit Testing in libcudf guidelines to code tests outside the cudf::test namespace ([#11959](https://github.com/rapidsai/cudf/pull/11959)) [@davidwendt](https://github.com/davidwendt) +- Rename libcudf++ to libcudf. ([#11953](https://github.com/rapidsai/cudf/pull/11953)) [@bdice](https://github.com/bdice) +- Fix documentation referring to removed as_gpu_matrix method. ([#11937](https://github.com/rapidsai/cudf/pull/11937)) [@bdice](https://github.com/bdice) +- Remove "experimental" warning for struct columns in ORC reader and writer ([#11880](https://github.com/rapidsai/cudf/pull/11880)) [@vuule](https://github.com/vuule) +- Initial draft of policies and guidelines for libcudf usage. 
([#11853](https://github.com/rapidsai/cudf/pull/11853)) [@vyasr](https://github.com/vyasr) +- Add clear indication of non-GPU accelerated parameters in read_json docstring ([#11825](https://github.com/rapidsai/cudf/pull/11825)) [@GregoryKimball](https://github.com/GregoryKimball) +- Add developer docs for writing tests ([#11199](https://github.com/rapidsai/cudf/pull/11199)) [@vyasr](https://github.com/vyasr) + +## 🚀 New Features + +- Adds an EventHandler to Java MemoryBuffer to be invoked on close ([#12125](https://github.com/rapidsai/cudf/pull/12125)) [@abellina](https://github.com/abellina) +- Support `+` in `strings_udf` ([#12117](https://github.com/rapidsai/cudf/pull/12117)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Support `upper` and `lower` in `strings_udf` ([#12099](https://github.com/rapidsai/cudf/pull/12099)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Add wheel builds ([#12096](https://github.com/rapidsai/cudf/pull/12096)) [@vyasr](https://github.com/vyasr) +- Allow setting malloc heap size in string udfs ([#12094](https://github.com/rapidsai/cudf/pull/12094)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Support `strip`, `lstrip`, and `rstrip` in `strings_udf` ([#12091](https://github.com/rapidsai/cudf/pull/12091)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Mark nvcomp zstd compression stable ([#12059](https://github.com/rapidsai/cudf/pull/12059)) [@jbrennan333](https://github.com/jbrennan333) +- Add debug-only onAllocated/onDeallocated to RmmEventHandler ([#12054](https://github.com/rapidsai/cudf/pull/12054)) [@abellina](https://github.com/abellina) +- Enable building against the libarrow contained in pyarrow ([#12034](https://github.com/rapidsai/cudf/pull/12034)) [@vyasr](https://github.com/vyasr) +- Add strings `like` jni and native method ([#12032](https://github.com/rapidsai/cudf/pull/12032)) [@cindyyuanjiang](https://github.com/cindyyuanjiang) +- Cleanup common parsing code in 
JSON, CSV reader ([#12022](https://github.com/rapidsai/cudf/pull/12022)) [@karthikeyann](https://github.com/karthikeyann) +- byte_range support for JSON Lines format ([#12017](https://github.com/rapidsai/cudf/pull/12017)) [@karthikeyann](https://github.com/karthikeyann) +- Minor cleanup of root CMakeLists.txt for better organization ([#11988](https://github.com/rapidsai/cudf/pull/11988)) [@robertmaynard](https://github.com/robertmaynard) +- Add inplace arithmetic operators to `MaskedType` ([#11987](https://github.com/rapidsai/cudf/pull/11987)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Implement JNI for chunked Parquet reader ([#11961](https://github.com/rapidsai/cudf/pull/11961)) [@ttnghia](https://github.com/ttnghia) +- Add method argument to DataFrame.quantile ([#11957](https://github.com/rapidsai/cudf/pull/11957)) [@rjzamora](https://github.com/rjzamora) +- Add gpu memory watermark apis to JNI ([#11950](https://github.com/rapidsai/cudf/pull/11950)) [@abellina](https://github.com/abellina) +- Adds retryCount to RmmEventHandler.onAllocFailure ([#11940](https://github.com/rapidsai/cudf/pull/11940)) [@abellina](https://github.com/abellina) +- Enable returning string data from UDFs used through `apply` ([#11933](https://github.com/rapidsai/cudf/pull/11933)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Switch over to rapids-cmake patches for thrust ([#11921](https://github.com/rapidsai/cudf/pull/11921)) [@robertmaynard](https://github.com/robertmaynard) +- Add strings udf C++ classes and functions for phase II ([#11912](https://github.com/rapidsai/cudf/pull/11912)) [@davidwendt](https://github.com/davidwendt) +- Trim quotes for non-string values in nested json parsing ([#11898](https://github.com/rapidsai/cudf/pull/11898)) [@karthikeyann](https://github.com/karthikeyann) +- Enable CEC for `strings_udf` ([#11884](https://github.com/rapidsai/cudf/pull/11884)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- ArrowIPCTableWriter 
writes an empty batch in the case of an empty table. ([#11883](https://github.com/rapidsai/cudf/pull/11883)) [@firestarman](https://github.com/firestarman) +- Implement chunked Parquet reader ([#11867](https://github.com/rapidsai/cudf/pull/11867)) [@ttnghia](https://github.com/ttnghia) +- Add `read_orc_metadata` to libcudf ([#11815](https://github.com/rapidsai/cudf/pull/11815)) [@vuule](https://github.com/vuule) +- Support nested types as groupby keys in libcudf ([#11792](https://github.com/rapidsai/cudf/pull/11792)) [@PointKernel](https://github.com/PointKernel) +- Adding feature Truncate to DataFrame and Series ([#11435](https://github.com/rapidsai/cudf/pull/11435)) [@VamsiTallam95](https://github.com/VamsiTallam95) + +## 🛠️ Improvements + +- Reduce number of tests marked `spilling` ([#12197](https://github.com/rapidsai/cudf/pull/12197)) [@madsbk](https://github.com/madsbk) +- Pin `dask` and `distributed` for release ([#12165](https://github.com/rapidsai/cudf/pull/12165)) [@galipremsagar](https://github.com/galipremsagar) +- Don't rely on GNU find in headers_test.sh ([#12164](https://github.com/rapidsai/cudf/pull/12164)) [@wence-](https://github.com/wence-) +- Update cp.clip call ([#12148](https://github.com/rapidsai/cudf/pull/12148)) [@quasiben](https://github.com/quasiben) +- Enable automatic column projection in groupby().agg ([#12124](https://github.com/rapidsai/cudf/pull/12124)) [@rjzamora](https://github.com/rjzamora) +- Refactor `purge_nonempty_nulls` ([#12111](https://github.com/rapidsai/cudf/pull/12111)) [@ttnghia](https://github.com/ttnghia) +- Create an `int8` column in `read_csv` when all elements are missing ([#12110](https://github.com/rapidsai/cudf/pull/12110)) [@vuule](https://github.com/vuule) +- Spilling to host memory ([#12106](https://github.com/rapidsai/cudf/pull/12106)) [@madsbk](https://github.com/madsbk) +- First pass of `pd.read_orc` changes in tests ([#12103](https://github.com/rapidsai/cudf/pull/12103)) 
[@galipremsagar](https://github.com/galipremsagar) +- Expose engine argument in dask_cudf.read_json ([#12101](https://github.com/rapidsai/cudf/pull/12101)) [@rjzamora](https://github.com/rjzamora) +- Remove CUDA 10 compatibility code. ([#12088](https://github.com/rapidsai/cudf/pull/12088)) [@bdice](https://github.com/bdice) +- Move and update `dask` nightly install in CI ([#12082](https://github.com/rapidsai/cudf/pull/12082)) [@galipremsagar](https://github.com/galipremsagar) +- Throw an error when libcudf is built without cuFile and `LIBCUDF_CUFILE_POLICY` is set to `"ALWAYS"` ([#12080](https://github.com/rapidsai/cudf/pull/12080)) [@vuule](https://github.com/vuule) +- Remove macros that inspect the contents of exceptions ([#12076](https://github.com/rapidsai/cudf/pull/12076)) [@vyasr](https://github.com/vyasr) +- Fix ingest_raw_data performance issue in Nested JSON reader due to RVO ([#12070](https://github.com/rapidsai/cudf/pull/12070)) [@karthikeyann](https://github.com/karthikeyann) +- Remove overflow error during decimal binops ([#12063](https://github.com/rapidsai/cudf/pull/12063)) [@galipremsagar](https://github.com/galipremsagar) +- Change cudf::detail::tdigest to cudf::tdigest::detail ([#12050](https://github.com/rapidsai/cudf/pull/12050)) [@davidwendt](https://github.com/davidwendt) +- Fix quantile gtests coded in namespace cudf::test ([#12049](https://github.com/rapidsai/cudf/pull/12049)) [@davidwendt](https://github.com/davidwendt) +- Add support for `DataFrame.from_dict`\`to_dict` and `Series.to_dict` ([#12048](https://github.com/rapidsai/cudf/pull/12048)) [@galipremsagar](https://github.com/galipremsagar) +- Refactor Parquet reader ([#12046](https://github.com/rapidsai/cudf/pull/12046)) [@ttnghia](https://github.com/ttnghia) +- Forward merge 22.10 into 22.12 ([#12045](https://github.com/rapidsai/cudf/pull/12045)) [@vyasr](https://github.com/vyasr) +- Standardize newlines at ends of files. 
([#12042](https://github.com/rapidsai/cudf/pull/12042)) [@bdice](https://github.com/bdice) +- Trim trailing whitespace from all files. ([#12041](https://github.com/rapidsai/cudf/pull/12041)) [@bdice](https://github.com/bdice) +- Use nosync policy in gather and scatter implementations. ([#12038](https://github.com/rapidsai/cudf/pull/12038)) [@bdice](https://github.com/bdice) +- Remove smart quotes from all docstrings. ([#12035](https://github.com/rapidsai/cudf/pull/12035)) [@bdice](https://github.com/bdice) +- Update cuda-python dependency to 11.7.1 ([#12030](https://github.com/rapidsai/cudf/pull/12030)) [@galipremsagar](https://github.com/galipremsagar) +- Add cython-lint to pre-commit checks. ([#12020](https://github.com/rapidsai/cudf/pull/12020)) [@bdice](https://github.com/bdice) +- Use pragma once ([#12019](https://github.com/rapidsai/cudf/pull/12019)) [@bdice](https://github.com/bdice) +- New GHA to add issues/prs to project board ([#12016](https://github.com/rapidsai/cudf/pull/12016)) [@jarmak-nv](https://github.com/jarmak-nv) +- Add DataFrame.pivot_table. 
([#12015](https://github.com/rapidsai/cudf/pull/12015)) [@bdice](https://github.com/bdice) +- Rollback of `DeviceBufferLike` ([#12009](https://github.com/rapidsai/cudf/pull/12009)) [@madsbk](https://github.com/madsbk) +- Remove default parameters for nvtext::detail functions ([#12007](https://github.com/rapidsai/cudf/pull/12007)) [@davidwendt](https://github.com/davidwendt) +- Remove default parameters for cudf::dictionary::detail functions ([#12006](https://github.com/rapidsai/cudf/pull/12006)) [@davidwendt](https://github.com/davidwendt) +- Remove unused `managed_allocator` ([#12005](https://github.com/rapidsai/cudf/pull/12005)) [@vyasr](https://github.com/vyasr) +- Remove default parameters for cudf::strings::detail functions ([#12003](https://github.com/rapidsai/cudf/pull/12003)) [@davidwendt](https://github.com/davidwendt) +- Remove unnecessary code from dask-cudf _Frame ([#12001](https://github.com/rapidsai/cudf/pull/12001)) [@rjzamora](https://github.com/rjzamora) +- Ignore python docs build artifacts ([#12000](https://github.com/rapidsai/cudf/pull/12000)) [@galipremsagar](https://github.com/galipremsagar) +- Use rapids-cmake for google benchmark. 
([#11997](https://github.com/rapidsai/cudf/pull/11997)) [@vyasr](https://github.com/vyasr) +- Leverage rapids_cython for more automated RPATH handling ([#11996](https://github.com/rapidsai/cudf/pull/11996)) [@vyasr](https://github.com/vyasr) +- Remove stale labeler ([#11995](https://github.com/rapidsai/cudf/pull/11995)) [@raydouglass](https://github.com/raydouglass) +- Move protobuf compilation to CMake ([#11986](https://github.com/rapidsai/cudf/pull/11986)) [@vyasr](https://github.com/vyasr) +- Replace most of preprocessor usage in nvcomp adapter with `constexpr` ([#11980](https://github.com/rapidsai/cudf/pull/11980)) [@vuule](https://github.com/vuule) +- Add missing noexcepts to column_in_metadata methods ([#11973](https://github.com/rapidsai/cudf/pull/11973)) [@vyasr](https://github.com/vyasr) +- Pass column names to `write_csv` instead of `table_metadata` pointer ([#11972](https://github.com/rapidsai/cudf/pull/11972)) [@vuule](https://github.com/vuule) +- Accelerate libcudf segmented sort with CUB segmented sort ([#11969](https://github.com/rapidsai/cudf/pull/11969)) [@davidwendt](https://github.com/davidwendt) +- Feature/remove default streams ([#11967](https://github.com/rapidsai/cudf/pull/11967)) [@vyasr](https://github.com/vyasr) +- Add pool memory resource to libcudf basic example ([#11966](https://github.com/rapidsai/cudf/pull/11966)) [@davidwendt](https://github.com/davidwendt) +- Fix some libcudf calls to cudf::detail::gather ([#11963](https://github.com/rapidsai/cudf/pull/11963)) [@davidwendt](https://github.com/davidwendt) +- Accept const refs instead of const unique_ptr refs in reduce and scan APIs. ([#11960](https://github.com/rapidsai/cudf/pull/11960)) [@vyasr](https://github.com/vyasr) +- Add deprecation warning for set_allocator. 
([#11958](https://github.com/rapidsai/cudf/pull/11958)) [@vyasr](https://github.com/vyasr) +- Fix lists and structs gtests coded in namespace cudf::test ([#11956](https://github.com/rapidsai/cudf/pull/11956)) [@davidwendt](https://github.com/davidwendt) +- Add full page indexes to Parquet writer benchmarks ([#11955](https://github.com/rapidsai/cudf/pull/11955)) [@etseidl](https://github.com/etseidl) +- Use gather-based strings factory in cudf::strings::strip ([#11954](https://github.com/rapidsai/cudf/pull/11954)) [@davidwendt](https://github.com/davidwendt) +- Default to equal NaNs in make_merge_sets_aggregation. ([#11952](https://github.com/rapidsai/cudf/pull/11952)) [@bdice](https://github.com/bdice) +- Add `strip_delimiters` option to `read_text` ([#11946](https://github.com/rapidsai/cudf/pull/11946)) [@upsj](https://github.com/upsj) +- Refactor multibyte_split `output_builder` ([#11945](https://github.com/rapidsai/cudf/pull/11945)) [@upsj](https://github.com/upsj) +- Remove validation that requires introspection ([#11938](https://github.com/rapidsai/cudf/pull/11938)) [@vyasr](https://github.com/vyasr) +- Add `.str.find_multiple` API ([#11928](https://github.com/rapidsai/cudf/pull/11928)) [@galipremsagar](https://github.com/galipremsagar) +- Add regex_program class for use with all regex APIs ([#11927](https://github.com/rapidsai/cudf/pull/11927)) [@davidwendt](https://github.com/davidwendt) +- Enable backend dispatching for Dask-DataFrame creation ([#11920](https://github.com/rapidsai/cudf/pull/11920)) [@rjzamora](https://github.com/rjzamora) +- Performance improvement in JSON Tree traversal ([#11919](https://github.com/rapidsai/cudf/pull/11919)) [@karthikeyann](https://github.com/karthikeyann) +- Fix some gtests incorrectly coded in namespace cudf::test (part I) ([#11917](https://github.com/rapidsai/cudf/pull/11917)) [@davidwendt](https://github.com/davidwendt) +- Refactor pad/zfill functions for reuse with strings udf 
([#11914](https://github.com/rapidsai/cudf/pull/11914)) [@davidwendt](https://github.com/davidwendt) +- Add `nanosecond` & `microsecond` to `DatetimeProperties` ([#11911](https://github.com/rapidsai/cudf/pull/11911)) [@galipremsagar](https://github.com/galipremsagar) +- Pin mimesis version in setup.py. ([#11906](https://github.com/rapidsai/cudf/pull/11906)) [@bdice](https://github.com/bdice) +- Error on `ListColumn` or any new unsupported column in `cudf.Index` ([#11902](https://github.com/rapidsai/cudf/pull/11902)) [@galipremsagar](https://github.com/galipremsagar) +- Add thrust output iterator fix (1805) to thrust.patch ([#11900](https://github.com/rapidsai/cudf/pull/11900)) [@davidwendt](https://github.com/davidwendt) +- Relax `codecov` threshold diff ([#11899](https://github.com/rapidsai/cudf/pull/11899)) [@galipremsagar](https://github.com/galipremsagar) +- Use public APIs in STREAM_COMPACTION_NVBENCH ([#11892](https://github.com/rapidsai/cudf/pull/11892)) [@GregoryKimball](https://github.com/GregoryKimball) +- Add coverage for string UDF tests. 
([#11891](https://github.com/rapidsai/cudf/pull/11891)) [@vyasr](https://github.com/vyasr) +- Provide `data_chunk_source` wrapper for `datasource` ([#11886](https://github.com/rapidsai/cudf/pull/11886)) [@upsj](https://github.com/upsj) +- Handle `multibyte_split` byte_range out-of-bounds offsets on host ([#11885](https://github.com/rapidsai/cudf/pull/11885)) [@upsj](https://github.com/upsj) +- Add tests ensuring that cudf's default stream is always used ([#11875](https://github.com/rapidsai/cudf/pull/11875)) [@vyasr](https://github.com/vyasr) +- Change expect_strings_empty into expect_column_empty libcudf test utility ([#11873](https://github.com/rapidsai/cudf/pull/11873)) [@davidwendt](https://github.com/davidwendt) +- Add ngroup ([#11871](https://github.com/rapidsai/cudf/pull/11871)) [@shwina](https://github.com/shwina) +- Reduce memory usage in nested JSON parser - tree generation ([#11864](https://github.com/rapidsai/cudf/pull/11864)) [@karthikeyann](https://github.com/karthikeyann) +- Unpin `dask` and `distributed` for development ([#11859](https://github.com/rapidsai/cudf/pull/11859)) [@galipremsagar](https://github.com/galipremsagar) +- Remove unused includes for table/row_operators ([#11857](https://github.com/rapidsai/cudf/pull/11857)) [@GregoryKimball](https://github.com/GregoryKimball) +- Use conda-forge's `pyorc` ([#11855](https://github.com/rapidsai/cudf/pull/11855)) [@jakirkham](https://github.com/jakirkham) +- Add libcudf strings examples ([#11849](https://github.com/rapidsai/cudf/pull/11849)) [@davidwendt](https://github.com/davidwendt) +- Remove `cudf_io` namespace alias ([#11827](https://github.com/rapidsai/cudf/pull/11827)) [@vuule](https://github.com/vuule) +- Test/remove thrust vector usage ([#11813](https://github.com/rapidsai/cudf/pull/11813)) [@vyasr](https://github.com/vyasr) +- Add BGZIP reader to python `read_text` ([#11802](https://github.com/rapidsai/cudf/pull/11802)) [@upsj](https://github.com/upsj) +- Merge branch-22.10 into 
branch-22.12 ([#11801](https://github.com/rapidsai/cudf/pull/11801)) [@davidwendt](https://github.com/davidwendt) +- Fix compile warning from CUDF_FUNC_RANGE in a member function ([#11798](https://github.com/rapidsai/cudf/pull/11798)) [@davidwendt](https://github.com/davidwendt) +- Update cudf JNI version to 22.12.0-SNAPSHOT ([#11764](https://github.com/rapidsai/cudf/pull/11764)) [@pxLi](https://github.com/pxLi) +- Update flake8 to 5.0.4 and use flake8-force to check Cython. ([#11736](https://github.com/rapidsai/cudf/pull/11736)) [@bdice](https://github.com/bdice) +- Add BGZIP multibyte_split benchmark ([#11723](https://github.com/rapidsai/cudf/pull/11723)) [@upsj](https://github.com/upsj) +- Bifurcate Dependency Lists ([#11674](https://github.com/rapidsai/cudf/pull/11674)) [@bdice](https://github.com/bdice) +- Default to equal NaNs in make_collect_set_aggregation. ([#11621](https://github.com/rapidsai/cudf/pull/11621)) [@bdice](https://github.com/bdice) +- Conform "bench_isin" to match generator column names ([#11549](https://github.com/rapidsai/cudf/pull/11549)) [@GregoryKimball](https://github.com/GregoryKimball) +- Removing int8 column option from parquet byte_array writing ([#11539](https://github.com/rapidsai/cudf/pull/11539)) [@hyperbolic2346](https://github.com/hyperbolic2346) +- Add checks for HLG layers in dask-cudf groupby tests ([#10853](https://github.com/rapidsai/cudf/pull/10853)) [@charlesbluca](https://github.com/charlesbluca) +- part1: Simplify BaseIndex to an abstract class ([#10389](https://github.com/rapidsai/cudf/pull/10389)) [@skirui-source](https://github.com/skirui-source) +- Make all `nvcc` warnings into errors ([#8916](https://github.com/rapidsai/cudf/pull/8916)) [@trxcllnt](https://github.com/trxcllnt) + # cuDF 22.10.00 (12 Oct 2022) ## 🚨 Breaking Changes diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md deleted file mode 100644 index 3029fbb41af..00000000000 --- a/CODE_OF_CONDUCT.md +++ /dev/null @@ -1 +0,0 @@ -This project has 
adopted the [Contributor Covenant Code of Conduct](https://docs.rapids.ai/resources/conduct/). diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 6eb621abcc3..608bd42d86c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -99,13 +99,13 @@ cd $CUDF_HOME **Note:** Using a conda environment is the easiest way to satisfy the library's dependencies. Instructions for a minimal build environment without conda are included below. -- Create the conda development environment `cudf_dev`: +- Create the conda development environment: ```bash # create the conda environment (assuming in base `cudf` directory) # note: RAPIDS currently doesn't support `channel_priority: strict`; # use `channel_priority: flexible` instead -conda env create --name cudf_dev --file conda/environments/cudf_dev_cuda11.5.yml +conda env create --name cudf_dev --file conda/environments/all_cuda-115_arch-x86_64.yaml # activate the environment conda activate cudf_dev ``` @@ -114,9 +114,6 @@ conda activate cudf_dev development environment may also need to be updated if dependency versions or pinnings are changed. -- For other CUDA versions, check the corresponding `cudf_dev_cuda*.yml` file in - `conda/environments/`. - #### Building without a conda environment - libcudf has the following minimal dependencies (in addition to those listed in the [General @@ -382,7 +379,7 @@ You can skip these checks with `git commit --no-verify` or with the short versio ## Developer Guidelines -The [C++ Developer Guide](cpp/docs/DEVELOPER_GUIDE.md) includes details on contributing to libcudf C++ code. +The [C++ Developer Guide](cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md) includes details on contributing to libcudf C++ code. The [Python Developer Guide](https://docs.rapids.ai/api/cudf/stable/developer_guide/index.html) includes details on contributing to cuDF Python code. 
diff --git a/README.md b/README.md index 641ce1316b3..a013d3a9ea4 100644 --- a/README.md +++ b/README.md @@ -50,7 +50,7 @@ For additional examples, browse our complete [API documentation](https://docs.ra ## Quick Start -Please see the [Demo Docker Repository](https://hub.docker.com/r/rapidsai/rapidsai/), choosing a tag based on the NVIDIA CUDA version you’re running. This provides a ready to run Docker container with example notebooks and data, showcasing how you can utilize cuDF. +Please see the [Demo Docker Repository](https://hub.docker.com/r/rapidsai/rapidsai/), choosing a tag based on the NVIDIA CUDA version you're running. This provides a ready to run Docker container with example notebooks and data, showcasing how you can utilize cuDF. ## Installation diff --git a/build.sh b/build.sh index bda3d83798a..e62da9791da 100755 --- a/build.sh +++ b/build.sh @@ -64,7 +64,7 @@ BUILD_BENCHMARKS=OFF BUILD_ALL_GPU_ARCH=0 BUILD_NVTX=ON BUILD_TESTS=OFF -BUILD_DISABLE_DEPRECATION_WARNING=ON +BUILD_DISABLE_DEPRECATION_WARNINGS=ON BUILD_PER_THREAD_DEFAULT_STREAM=OFF BUILD_REPORT_METRICS=OFF BUILD_REPORT_INCL_CACHE_STATS=OFF @@ -216,7 +216,7 @@ if hasArg --opensource_nvcomp; then USE_PROPRIETARY_NVCOMP="OFF" fi if hasArg --show_depr_warn; then - BUILD_DISABLE_DEPRECATION_WARNING=OFF + BUILD_DISABLE_DEPRECATION_WARNINGS=OFF fi if hasArg --ptds; then BUILD_PER_THREAD_DEFAULT_STREAM=ON @@ -285,7 +285,7 @@ if buildAll || hasArg libcudf; then -DCUDF_USE_PROPRIETARY_NVCOMP=${USE_PROPRIETARY_NVCOMP} \ -DBUILD_TESTS=${BUILD_TESTS} \ -DBUILD_BENCHMARKS=${BUILD_BENCHMARKS} \ - -DDISABLE_DEPRECATION_WARNING=${BUILD_DISABLE_DEPRECATION_WARNING} \ + -DDISABLE_DEPRECATION_WARNINGS=${BUILD_DISABLE_DEPRECATION_WARNINGS} \ -DCUDF_USE_PER_THREAD_DEFAULT_STREAM=${BUILD_PER_THREAD_DEFAULT_STREAM} \ -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ ${EXTRA_CMAKE_ARGS} diff --git a/ci/benchmark/build.sh b/ci/benchmark/build.sh index ffa48797fe3..e186946a3d0 100755 --- a/ci/benchmark/build.sh +++ 
b/ci/benchmark/build.sh @@ -40,7 +40,7 @@ export LIBCUDF_KERNEL_CACHE_PATH="$HOME/.jitify-cache" export INSTALL_DASK_MAIN=0 # Dask version to install when `INSTALL_DASK_MAIN=0` -export DASK_STABLE_VERSION="2022.9.2" +export DASK_STABLE_VERSION="2022.11.1" function remove_libcudf_kernel_cache_dir { EXITCODE=$? @@ -82,8 +82,8 @@ conda install "rmm=$MINOR_VERSION.*" "cudatoolkit=$CUDA_REL" \ # Install the conda-forge or nightly version of dask and distributed if [[ "${INSTALL_DASK_MAIN}" == 1 ]]; then - gpuci_logger "gpuci_mamba_retry update dask" - gpuci_mamba_retry update dask + gpuci_logger "gpuci_mamba_retry install -c dask/label/dev 'dask/label/dev::dask' 'dask/label/dev::distributed'" + gpuci_mamba_retry install -c dask/label/dev "dask/label/dev::dask" "dask/label/dev::distributed" else gpuci_logger "gpuci_mamba_retry install conda-forge::dask=={$DASK_STABLE_VERSION} conda-forge::distributed=={$DASK_STABLE_VERSION} conda-forge::dask-core=={$DASK_STABLE_VERSION} --force-reinstall" gpuci_mamba_retry install conda-forge::dask=={$DASK_STABLE_VERSION} conda-forge::distributed=={$DASK_STABLE_VERSION} conda-forge::dask-core=={$DASK_STABLE_VERSION} --force-reinstall diff --git a/ci/checks/copyright.py b/ci/checks/copyright.py index 61e30d7922e..83f43183f71 100644 --- a/ci/checks/copyright.py +++ b/ci/checks/copyright.py @@ -68,20 +68,40 @@ def modifiedFiles(): we can read only the staged changes. """ repo = git.Repo() - # TARGET_BRANCH is defined in CI + # Use the environment variable TARGET_BRANCH (defined in CI) if possible target_branch = os.environ.get("TARGET_BRANCH") if target_branch is None: # Fall back to the closest branch if not on CI target_branch = repo.git.describe( all=True, tags=True, match="branch-*", abbrev=0 ).lstrip("heads/") - try: - # Use the tracking branch of the local reference if it exists + + upstream_target_branch = None + if target_branch in repo.heads: + # Use the tracking branch of the local reference if it exists. 
This + # returns None if no tracking branch is set. upstream_target_branch = repo.heads[target_branch].tracking_branch() - except IndexError: - # Fall back to the remote reference (this happens on CI because the - # only local branch reference is current-pr-branch) - upstream_target_branch = repo.remote().refs[target_branch] + if upstream_target_branch is None: + # Fall back to the remote with the newest target_branch. This code + # path is used on CI because the only local branch reference is + # current-pr-branch, and thus target_branch is not in repo.heads. + # This also happens if no tracking branch is defined for the local + # target_branch. We use the remote with the latest commit if + # multiple remotes are defined. + candidate_branches = [ + remote.refs[target_branch] for remote in repo.remotes + if target_branch in remote.refs + ] + if len(candidate_branches) > 0: + upstream_target_branch = sorted( + candidate_branches, + key=lambda branch: branch.commit.committed_datetime, + )[-1] + else: + # If no remotes are defined, try to use the local version of the + # target_branch. If this fails, the repo configuration must be very + # strange and we can fix this script on a case-by-case basis. 
+ upstream_target_branch = repo.heads[target_branch] merge_base = repo.merge_base("HEAD", upstream_target_branch.commit)[0] diff = merge_base.diff() changed_files = {f for f in diff if f.b_path is not None} diff --git a/ci/checks/headers_test.sh b/ci/checks/headers_test.sh index 502bdca0fa7..b859009a8c5 100755 --- a/ci/checks/headers_test.sh +++ b/ci/checks/headers_test.sh @@ -10,7 +10,7 @@ DIRNAMES="cudf cudf_test" # existence tests for lib${LIBNAME} for DIRNAME in ${DIRNAMES[@]}; do - HEADERS=`cd cpp && find include/${DIRNAME}/ -type f \( -iname "*.h" -o -iname "*.hpp" \) -printf " - test -f \\\$PREFIX/%p\n" | sort` + HEADERS=`cd cpp && find include/${DIRNAME} -type f \( -iname "*.h" -o -iname "*.hpp" \) -print | sed 's|^| - test -f $PREFIX/|' | sort` META_TESTS=`grep -E "test -f .*/include/${DIRNAME}/.*\.h(pp)?" conda/recipes/lib${LIBNAME}/meta.yaml | sort` HEADER_DIFF=`diff <(echo "$HEADERS") <(echo "$META_TESTS")` LIB_RETVAL=$? diff --git a/ci/checks/style.sh b/ci/checks/style.sh index 680321378c0..54cf3928cf4 100755 --- a/ci/checks/style.sh +++ b/ci/checks/style.sh @@ -14,7 +14,7 @@ LANG=C.UTF-8 . 
/opt/conda/etc/profile.d/conda.sh conda activate rapids -FORMAT_FILE_URL=https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-22.10/cmake-format-rapids-cmake.json +FORMAT_FILE_URL=https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-22.12/cmake-format-rapids-cmake.json export RAPIDS_CMAKE_FORMAT_FILE=/tmp/rapids_cmake_ci/cmake-formats-rapids-cmake.json mkdir -p $(dirname ${RAPIDS_CMAKE_FORMAT_FILE}) wget -O ${RAPIDS_CMAKE_FORMAT_FILE} ${FORMAT_FILE_URL} diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh index 560de6db187..2e12308169f 100755 --- a/ci/cpu/build.sh +++ b/ci/cpu/build.sh @@ -130,7 +130,7 @@ if [ "$BUILD_CUDF" == '1' ]; then gpuci_logger "Build conda pkg for custreamz" gpuci_conda_retry mambabuild --croot ${CONDA_BLD_DIR} conda/recipes/custreamz --python=$PYTHON $CONDA_BUILD_ARGS $CONDA_CHANNEL - + gpuci_logger "Build conda pkg for strings_udf" gpuci_conda_retry mambabuild --croot ${CONDA_BLD_DIR} conda/recipes/strings_udf --python=$PYTHON $CONDA_BUILD_ARGS $CONDA_CHANNEL diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index b6a27c31614..3a65130f922 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -35,10 +35,10 @@ unset GIT_DESCRIBE_TAG export INSTALL_DASK_MAIN=0 # Dask version to install when `INSTALL_DASK_MAIN=0` -export DASK_STABLE_VERSION="2022.9.2" +export DASK_STABLE_VERSION="2022.11.1" # ucx-py version -export UCX_PY_VERSION='0.28.*' +export UCX_PY_VERSION='0.29.*' ################################################################################ # TRAP - Setup trap for removing jitify cache @@ -96,12 +96,12 @@ function install_dask { gpuci_logger "Install the conda-forge or nightly version of dask and distributed" set -x if [[ "${INSTALL_DASK_MAIN}" == 1 ]]; then - gpuci_logger "gpuci_mamba_retry update dask" - gpuci_mamba_retry update dask + gpuci_logger "gpuci_mamba_retry install -c dask/label/dev 'dask/label/dev::dask' 'dask/label/dev::distributed'" + gpuci_mamba_retry install -c dask/label/dev "dask/label/dev::dask" 
"dask/label/dev::distributed" conda list else gpuci_logger "gpuci_mamba_retry install conda-forge::dask=={$DASK_STABLE_VERSION} conda-forge::distributed=={$DASK_STABLE_VERSION} conda-forge::dask-core=={$DASK_STABLE_VERSION} --force-reinstall" - gpuci_mamba_retry install conda-forge::dask=={$DASK_STABLE_VERSION} conda-forge::distributed=={$DASK_STABLE_VERSION} conda-forge::dask-core=={$DASK_STABLE_VERSION} --force-reinstall + gpuci_mamba_retry install conda-forge::dask==$DASK_STABLE_VERSION conda-forge::distributed==$DASK_STABLE_VERSION conda-forge::dask-core==$DASK_STABLE_VERSION --force-reinstall fi # Install the main version of streamz gpuci_logger "Install the main version of streamz" @@ -111,6 +111,8 @@ function install_dask { set +x } +install_dask + if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then gpuci_logger "Install dependencies" @@ -126,8 +128,6 @@ if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then # gpuci_conda_retry remove --force rapids-build-env rapids-notebook-env # gpuci_mamba_retry install -y "your-pkg=1.0.0" - install_dask - ################################################################################ # BUILD - Build libcudf, cuDF, libcudf_kafka, dask_cudf, and strings_udf from source ################################################################################ @@ -197,16 +197,31 @@ else # copied by CI from the upstream 11.5 jobs into $CONDA_ARTIFACT_PATH gpuci_logger "Installing cudf, dask-cudf, cudf_kafka, and custreamz" gpuci_mamba_retry install cudf dask-cudf cudf_kafka custreamz -c "${CONDA_BLD_DIR}" -c "${CONDA_ARTIFACT_PATH}" - + gpuci_logger "Check current conda environment" conda list --show-channel-urls gpuci_logger "GoogleTests" + + # Set up library for finding incorrect default stream usage. + cd "$WORKSPACE/cpp/tests/utilities/identify_stream_usage/" + mkdir build && cd build && cmake .. 
-GNinja && ninja && ninja test + STREAM_IDENTIFY_LIB="$WORKSPACE/cpp/tests/utilities/identify_stream_usage/build/libidentify_stream_usage.so" + # Run libcudf and libcudf_kafka gtests from libcudf-tests package for gt in "$CONDA_PREFIX/bin/gtests/libcudf"*/* ; do test_name=$(basename ${gt}) + echo "Running GoogleTest $test_name" - ${gt} --gtest_output=xml:"$WORKSPACE/test-results/" + if [[ ${test_name} == "SPAN_TEST" ]]; then + # This one test is specifically designed to test using a thrust device + # vector, so we expect and allow it to include default stream usage. + gtest_filter="SpanTest.CanConstructFromDeviceContainers" + GTEST_CUDF_STREAM_MODE="custom" LD_PRELOAD=${STREAM_IDENTIFY_LIB} ${gt} --gtest_output=xml:"$WORKSPACE/test-results/" --gtest_filter="-${gtest_filter}" + ${gt} --gtest_output=xml:"$WORKSPACE/test-results/" --gtest_filter="${gtest_filter}" + else + GTEST_CUDF_STREAM_MODE="custom" LD_PRELOAD=${STREAM_IDENTIFY_LIB} ${gt} --gtest_output=xml:"$WORKSPACE/test-results/" + fi done # Test libcudf (csv, orc, and parquet) with `LIBCUDF_CUFILE_POLICY=KVIKIO` @@ -267,6 +282,10 @@ conda list gpuci_logger "Python py.test for cuDF" py.test -n 8 --cache-clear --basetemp="$WORKSPACE/cudf-cuda-tmp" --ignore="$WORKSPACE/python/cudf/cudf/benchmarks" --junitxml="$WORKSPACE/junit-cudf.xml" -v --cov-config="$WORKSPACE/python/cudf/.coveragerc" --cov=cudf --cov-report=xml:"$WORKSPACE/python/cudf/cudf-coverage.xml" --cov-report term --dist=loadscope tests +gpuci_logger "Python py.tests for cuDF with spilling (CUDF_SPILL_DEVICE_LIMIT=1)" +# Due to time concerns, we only run tests marked "spilling" +CUDF_SPILL=on CUDF_SPILL_DEVICE_LIMIT=1 py.test -n 8 --cache-clear --basetemp="$WORKSPACE/cudf-cuda-tmp" --ignore="$WORKSPACE/python/cudf/cudf/benchmarks" -v --cov-config="$WORKSPACE/python/cudf/.coveragerc" --cov-append --cov=cudf --cov-report=xml:"$WORKSPACE/python/cudf/cudf-coverage.xml" --cov-report term --dist=loadscope -m spilling tests + cd "$WORKSPACE/python/dask_cudf" 
gpuci_logger "Python py.test for dask-cudf" py.test -n 8 --cache-clear --basetemp="$WORKSPACE/dask-cudf-cuda-tmp" --junitxml="$WORKSPACE/junit-dask-cudf.xml" -v --cov-config=.coveragerc --cov=dask_cudf --cov-report=xml:"$WORKSPACE/python/dask_cudf/dask-cudf-coverage.xml" --cov-report term dask_cudf @@ -280,22 +299,15 @@ py.test -n 8 --cache-clear --basetemp="$WORKSPACE/custreamz-cuda-tmp" --junitxml gpuci_logger "Installing strings_udf" gpuci_mamba_retry install strings_udf -c "${CONDA_BLD_DIR}" -c "${CONDA_ARTIFACT_PATH}" -# only install strings_udf after cuDF is finished testing without its presence cd "$WORKSPACE/python/strings_udf/strings_udf" gpuci_logger "Python py.test for strings_udf" +py.test -n 8 --cache-clear --basetemp="$WORKSPACE/strings-udf-cuda-tmp" --junitxml="$WORKSPACE/junit-strings-udf.xml" -v --cov-config=.coveragerc --cov=strings_udf --cov-report=xml:"$WORKSPACE/python/strings_udf/strings-udf-coverage.xml" --cov-report term tests -STRINGS_UDF_PYTEST_RETCODE=0 -py.test -n 8 --cache-clear --basetemp="$WORKSPACE/strings-udf-cuda-tmp" --junitxml="$WORKSPACE/junit-strings-udf.xml" -v --cov-config=.coveragerc --cov=strings_udf --cov-report=xml:"$WORKSPACE/python/strings_udf/strings-udf-coverage.xml" --cov-report term tests || STRINGS_UDF_PYTEST_RETCODE=$? +# retest cuDF UDFs +cd "$WORKSPACE/python/cudf/cudf" +gpuci_logger "Python py.test retest cuDF UDFs" +py.test -n 8 --cache-clear --basetemp="$WORKSPACE/cudf-cuda-strings-udf-tmp" --ignore="$WORKSPACE/python/cudf/cudf/benchmarks" --junitxml="$WORKSPACE/junit-cudf-strings-udf.xml" -v --cov-config="$WORKSPACE/python/cudf/.coveragerc" --cov=cudf --cov-report=xml:"$WORKSPACE/python/cudf/cudf-strings-udf-coverage.xml" --cov-report term --dist=loadscope tests/test_udf_masked_ops.py -if [ ${STRINGS_UDF_PYTEST_RETCODE} -eq 5 ]; then - echo "No strings UDF tests were run, but this script will continue to execute." 
-elif [ ${STRINGS_UDF_PYTEST_RETCODE} -ne 0 ]; then - exit ${STRINGS_UDF_PYTEST_RETCODE} -else - cd "$WORKSPACE/python/cudf/cudf" - gpuci_logger "Python py.test retest cuDF UDFs" - py.test tests/test_udf_masked_ops.py -n 8 --cache-clear -fi # Run benchmarks with both cudf and pandas to ensure compatibility is maintained. # Benchmarks are run in DEBUG_ONLY mode, meaning that only small data sizes are used. diff --git a/ci/gpu/java.sh b/ci/gpu/java.sh index b110303662b..e1d3bab2bc5 100755 --- a/ci/gpu/java.sh +++ b/ci/gpu/java.sh @@ -31,7 +31,7 @@ export GIT_DESCRIBE_TAG=`git describe --tags` export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'` # ucx-py version -export UCX_PY_VERSION='0.28.*' +export UCX_PY_VERSION='0.29.*' ################################################################################ # TRAP - Setup trap for removing jitify cache diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 8fad4e08c56..9dcfe093643 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -34,6 +34,9 @@ function sed_runner() { # cpp update sed_runner 's/'"VERSION ${CURRENT_SHORT_TAG}.*"'/'"VERSION ${NEXT_FULL_TAG}"'/g' cpp/CMakeLists.txt +# cpp stream testing update +sed_runner 's/'"VERSION ${CURRENT_SHORT_TAG}.*"'/'"VERSION ${NEXT_FULL_TAG}"'/g' cpp/tests/utilities/identify_stream_usage/CMakeLists.txt + # Python update sed_runner 's/'"cudf_version .*)"'/'"cudf_version ${NEXT_FULL_TAG})"'/g' python/cudf/CMakeLists.txt @@ -60,9 +63,10 @@ sed_runner 's/version = .*/version = '"'${NEXT_SHORT_TAG}'"'/g' docs/cudf/source sed_runner 's/release = .*/release = '"'${NEXT_FULL_TAG}'"'/g' docs/cudf/source/conf.py # bump rmm & dask-cuda -for FILE in conda/environments/*.yml; do - sed_runner "s/rmm=${CURRENT_SHORT_TAG}/rmm=${NEXT_SHORT_TAG}/g" ${FILE}; +for FILE in conda/environments/*.yaml dependencies.yaml; do sed_runner "s/dask-cuda=${CURRENT_SHORT_TAG}/dask-cuda=${NEXT_SHORT_TAG}/g" ${FILE}; + sed_runner 
"s/rmm=${CURRENT_SHORT_TAG}/rmm=${NEXT_SHORT_TAG}/g" ${FILE}; + sed_runner "s/rmm-cu11=${CURRENT_SHORT_TAG}/rmm-cu11=${NEXT_SHORT_TAG}/g" ${FILE}; done # Doxyfile update @@ -74,6 +78,7 @@ sed_runner "s/cudf=${CURRENT_SHORT_TAG}/cudf=${NEXT_SHORT_TAG}/g" README.md # Libcudf examples update sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_TAG}/" cpp/examples/basic/CMakeLists.txt +sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_TAG}/" cpp/examples/strings/CMakeLists.txt # ucx-py version update sed_runner "s/export UCX_PY_VERSION=.*/export UCX_PY_VERSION='${NEXT_UCX_PY_VERSION}'/g" ci/gpu/build.sh diff --git a/codecov.yml b/codecov.yml index f9d0f906807..344d4f3f04e 100644 --- a/codecov.yml +++ b/codecov.yml @@ -2,10 +2,10 @@ coverage: status: project: off - patch: on + patch: default: target: auto - threshold: 0% + threshold: 5% github_checks: annotations: true diff --git a/conda/environments/all_cuda-115_arch-x86_64.yaml b/conda/environments/all_cuda-115_arch-x86_64.yaml new file mode 100644 index 00000000000..cd900efced5 --- /dev/null +++ b/conda/environments/all_cuda-115_arch-x86_64.yaml @@ -0,0 +1,78 @@ +# This file is generated by `rapids-dependency-file-generator`. +# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
+channels: +- rapidsai +- rapidsai-nightly +- dask/label/dev +- conda-forge +- nvidia +dependencies: +- aiobotocore>=2.2.0 +- arrow-cpp=9 +- boto3>=1.21.21 +- botocore>=1.24.21 +- c-compiler +- cachetools +- cmake>=3.23.1,!=3.25.0 +- cubinlinker +- cuda-python>=11.7.1,<12.0 +- cudatoolkit=11.5 +- cupy>=9.5.0,<12.0.0a0 +- cxx-compiler +- cython>=0.29,<0.30 +- dask-cuda=22.12.* +- dask==2022.11.1 +- distributed==2022.11.1 +- dlpack>=0.5,<0.6.0a0 +- doxygen=1.8.20 +- fastavro>=0.22.9 +- fsspec>=0.6.0 +- gcc_linux-64=9.* +- hypothesis +- ipython +- librdkafka=1.7.0 +- mimesis>=4.1.0 +- moto>=4.0.8 +- myst-nb +- nbsphinx +- notebook>=0.5.0 +- numba>=0.56.2 +- numpy +- numpydoc +- nvcc_linux-64=11.5 +- nvtx>=0.2.1 +- packaging +- pandas>=1.0,<1.6.0dev0 +- pandoc<=2.0.0 +- pip +- pre-commit +- protobuf>=3.20.1,<3.21.0a0 +- ptxcompiler +- pyarrow=9.0.0 +- pydata-sphinx-theme +- pytest +- pytest-benchmark +- pytest-cases +- pytest-cov +- pytest-xdist +- python-confluent-kafka=1.7.0 +- python-snappy>=0.6.0 +- python>=3.8,<3.10 +- pytorch<1.12.0 +- rmm=22.12.* +- s3fs>=2022.3.0 +- scikit-build>=0.13.1 +- scipy +- sphinx +- sphinx-autobuild +- sphinx-copybutton +- sphinx-markdown-tables +- sphinxcontrib-websupport +- streamz +- sysroot_linux-64==2.17 +- transformers +- typing_extensions +- pip: + - git+https://github.com/python-streamz/streamz.git@master + - pyorc +name: all_cuda-115_arch-x86_64 diff --git a/conda/environments/cudf_dev_cuda11.5.yml b/conda/environments/cudf_dev_cuda11.5.yml deleted file mode 100644 index 37df0ba48dc..00000000000 --- a/conda/environments/cudf_dev_cuda11.5.yml +++ /dev/null @@ -1,92 +0,0 @@ -# Copyright (c) 2021-2022, NVIDIA CORPORATION. 
- -name: cudf_dev -channels: - - rapidsai - - rapidsai-nightly - - dask/label/dev - - conda-forge - - nvidia -dependencies: - - c-compiler - - cxx-compiler - - clang=11.1.0 - - clang-tools=11.1.0 - - cupy>=9.5.0,<12.0.0a0 - - rmm=22.10.* - - cmake>=3.23.1 - - cmake_setuptools>=0.1.3 - - scikit-build>=0.13.1 - - python>=3.8,<3.10 - - numba>=0.56.2 - - numpy - - pandas>=1.0,<1.6.0dev0 - - pyarrow=9 - - fastavro>=0.22.9 - - python-snappy>=0.6.0 - - notebook>=0.5.0 - - cython>=0.29,<0.30 - - fsspec>=0.6.0 - - pytest - - pytest-benchmark - - pytest-cases - - pytest-xdist - - sphinx - - sphinxcontrib-websupport - - nbsphinx - - numpydoc - - ipython - - pandoc<=2.0.0 - - cudatoolkit=11.5 - - cuda-python>=11.7.1,<12.0 - - pip - - flake8=3.8.3 - - black=22.3.0 - - isort=5.10.1 - - mypy=0.971 - - types-cachetools - - doxygen=1.8.20 - - pydocstyle=6.1.1 - - typing_extensions - - pre-commit - - dask==2022.9.2 - - distributed==2022.9.2 - - streamz - - arrow-cpp=9 - - dlpack>=0.5,<0.6.0a0 - - double-conversion - - rapidjson - - hypothesis - - sphinx-markdown-tables - - sphinx-copybutton - - sphinx-autobuild - - myst-nb - - scipy - - dask-cuda=22.10.* - - mimesis<4.1 - - packaging - - protobuf>=3.20.1,<3.21.0a0 - - nvtx>=0.2.1 - - cachetools - - transformers<=4.10.3 - - pydata-sphinx-theme - - librdkafka=1.7.0 - - python-confluent-kafka=1.7.0 - - moto>=3.1.6 - - boto3>=1.21.21 - - botocore>=1.24.21 - - aiobotocore>=2.2.0 - - s3fs>=2022.3.0 - - werkzeug<2.2.0 # Temporary transient dependency pinning to avoid URL-LIB3 + moto timeouts - - pytorch<1.12.0 - - pip: - - git+https://github.com/python-streamz/streamz.git@master - - pyorc - - cubinlinker # [linux64] - - gcc_linux-64=9.* # [linux64] - - sysroot_linux-64==2.17 # [linux64] - - nvcc_linux-64=11.5 - # Un-comment following lines for ARM specific packages. 
- # - gcc_linux-aarch64=9.* # [aarch64] - # - sysroot_linux-aarch64==2.17 # [aarch64] - # - nvcc_linux-aarch64=11.5 # [aarch64] diff --git a/conda/recipes/cudf/conda_build_config.yaml b/conda/recipes/cudf/conda_build_config.yaml index 0027a80f1ec..4feac647e8c 100644 --- a/conda/recipes/cudf/conda_build_config.yaml +++ b/conda/recipes/cudf/conda_build_config.yaml @@ -8,7 +8,7 @@ sysroot_version: - "2.17" cmake_version: - - ">=3.23.1" + - ">=3.23.1,!=3.25.0" cuda_compiler: - nvcc diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml index a65373efec3..4f7a4bbc268 100644 --- a/conda/recipes/cudf_kafka/meta.yaml +++ b/conda/recipes/cudf_kafka/meta.yaml @@ -22,7 +22,7 @@ build: requirements: build: - - cmake >=3.23.1 + - cmake >=3.23.1,!=3.25.0 - {{ compiler('c') }} - {{ compiler('cxx') }} - sysroot_{{ target_platform }} {{ sysroot_version }} diff --git a/conda/recipes/custreamz/meta.yaml b/conda/recipes/custreamz/meta.yaml index 596e5fde940..b5a27cdac92 100644 --- a/conda/recipes/custreamz/meta.yaml +++ b/conda/recipes/custreamz/meta.yaml @@ -29,8 +29,8 @@ requirements: - python - streamz - cudf ={{ version }} - - dask==2022.9.2 - - distributed==2022.9.2 + - dask==2022.11.1 + - distributed==2022.11.1 - python-confluent-kafka >=1.7.0,<1.8.0a0 - cudf_kafka ={{ version }} diff --git a/conda/recipes/dask-cudf/meta.yaml b/conda/recipes/dask-cudf/meta.yaml index 2d95151018b..d97a8448a53 100644 --- a/conda/recipes/dask-cudf/meta.yaml +++ b/conda/recipes/dask-cudf/meta.yaml @@ -24,14 +24,14 @@ requirements: host: - python - cudf ={{ version }} - - dask==2022.9.2 - - distributed==2022.9.2 + - dask==2022.11.1 + - distributed==2022.11.1 - cudatoolkit ={{ cuda_version }} run: - python - cudf ={{ version }} - - dask==2022.9.2 - - distributed==2022.9.2 + - dask==2022.11.1 + - distributed==2022.11.1 - {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }} test: # [linux64] diff --git a/conda/recipes/libcudf/conda_build_config.yaml 
b/conda/recipes/libcudf/conda_build_config.yaml index 7f5bf219f1f..5179cb55d84 100644 --- a/conda/recipes/libcudf/conda_build_config.yaml +++ b/conda/recipes/libcudf/conda_build_config.yaml @@ -11,7 +11,7 @@ sysroot_version: - "2.17" cmake_version: - - ">=3.23.1" + - ">=3.23.1,!=3.25.0" gtest_version: - "=1.10.0" diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index a417b407044..ceafc44ed10 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -78,7 +78,6 @@ outputs: - test -f $PREFIX/include/cudf/detail/binaryop.hpp - test -f $PREFIX/include/cudf/detail/calendrical_month_sequence.cuh - test -f $PREFIX/include/cudf/detail/concatenate.hpp - - test -f $PREFIX/include/cudf/detail/copy.cuh - test -f $PREFIX/include/cudf/detail/copy.hpp - test -f $PREFIX/include/cudf/detail/datetime.hpp - test -f $PREFIX/include/cudf/detail/fill.hpp @@ -113,9 +112,11 @@ outputs: - test -f $PREFIX/include/cudf/detail/transpose.hpp - test -f $PREFIX/include/cudf/detail/unary.hpp - test -f $PREFIX/include/cudf/detail/utilities/alignment.hpp - - test -f $PREFIX/include/cudf/detail/utilities/linked_column.hpp + - test -f $PREFIX/include/cudf/detail/utilities/default_stream.hpp - test -f $PREFIX/include/cudf/detail/utilities/int_fastdiv.h - test -f $PREFIX/include/cudf/detail/utilities/integer_utils.hpp + - test -f $PREFIX/include/cudf/detail/utilities/linked_column.hpp + - test -f $PREFIX/include/cudf/detail/utilities/pinned_allocator.hpp - test -f $PREFIX/include/cudf/detail/utilities/vector_factories.hpp - test -f $PREFIX/include/cudf/detail/utilities/visitor_overload.hpp - test -f $PREFIX/include/cudf/dictionary/detail/concatenate.hpp @@ -148,10 +149,12 @@ outputs: - test -f $PREFIX/include/cudf/io/json.hpp - test -f $PREFIX/include/cudf/io/orc.hpp - test -f $PREFIX/include/cudf/io/orc_metadata.hpp + - test -f $PREFIX/include/cudf/io/orc_types.hpp - test -f $PREFIX/include/cudf/io/parquet.hpp - test -f 
$PREFIX/include/cudf/io/text/byte_range_info.hpp - test -f $PREFIX/include/cudf/io/text/data_chunk_source.hpp - test -f $PREFIX/include/cudf/io/text/data_chunk_source_factories.hpp + - test -f $PREFIX/include/cudf/io/text/detail/bgzip_utils.hpp - test -f $PREFIX/include/cudf/io/text/detail/multistate.hpp - test -f $PREFIX/include/cudf/io/text/detail/tile_state.hpp - test -f $PREFIX/include/cudf/io/text/detail/trie.hpp @@ -232,6 +235,7 @@ outputs: - test -f $PREFIX/include/cudf/strings/json.hpp - test -f $PREFIX/include/cudf/strings/padding.hpp - test -f $PREFIX/include/cudf/strings/regex/flags.hpp + - test -f $PREFIX/include/cudf/strings/regex/regex_program.hpp - test -f $PREFIX/include/cudf/strings/repeat_strings.hpp - test -f $PREFIX/include/cudf/strings/replace.hpp - test -f $PREFIX/include/cudf/strings/replace_re.hpp @@ -250,7 +254,7 @@ outputs: - test -f $PREFIX/include/cudf/structs/structs_column_view.hpp - test -f $PREFIX/include/cudf/table/table.hpp - test -f $PREFIX/include/cudf/table/table_view.hpp - - test -f $PREFIX/include/cudf/tdigest/tdigest_column_view.cuh + - test -f $PREFIX/include/cudf/tdigest/tdigest_column_view.hpp - test -f $PREFIX/include/cudf/transform.hpp - test -f $PREFIX/include/cudf/transpose.hpp - test -f $PREFIX/include/cudf/types.hpp @@ -274,6 +278,7 @@ outputs: - test -f $PREFIX/include/cudf_test/file_utilities.hpp - test -f $PREFIX/include/cudf_test/io_metadata_utilities.hpp - test -f $PREFIX/include/cudf_test/iterator_utilities.hpp + - test -f $PREFIX/include/cudf_test/stream_checking_resource_adapter.hpp - test -f $PREFIX/include/cudf_test/table_utilities.hpp - test -f $PREFIX/include/cudf_test/timestamp_utilities.cuh - test -f $PREFIX/include/cudf_test/type_list_utilities.hpp diff --git a/conda/recipes/strings_udf/conda_build_config.yaml b/conda/recipes/strings_udf/conda_build_config.yaml index 0027a80f1ec..4feac647e8c 100644 --- a/conda/recipes/strings_udf/conda_build_config.yaml +++ 
b/conda/recipes/strings_udf/conda_build_config.yaml @@ -8,7 +8,7 @@ sysroot_version: - "2.17" cmake_version: - - ">=3.23.1" + - ">=3.23.1,!=3.25.0" cuda_compiler: - nvcc diff --git a/conda/recipes/strings_udf/meta.yaml b/conda/recipes/strings_udf/meta.yaml index a736edef24d..027a8a82aae 100644 --- a/conda/recipes/strings_udf/meta.yaml +++ b/conda/recipes/strings_udf/meta.yaml @@ -50,7 +50,7 @@ requirements: - cudf ={{ version }} - {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }} - cachetools - - ptxcompiler # [linux64] # CUDA enhanced compatibility. See https://github.com/rapidsai/ptxcompiler + - ptxcompiler >=0.7.0 # [linux64] # CUDA enhanced compatibility. See https://github.com/rapidsai/ptxcompiler test: # [linux64] requires: # [linux64] - cudatoolkit {{ cuda_version }}.* # [linux64] diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index fc848ac2f1e..6f4f42f6842 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -25,7 +25,7 @@ rapids_cuda_init_architectures(CUDF) project( CUDF - VERSION 22.10.00 + VERSION 22.12.00 LANGUAGES C CXX CUDA ) if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA" AND CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.5) @@ -48,6 +48,8 @@ option(BUILD_TESTS "Configure CMake to build tests" ON) option(BUILD_BENCHMARKS "Configure CMake to build (google & nvbench) benchmarks" OFF) option(BUILD_SHARED_LIBS "Build cuDF shared libraries" ON) option(JITIFY_USE_CACHE "Use a file cache for JIT compiled kernels" ON) +option(CUDF_BUILD_TESTUTIL "Whether to build the test utilities contained in libcudf" ON) +mark_as_advanced(CUDF_BUILD_TESTUTIL) option(CUDF_USE_PROPRIETARY_NVCOMP "Download and use NVCOMP with proprietary extensions" ON) option(CUDF_USE_ARROW_STATIC "Build and statically link Arrow libraries" OFF) option(CUDF_ENABLE_ARROW_ORC "Build the Arrow ORC adapter" OFF) @@ -60,14 +62,17 @@ option( stream to external libraries." OFF ) -option(DISABLE_DEPRECATION_WARNING "Disable warnings generated from deprecated declarations." 
OFF) +option(DISABLE_DEPRECATION_WARNINGS "Disable warnings generated from deprecated declarations." OFF) # Option to enable line info in CUDA device compilation to allow introspection when profiling / # memchecking option(CUDA_ENABLE_LINEINFO "Enable the -lineinfo option for nvcc (useful for cuda-memcheck / profiler" OFF ) +option(CUDA_WARNINGS_AS_ERRORS "Enable -Werror=all-warnings for all CUDA compilation" ON) # cudart can be statically linked or dynamically linked. The python ecosystem wants dynamic linking option(CUDA_STATIC_RUNTIME "Statically link the CUDA runtime" OFF) +option(USE_LIBARROW_FROM_PYARROW "Use the libarrow contained within pyarrow." OFF) +mark_as_advanced(USE_LIBARROW_FROM_PYARROW) message(VERBOSE "CUDF: Build with NVTX support: ${USE_NVTX}") message(VERBOSE "CUDF: Configure CMake to build tests: ${BUILD_TESTS}") @@ -79,7 +84,7 @@ message(VERBOSE "CUDF: Build and enable S3 filesystem support for Arrow: ${CUDF_ message(VERBOSE "CUDF: Build with per-thread default stream: ${CUDF_USE_PER_THREAD_DEFAULT_STREAM}") message( VERBOSE - "CUDF: Disable warnings generated from deprecated declarations: ${DISABLE_DEPRECATION_WARNING}" + "CUDF: Disable warnings generated from deprecated declarations: ${DISABLE_DEPRECATION_WARNINGS}" ) message( VERBOSE @@ -91,6 +96,12 @@ message(VERBOSE "CUDF: Statically link the CUDA runtime: ${CUDA_STATIC_RUNTIME}" rapids_cmake_build_type("Release") set(CUDF_BUILD_TESTS ${BUILD_TESTS}) set(CUDF_BUILD_BENCHMARKS ${BUILD_BENCHMARKS}) +if(BUILD_TESTS AND NOT CUDF_BUILD_TESTUTIL) + message( + FATAL_ERROR + "Tests cannot be built without building cudf test utils. 
Please set CUDF_BUILD_TESTUTIL=ON or BUILD_TESTS=OFF" + ) +endif() set(CUDF_CXX_FLAGS "") set(CUDF_CUDA_FLAGS "") @@ -124,24 +135,20 @@ rapids_find_package( ) include(cmake/Modules/ConfigureCUDA.cmake) # set other CUDA compilation flags -# ctest cuda memcheck -find_program(CUDA_SANITIZER compute-sanitizer) -set(MEMORYCHECK_COMMAND ${CUDA_SANITIZER}) -set(MEMORYCHECK_TYPE CudaSanitizer) -set(CUDA_SANITIZER_COMMAND_OPTIONS "--tool memcheck") - # ################################################################################################## # * dependencies ---------------------------------------------------------------------------------- # find zlib rapids_find_package(ZLIB REQUIRED) -# find Threads (needed by cudftestutil) -rapids_find_package( - Threads REQUIRED - BUILD_EXPORT_SET cudf-exports - INSTALL_EXPORT_SET cudf-exports -) +if(CUDF_BUILD_TESTUTIL) + # find Threads (needed by cudftestutil) + rapids_find_package( + Threads REQUIRED + BUILD_EXPORT_SET cudf-exports + INSTALL_EXPORT_SET cudf-exports + ) +endif() # add third party dependencies using CPM rapids_cpm_init() @@ -163,7 +170,9 @@ rapids_cpm_libcudacxx(BUILD_EXPORT_SET cudf-exports INSTALL_EXPORT_SET cudf-expo # find cuCollections Should come after including thrust and libcudacxx include(cmake/thirdparty/get_cucollections.cmake) # find or install GoogleTest -include(cmake/thirdparty/get_gtest.cmake) +if(CUDF_BUILD_TESTUTIL) + include(cmake/thirdparty/get_gtest.cmake) +endif() # preprocess jitify-able kernels include(cmake/Modules/JitifyPreprocessKernels.cmake) # find cuFile @@ -332,6 +341,7 @@ add_library( src/io/json/json_tree.cu src/io/json/nested_json_gpu.cu src/io/json/reader_impl.cu + src/io/json/experimental/byte_range_info.cu src/io/json/experimental/read_json.cpp src/io/orc/aggregate_orc_metadata.cpp src/io/orc/dict_enc.cu @@ -349,13 +359,17 @@ add_library( src/io/parquet/chunk_dict.cu src/io/parquet/page_enc.cu src/io/parquet/page_hdr.cu - src/io/parquet/reader_impl.cu + 
src/io/parquet/reader.cpp + src/io/parquet/reader_impl.cpp + src/io/parquet/reader_impl_helpers.cpp + src/io/parquet/reader_impl_preprocess.cu src/io/parquet/writer_impl.cu src/io/statistics/orc_column_statistics.cu src/io/statistics/parquet_column_statistics.cu src/io/text/byte_range_info.cpp src/io/text/data_chunk_source_factories.cpp src/io/text/bgzip_data_chunk_source.cu + src/io/text/bgzip_utils.cpp src/io/text/multibyte_split.cu src/io/utilities/column_buffer.cpp src/io/utilities/config_utils.cpp @@ -503,7 +517,8 @@ add_library( src/strings/padding.cu src/strings/json/json_path.cu src/strings/regex/regcomp.cpp - src/strings/regex/regexec.cu + src/strings/regex/regexec.cpp + src/strings/regex/regex_program.cpp src/strings/repeat_strings.cu src/strings/replace/backref_re.cu src/strings/replace/multi_re.cu @@ -616,9 +631,15 @@ target_compile_definitions( # Disable Jitify log printing. See https://github.com/NVIDIA/jitify/issues/79 target_compile_definitions(cudf PRIVATE "JITIFY_PRINT_LOG=0") -# Instruct jitify to use the kernel JIT cache if(JITIFY_USE_CACHE) - target_compile_definitions(cudf PUBLIC JITIFY_USE_CACHE "CUDF_VERSION=${PROJECT_VERSION}") + # Instruct src/jit/cache what version of cudf we are building so it can compute a cal-ver cache + # directory. 
We isolate this definition to the single source so it doesn't effect compiling + # caching for all of libcudf + set_property( + SOURCE src/jit/cache.cpp + APPEND + PROPERTY COMPILE_DEFINITIONS "JITIFY_USE_CACHE" "CUDF_VERSION=${PROJECT_VERSION}" + ) endif() # Per-thread default stream @@ -685,53 +706,61 @@ add_library(cudf::cudf ALIAS cudf) # ################################################################################################## # * build cudftestutil ---------------------------------------------------------------------------- -add_library( - cudftestutil STATIC - tests/io/metadata_utilities.cpp - tests/quantiles/tdigest_utilities.cu - tests/utilities/base_fixture.cpp - tests/utilities/column_utilities.cu - tests/utilities/table_utilities.cu - tests/strings/utilities.cpp -) +if(CUDF_BUILD_TESTUTIL) + add_library( + cudftestutil STATIC + tests/io/metadata_utilities.cpp + tests/utilities/base_fixture.cpp + tests/utilities/column_utilities.cu + tests/utilities/table_utilities.cu + tests/utilities/tdigest_utilities.cu + ) -set_target_properties( - cudftestutil - PROPERTIES BUILD_RPATH "\$ORIGIN" - INSTALL_RPATH "\$ORIGIN" - # set target compile options - CXX_STANDARD 17 - CXX_STANDARD_REQUIRED ON - CUDA_STANDARD 17 - CUDA_STANDARD_REQUIRED ON - POSITION_INDEPENDENT_CODE ON - INTERFACE_POSITION_INDEPENDENT_CODE ON -) + set_target_properties( + cudftestutil + PROPERTIES BUILD_RPATH "\$ORIGIN" + INSTALL_RPATH "\$ORIGIN" + # set target compile options + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON + POSITION_INDEPENDENT_CODE ON + INTERFACE_POSITION_INDEPENDENT_CODE ON + ) -target_compile_options( - cudftestutil PUBLIC "$:${CUDF_CXX_FLAGS}>>" - "$:${CUDF_CUDA_FLAGS}>>" -) + target_compile_options( + cudftestutil PUBLIC "$:${CUDF_CXX_FLAGS}>>" + "$:${CUDF_CUDA_FLAGS}>>" + ) -target_link_libraries( - cudftestutil - PUBLIC GTest::gmock GTest::gtest Threads::Threads cudf - PRIVATE $ -) + target_link_libraries( + 
cudftestutil + PUBLIC GTest::gmock GTest::gtest Threads::Threads cudf + PRIVATE $ + ) -target_include_directories( - cudftestutil PUBLIC "$" - "$" -) + target_include_directories( + cudftestutil PUBLIC "$" + "$" + ) -add_library(cudf::cudftestutil ALIAS cudftestutil) + add_library(cudf::cudftestutil ALIAS cudftestutil) +endif() # ################################################################################################## # * add tests ------------------------------------------------------------------------------------- if(CUDF_BUILD_TESTS) # include CTest module -- automatically calls enable_testing() include(CTest) + + # ctest cuda memcheck + find_program(CUDA_SANITIZER compute-sanitizer) + set(MEMORYCHECK_COMMAND ${CUDA_SANITIZER}) + set(MEMORYCHECK_TYPE CudaSanitizer) + set(CUDA_SANITIZER_COMMAND_OPTIONS "--tool memcheck") + # Always print verbose output when tests fail if run using `make test`. list(APPEND CMAKE_CTEST_ARGUMENTS "--output-on-failure") add_subdirectory(tests) @@ -742,13 +771,8 @@ endif() if(CUDF_BUILD_BENCHMARKS) # Find or install GoogleBench - rapids_cpm_find( - benchmark 1.5.2 - GIT_REPOSITORY https://github.com/google/benchmark.git - GIT_TAG v1.5.2 - GIT_SHALLOW TRUE - OPTIONS "BENCHMARK_ENABLE_TESTING OFF" "BENCHMARK_ENABLE_INSTALL OFF" - ) + include(${rapids-cmake-dir}/cpm/gbench.cmake) + rapids_cpm_gbench() # Find or install NVBench Temporarily force downloading of fmt because current versions of nvbench # do not support the latest version of fmt, which is automatically pulled into our conda @@ -759,11 +783,6 @@ if(CUDF_BUILD_BENCHMARKS) add_subdirectory(benchmarks) endif() -# build pretty-printer load script -if(Thrust_SOURCE_DIR AND rmm_SOURCE_DIR) - configure_file(scripts/load-pretty-printers.in load-pretty-printers @ONLY) -endif() - # ################################################################################################## # * install targets ------------------------------------------------------------------------------- 
rapids_cmake_install_lib_dir(lib_dir) @@ -783,24 +802,26 @@ install(DIRECTORY ${CUDF_SOURCE_DIR}/include/cudf ${CUDF_SOURCE_DIR}/include/cud ${CUDF_SOURCE_DIR}/include/nvtext DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} ) -install( - TARGETS cudftestutil - DESTINATION ${lib_dir} - EXPORT cudf-testing-exports -) +if(CUDF_BUILD_TESTUTIL) + install( + TARGETS cudftestutil + DESTINATION ${lib_dir} + EXPORT cudf-testing-exports + ) -install( - EXPORT cudf-testing-exports - FILE cudf-testing-targets.cmake - NAMESPACE cudf:: - DESTINATION "${lib_dir}/cmake/cudf" -) + install( + EXPORT cudf-testing-exports + FILE cudf-testing-targets.cmake + NAMESPACE cudf:: + DESTINATION "${lib_dir}/cmake/cudf" + ) -include("${rapids-cmake-dir}/export/write_dependencies.cmake") -rapids_export_write_dependencies( - INSTALL cudf-testing-exports - "${PROJECT_BINARY_DIR}/rapids-cmake/cudf/export/cudf-testing-dependencies.cmake" -) + include("${rapids-cmake-dir}/export/write_dependencies.cmake") + rapids_export_write_dependencies( + INSTALL cudf-testing-exports + "${PROJECT_BINARY_DIR}/rapids-cmake/cudf/export/cudf-testing-dependencies.cmake" + ) +endif() set(doc_string [=[ @@ -890,6 +911,7 @@ if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/cudf-testing-targets.cmake") endif() ]=] ) + string(APPEND build_code_string "${common_code_string}") rapids_export( @@ -901,15 +923,16 @@ rapids_export( FINAL_CODE_BLOCK build_code_string ) -export( - EXPORT cudf-testing-exports - FILE ${CUDF_BINARY_DIR}/cudf-testing-targets.cmake - NAMESPACE cudf:: -) -rapids_export_write_dependencies( - BUILD cudf-testing-exports "${CUDF_BINARY_DIR}/cudf-testing-dependencies.cmake" -) - +if(CUDF_BUILD_TESTUTIL) + export( + EXPORT cudf-testing-exports + FILE ${CUDF_BINARY_DIR}/cudf-testing-targets.cmake + NAMESPACE cudf:: + ) + rapids_export_write_dependencies( + BUILD cudf-testing-exports "${CUDF_BINARY_DIR}/cudf-testing-dependencies.cmake" + ) +endif() # 
################################################################################################## # * make documentation ---------------------------------------------------------------------------- @@ -927,3 +950,11 @@ add_custom_target( DEPENDS CUDF_DOXYGEN COMMENT "Custom command for building cudf doxygen docs." ) + +# ################################################################################################## +# * make gdb helper scripts ------------------------------------------------------------------------ + +# build pretty-printer load script +if(Thrust_SOURCE_DIR AND rmm_SOURCE_DIR) + configure_file(scripts/load-pretty-printers.in load-pretty-printers @ONLY) +endif() diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index d1ff177a25e..48c9ba5f185 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -169,7 +169,7 @@ ConfigureNVBench(SEARCH_NVBENCH search/contains.cpp) # ################################################################################################## # * sort benchmark -------------------------------------------------------------------------------- ConfigureBench(SORT_BENCH sort/rank.cpp sort/sort.cpp sort/sort_strings.cpp) -ConfigureNVBench(SORT_NVBENCH sort/sort_lists.cpp sort/sort_structs.cpp) +ConfigureNVBench(SORT_NVBENCH sort/segmented_sort.cpp sort/sort_lists.cpp sort/sort_structs.cpp) # ################################################################################################## # * quantiles benchmark @@ -301,7 +301,8 @@ ConfigureNVBench(NESTED_JSON_NVBENCH io/json/nested_json.cpp) # ################################################################################################## # * io benchmark --------------------------------------------------------------------- -ConfigureNVBench(MULTIBYTE_SPLIT_BENCHMARK io/text/multibyte_split.cpp) +ConfigureNVBench(MULTIBYTE_SPLIT_NVBENCH io/text/multibyte_split.cpp) +target_link_libraries(MULTIBYTE_SPLIT_NVBENCH PRIVATE 
ZLIB::ZLIB) add_custom_target( run_benchmarks diff --git a/cpp/benchmarks/column/concatenate.cpp b/cpp/benchmarks/column/concatenate.cpp index 99aa414fae3..3260159b409 100644 --- a/cpp/benchmarks/column/concatenate.cpp +++ b/cpp/benchmarks/column/concatenate.cpp @@ -49,7 +49,7 @@ static void BM_concatenate(benchmark::State& state) CUDF_CHECK_CUDA(0); for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::default_stream_value); + cuda_event_timer raii(state, true, cudf::get_default_stream()); auto result = cudf::concatenate(column_views); } @@ -91,7 +91,7 @@ static void BM_concatenate_tables(benchmark::State& state) CUDF_CHECK_CUDA(0); for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::default_stream_value); + cuda_event_timer raii(state, true, cudf::get_default_stream()); auto result = cudf::concatenate(table_views); } @@ -150,7 +150,7 @@ static void BM_concatenate_strings(benchmark::State& state) CUDF_CHECK_CUDA(0); for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::default_stream_value); + cuda_event_timer raii(state, true, cudf::get_default_stream()); auto result = cudf::concatenate(column_views); } diff --git a/cpp/benchmarks/common/generate_input.cu b/cpp/benchmarks/common/generate_input.cu index 890a78bb9bf..dee7e2b8586 100644 --- a/cpp/benchmarks/common/generate_input.cu +++ b/cpp/benchmarks/common/generate_input.cu @@ -207,7 +207,7 @@ struct random_value_fn()>> { } else { // Don't need a random seconds generator for sub-second intervals seconds_gen = [range_s](thrust::minstd_rand&, size_t size) { - rmm::device_uvector result(size, cudf::default_stream_value); + rmm::device_uvector result(size, cudf::get_default_stream()); thrust::fill(thrust::device, result.begin(), result.end(), range_s.second.count()); return result; }; @@ -225,7 +225,7 @@ struct random_value_fn()>> { { auto const sec = seconds_gen(engine, size); auto const ns = nanoseconds_gen(engine, size); - rmm::device_uvector result(size, 
cudf::default_stream_value); + rmm::device_uvector result(size, cudf::get_default_stream()); thrust::transform( thrust::device, sec.begin(), @@ -247,40 +247,33 @@ struct random_value_fn()>> { */ template struct random_value_fn()>> { - using rep = typename T::rep; - rep const lower_bound; - rep const upper_bound; - distribution_fn dist; + using DeviceType = cudf::device_storage_type_t; + DeviceType const lower_bound; + DeviceType const upper_bound; + distribution_fn dist; std::optional scale; - random_value_fn(distribution_params const& desc) + random_value_fn(distribution_params const& desc) : lower_bound{desc.lower_bound}, upper_bound{desc.upper_bound}, - dist{make_distribution(desc.id, desc.lower_bound, desc.upper_bound)} + dist{make_distribution(desc.id, desc.lower_bound, desc.upper_bound)} { } - rmm::device_uvector operator()(thrust::minstd_rand& engine, unsigned size) + [[nodiscard]] numeric::scale_type get_scale(thrust::minstd_rand& engine) { if (not scale.has_value()) { - int const max_scale = std::numeric_limits::digits10; + constexpr int max_scale = std::numeric_limits::digits10; std::uniform_int_distribution scale_dist{-max_scale, max_scale}; std::mt19937 engine_scale(engine()); scale = numeric::scale_type{scale_dist(engine_scale)}; } - auto const ints = dist(engine, size); - rmm::device_uvector result(size, cudf::default_stream_value); - // Clamp the generated random value to the specified range - thrust::transform(thrust::device, - ints.begin(), - ints.end(), - result.begin(), - [scale = *(this->scale), - upper_bound = this->upper_bound, - lower_bound = this->lower_bound] __device__(auto int_value) { - return T{std::clamp(int_value, lower_bound, upper_bound), scale}; - }); - return result; + return scale.value_or(numeric::scale_type{0}); + } + + rmm::device_uvector operator()(thrust::minstd_rand& engine, unsigned size) + { + return dist(engine, size); } }; @@ -314,7 +307,7 @@ struct random_value_fn>> { random_value_fn(distribution_params const& desc) : 
dist{[valid_prob = desc.probability_true](thrust::minstd_rand& engine, size_t size) -> rmm::device_uvector { - rmm::device_uvector result(size, cudf::default_stream_value); + rmm::device_uvector result(size, cudf::get_default_stream()); thrust::tabulate( thrust::device, result.begin(), result.end(), bool_generator(engine, valid_prob)); return result; @@ -366,7 +359,7 @@ rmm::device_uvector sample_indices_with_run_length(cudf::size_t return samples_indices[sample_idx]; }); rmm::device_uvector repeated_sample_indices(num_rows, - cudf::default_stream_value); + cudf::get_default_stream()); thrust::copy(thrust::device, avg_repeated_sample_indices_iterator, avg_repeated_sample_indices_iterator + num_rows, @@ -398,10 +391,18 @@ std::unique_ptr create_random_column(data_profile const& profile, distribution_params{1. - profile.get_null_probability().value_or(0)}); auto value_dist = random_value_fn{profile.get_distribution_params()}; + using DeviceType = cudf::device_storage_type_t; + cudf::data_type const dtype = [&]() { + if constexpr (cudf::is_fixed_point()) + return cudf::data_type{cudf::type_to_id(), value_dist.get_scale(engine)}; + else + return cudf::data_type{cudf::type_to_id()}; + }(); + // Distribution for picking elements from the array of samples auto const avg_run_len = profile.get_avg_run_length(); - rmm::device_uvector data(0, cudf::default_stream_value); - rmm::device_uvector null_mask(0, cudf::default_stream_value); + rmm::device_uvector data(0, cudf::get_default_stream()); + rmm::device_uvector null_mask(0, cudf::get_default_stream()); if (profile.get_cardinality() == 0 and avg_run_len == 1) { data = value_dist(engine, num_rows); @@ -412,12 +413,13 @@ std::unique_ptr create_random_column(data_profile const& profile, : profile_cardinality; }(); rmm::device_uvector samples_null_mask = valid_dist(engine, cardinality); - rmm::device_uvector samples = value_dist(engine, cardinality); + rmm::device_uvector samples = value_dist(engine, cardinality); + // generate 
n samples and gather. auto const sample_indices = sample_indices_with_run_length(avg_run_len, cardinality, num_rows, engine); - data = rmm::device_uvector(num_rows, cudf::default_stream_value); - null_mask = rmm::device_uvector(num_rows, cudf::default_stream_value); + data = rmm::device_uvector(num_rows, cudf::get_default_stream()); + null_mask = rmm::device_uvector(num_rows, cudf::get_default_stream()); thrust::gather( thrust::device, sample_indices.begin(), sample_indices.end(), samples.begin(), data.begin()); thrust::gather(thrust::device, @@ -427,11 +429,11 @@ std::unique_ptr create_random_column(data_profile const& profile, null_mask.begin()); } - auto [result_bitmask, null_count] = - cudf::detail::valid_if(null_mask.begin(), null_mask.end(), thrust::identity{}); + auto [result_bitmask, null_count] = cudf::detail::valid_if( + null_mask.begin(), null_mask.end(), thrust::identity{}, cudf::get_default_stream()); return std::make_unique( - cudf::data_type{cudf::type_to_id()}, + dtype, num_rows, data.release(), profile.get_null_probability().has_value() ? std::move(result_bitmask) : rmm::device_buffer{}); @@ -496,18 +498,18 @@ std::unique_ptr create_random_utf8_string_column(data_profile cons auto valid_lengths = thrust::make_transform_iterator( thrust::make_zip_iterator(thrust::make_tuple(lengths.begin(), null_mask.begin())), valid_or_zero{}); - rmm::device_uvector offsets(num_rows + 1, cudf::default_stream_value); + rmm::device_uvector offsets(num_rows + 1, cudf::get_default_stream()); thrust::exclusive_scan( thrust::device, valid_lengths, valid_lengths + lengths.size(), offsets.begin()); // offfsets are ready. 
auto chars_length = *thrust::device_pointer_cast(offsets.end() - 1); - rmm::device_uvector chars(chars_length, cudf::default_stream_value); + rmm::device_uvector chars(chars_length, cudf::get_default_stream()); thrust::for_each_n(thrust::device, thrust::make_zip_iterator(offsets.begin(), offsets.begin() + 1), num_rows, string_generator{chars.data(), engine}); - auto [result_bitmask, null_count] = - cudf::detail::valid_if(null_mask.begin(), null_mask.end() - 1, thrust::identity{}); + auto [result_bitmask, null_count] = cudf::detail::valid_if( + null_mask.begin(), null_mask.end() - 1, thrust::identity{}, cudf::get_default_stream()); return cudf::make_strings_column( num_rows, std::move(offsets), @@ -539,7 +541,8 @@ std::unique_ptr create_random_column(data_profi auto str_table = cudf::detail::gather(cudf::table_view{{sample_strings->view()}}, sample_indices, cudf::out_of_bounds_policy::DONT_CHECK, - cudf::detail::negative_index_policy::NOT_ALLOWED); + cudf::detail::negative_index_policy::NOT_ALLOWED, + cudf::get_default_stream()); return std::move(str_table->release()[0]); } @@ -623,7 +626,8 @@ std::unique_ptr create_random_column(data_profi auto [null_mask, null_count] = [&]() { if (profile.get_null_probability().has_value()) { auto valids = valid_dist(engine, num_rows); - return cudf::detail::valid_if(valids.begin(), valids.end(), thrust::identity{}); + return cudf::detail::valid_if( + valids.begin(), valids.end(), thrust::identity{}, cudf::get_default_stream()); } return std::pair{}; }(); @@ -706,8 +710,8 @@ std::unique_ptr create_random_column(data_profile auto offsets_column = std::make_unique( cudf::data_type{cudf::type_id::INT32}, num_rows + 1, offsets.release()); - auto [null_mask, null_count] = - cudf::detail::valid_if(valids.begin(), valids.end(), thrust::identity{}); + auto [null_mask, null_count] = cudf::detail::valid_if( + valids.begin(), valids.end(), thrust::identity{}, cudf::get_default_stream()); list_column = cudf::make_lists_column( num_rows, 
std::move(offsets_column), @@ -833,7 +837,8 @@ std::pair create_random_null_mask( } else { return cudf::detail::valid_if(thrust::make_counting_iterator(0), thrust::make_counting_iterator(size), - bool_generator{seed, 1.0 - *null_probability}); + bool_generator{seed, 1.0 - *null_probability}, + cudf::get_default_stream()); } } diff --git a/cpp/benchmarks/common/random_distribution_factory.cuh b/cpp/benchmarks/common/random_distribution_factory.cuh index 3cfab858793..36b968c6010 100644 --- a/cpp/benchmarks/common/random_distribution_factory.cuh +++ b/cpp/benchmarks/common/random_distribution_factory.cuh @@ -148,7 +148,7 @@ distribution_fn make_distribution(distribution_id dist_id, T lower_bound, T u case distribution_id::NORMAL: return [lower_bound, upper_bound, dist = make_normal_dist(lower_bound, upper_bound)]( thrust::minstd_rand& engine, size_t size) -> rmm::device_uvector { - rmm::device_uvector result(size, cudf::default_stream_value); + rmm::device_uvector result(size, cudf::get_default_stream()); thrust::tabulate(thrust::device, result.begin(), result.end(), @@ -158,7 +158,7 @@ distribution_fn make_distribution(distribution_id dist_id, T lower_bound, T u case distribution_id::UNIFORM: return [lower_bound, upper_bound, dist = make_uniform_dist(lower_bound, upper_bound)]( thrust::minstd_rand& engine, size_t size) -> rmm::device_uvector { - rmm::device_uvector result(size, cudf::default_stream_value); + rmm::device_uvector result(size, cudf::get_default_stream()); thrust::tabulate(thrust::device, result.begin(), result.end(), @@ -169,7 +169,7 @@ distribution_fn make_distribution(distribution_id dist_id, T lower_bound, T u // kind of exponential distribution from lower_bound to upper_bound. 
return [lower_bound, upper_bound, dist = geometric_distribution(lower_bound, upper_bound)]( thrust::minstd_rand& engine, size_t size) -> rmm::device_uvector { - rmm::device_uvector result(size, cudf::default_stream_value); + rmm::device_uvector result(size, cudf::get_default_stream()); thrust::tabulate(thrust::device, result.begin(), result.end(), diff --git a/cpp/benchmarks/copying/copy_if_else.cpp b/cpp/benchmarks/copying/copy_if_else.cpp index 82f4e15ecb0..9a153a7094c 100644 --- a/cpp/benchmarks/copying/copy_if_else.cpp +++ b/cpp/benchmarks/copying/copy_if_else.cpp @@ -45,7 +45,7 @@ static void BM_copy_if_else(benchmark::State& state, bool nulls) cudf::column_view lhs(input->view().column(0)); for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::default_stream_value); + cuda_event_timer raii(state, true, cudf::get_default_stream()); cudf::copy_if_else(lhs, rhs, decision); } } diff --git a/cpp/benchmarks/copying/shift.cu b/cpp/benchmarks/copying/shift.cu index a849b7da58b..957313134b3 100644 --- a/cpp/benchmarks/copying/shift.cu +++ b/cpp/benchmarks/copying/shift.cu @@ -24,7 +24,7 @@ template > std::unique_ptr make_scalar( T value = 0, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto s = new ScalarType(value, true, stream, mr); diff --git a/cpp/benchmarks/groupby/group_max.cpp b/cpp/benchmarks/groupby/group_max.cpp index 8454d1afee6..4956cce0daf 100644 --- a/cpp/benchmarks/groupby/group_max.cpp +++ b/cpp/benchmarks/groupby/group_max.cpp @@ -52,7 +52,7 @@ void bench_groupby_max(nvbench::state& state, nvbench::type_list) requests[0].values = vals->view(); requests[0].aggregations.push_back(cudf::make_max_aggregation()); - state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::default_stream_value.value())); + 
state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto const result = gb_obj.aggregate(requests); }); } diff --git a/cpp/benchmarks/groupby/group_nunique.cpp b/cpp/benchmarks/groupby/group_nunique.cpp index 1f95b5d5899..05698c04058 100644 --- a/cpp/benchmarks/groupby/group_nunique.cpp +++ b/cpp/benchmarks/groupby/group_nunique.cpp @@ -65,7 +65,7 @@ void bench_groupby_nunique(nvbench::state& state, nvbench::type_list) auto const requests = make_aggregation_request_vector( *vals, cudf::make_nunique_aggregation()); - state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::default_stream_value.value())); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto const result = gb_obj.aggregate(requests); }); } diff --git a/cpp/benchmarks/groupby/group_struct_keys.cpp b/cpp/benchmarks/groupby/group_struct_keys.cpp index 227a4d5259a..cc6f0faaf41 100644 --- a/cpp/benchmarks/groupby/group_struct_keys.cpp +++ b/cpp/benchmarks/groupby/group_struct_keys.cpp @@ -83,7 +83,7 @@ void bench_groupby_struct_keys(nvbench::state& state) requests[0].aggregations.push_back(cudf::make_min_aggregation()); // Set up nvbench default stream - auto stream = cudf::default_stream_value; + auto stream = cudf::get_default_stream(); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); state.exec(nvbench::exec_tag::sync, diff --git a/cpp/benchmarks/hashing/hash.cpp b/cpp/benchmarks/hashing/hash.cpp index e997bf296c5..1053c2e4694 100644 --- a/cpp/benchmarks/hashing/hash.cpp +++ b/cpp/benchmarks/hashing/hash.cpp @@ -35,7 +35,7 @@ static void BM_hash(benchmark::State& state, cudf::hash_id hid, contains_nulls h data->get_column(0).set_null_mask(rmm::device_buffer{}, 0); for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::default_stream_value); + 
cuda_event_timer raii(state, true, cudf::get_default_stream()); cudf::hash(data->view(), hid); } } diff --git a/cpp/benchmarks/io/csv/csv_reader_input.cpp b/cpp/benchmarks/io/csv/csv_reader_input.cpp index 4f895e13f1b..27fea856332 100644 --- a/cpp/benchmarks/io/csv/csv_reader_input.cpp +++ b/cpp/benchmarks/io/csv/csv_reader_input.cpp @@ -47,7 +47,7 @@ void csv_read_common(DataType const& data_types, cudf::io::csv_reader_options::builder(source_sink.make_source_info()); auto const mem_stats_logger = cudf::memory_stats_logger(); // init stats logger - state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::default_stream_value.value())); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { try_drop_l3_cache(); // Drop L3 cache for accurate measurement diff --git a/cpp/benchmarks/io/csv/csv_reader_options.cpp b/cpp/benchmarks/io/csv/csv_reader_options.cpp index b569dc65f3d..04522c16d5c 100644 --- a/cpp/benchmarks/io/csv/csv_reader_options.cpp +++ b/cpp/benchmarks/io/csv/csv_reader_options.cpp @@ -66,7 +66,7 @@ void BM_csv_read_varying_options( size_t const chunk_size = source_sink.size() / num_chunks; cudf::size_type const chunk_row_cnt = view.num_rows() / num_chunks; auto const mem_stats_logger = cudf::memory_stats_logger(); - state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::default_stream_value.value())); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { try_drop_l3_cache(); // Drop L3 cache for accurate measurement diff --git a/cpp/benchmarks/io/csv/csv_writer.cpp b/cpp/benchmarks/io/csv/csv_writer.cpp index d02305cf478..54a86094eb7 100644 --- a/cpp/benchmarks/io/csv/csv_writer.cpp +++ b/cpp/benchmarks/io/csv/csv_writer.cpp @@ -21,13 +21,11 @@ #include -// to 
enable, run cmake with -DBUILD_BENCHMARKS=ON - +// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to +// run on most GPUs, but large enough to allow highest throughput constexpr size_t data_size = 256 << 20; constexpr cudf::size_type num_cols = 64; -namespace cudf_io = cudf::io; - class CsvWrite : public cudf::benchmark { }; @@ -44,9 +42,9 @@ void BM_csv_write_varying_inout(benchmark::State& state) auto mem_stats_logger = cudf::memory_stats_logger(); for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 - cudf_io::csv_writer_options options = - cudf_io::csv_writer_options::builder(source_sink.make_sink_info(), view).include_header(true); - cudf_io::write_csv(options); + cudf::io::csv_writer_options options = + cudf::io::csv_writer_options::builder(source_sink.make_sink_info(), view); + cudf::io::write_csv(options); } state.SetBytesProcessed(data_size * state.iterations()); @@ -74,12 +72,11 @@ void BM_csv_write_varying_options(benchmark::State& state) auto mem_stats_logger = cudf::memory_stats_logger(); for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 - cudf_io::csv_writer_options options = - cudf_io::csv_writer_options::builder(source_sink.make_sink_info(), view) - .include_header(true) + cudf::io::csv_writer_options options = + cudf::io::csv_writer_options::builder(source_sink.make_sink_info(), view) .na_rep(na_per) .rows_per_chunk(rows_per_chunk); - cudf_io::write_csv(options); + cudf::io::write_csv(options); } state.SetBytesProcessed(data_size * state.iterations()); diff --git a/cpp/benchmarks/io/cuio_common.cpp b/cpp/benchmarks/io/cuio_common.cpp index da64c1bbf3c..1a9c7153644 100644 --- a/cpp/benchmarks/io/cuio_common.cpp +++ b/cpp/benchmarks/io/cuio_common.cpp @@ -23,8 +23,6 @@ #include -namespace cudf_io = cudf::io; - temp_directory const cuio_source_sink_pair::tmpdir{"cudf_gbench"}; std::string random_file_in_dir(std::string 
const& dir_path) @@ -43,21 +41,21 @@ cuio_source_sink_pair::cuio_source_sink_pair(io_type type) { } -cudf_io::source_info cuio_source_sink_pair::make_source_info() +cudf::io::source_info cuio_source_sink_pair::make_source_info() { switch (type) { - case io_type::FILEPATH: return cudf_io::source_info(file_name); - case io_type::HOST_BUFFER: return cudf_io::source_info(buffer.data(), buffer.size()); + case io_type::FILEPATH: return cudf::io::source_info(file_name); + case io_type::HOST_BUFFER: return cudf::io::source_info(buffer.data(), buffer.size()); default: CUDF_FAIL("invalid input type"); } } -cudf_io::sink_info cuio_source_sink_pair::make_sink_info() +cudf::io::sink_info cuio_source_sink_pair::make_sink_info() { switch (type) { - case io_type::VOID: return cudf_io::sink_info(&void_sink); - case io_type::FILEPATH: return cudf_io::sink_info(file_name); - case io_type::HOST_BUFFER: return cudf_io::sink_info(&buffer); + case io_type::VOID: return cudf::io::sink_info(&void_sink); + case io_type::FILEPATH: return cudf::io::sink_info(file_name); + case io_type::HOST_BUFFER: return cudf::io::sink_info(&buffer); default: CUDF_FAIL("invalid output type"); } } diff --git a/cpp/benchmarks/io/json/nested_json.cpp b/cpp/benchmarks/io/json/nested_json.cpp index bb3e13a3a01..1fe0218bb0f 100644 --- a/cpp/benchmarks/io/json/nested_json.cpp +++ b/cpp/benchmarks/io/json/nested_json.cpp @@ -68,16 +68,16 @@ void BM_NESTED_JSON(nvbench::state& state) auto const string_size{size_type(state.get_int64("string_size"))}; auto const default_options = cudf::io::json_reader_options{}; - auto input = make_test_json_data(string_size, cudf::default_stream_value); + auto input = make_test_json_data(string_size, cudf::get_default_stream()); state.add_element_count(input.size()); // Run algorithm auto const mem_stats_logger = cudf::memory_stats_logger(); - state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::default_stream_value.value())); + 
state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { // Allocate device-side temporary storage & run algorithm cudf::io::json::detail::device_parse_nested_json( - input, default_options, cudf::default_stream_value); + input, default_options, cudf::get_default_stream()); }); auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); diff --git a/cpp/benchmarks/io/orc/orc_reader_input.cpp b/cpp/benchmarks/io/orc/orc_reader_input.cpp index 46f14cc4874..f1aaf506a60 100644 --- a/cpp/benchmarks/io/orc/orc_reader_input.cpp +++ b/cpp/benchmarks/io/orc/orc_reader_input.cpp @@ -25,6 +25,8 @@ #include +// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to +// run on most GPUs, but large enough to allow highest throughput constexpr int64_t data_size = 512 << 20; constexpr cudf::size_type num_cols = 64; @@ -38,7 +40,7 @@ void orc_read_common(cudf::io::orc_writer_options const& opts, cudf::io::orc_reader_options::builder(source_sink.make_source_info()); auto mem_stats_logger = cudf::memory_stats_logger(); // init stats logger - state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::default_stream_value.value())); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { try_drop_l3_cache(); diff --git a/cpp/benchmarks/io/orc/orc_reader_options.cpp b/cpp/benchmarks/io/orc/orc_reader_options.cpp index da64fdcac3a..1b7d33ccd19 100644 --- a/cpp/benchmarks/io/orc/orc_reader_options.cpp +++ b/cpp/benchmarks/io/orc/orc_reader_options.cpp @@ -21,17 +21,27 @@ #include #include +#include #include #include +// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to +// run on most GPUs, but large enough to allow highest throughput 
constexpr int64_t data_size = 512 << 20; +// The number of separate read calls to use when reading files in multiple chunks +// Each call reads roughly equal amounts of data +constexpr int32_t chunked_read_num_chunks = 8; std::vector get_col_names(cudf::io::source_info const& source) { - cudf::io::orc_reader_options const read_options = - cudf::io::orc_reader_options::builder(source).num_rows(1); - return cudf::io::read_orc(read_options).metadata.column_names; + auto const top_lvl_cols = cudf::io::read_orc_metadata(source).schema().root().children(); + std::vector col_names; + std::transform(top_lvl_cols.cbegin(), + top_lvl_cols.cend(), + std::back_inserter(col_names), + [](auto const& col_meta) { return col_meta.name(); }); + return col_names; } template ; +using row_selections = + nvbench::enum_type_list; NVBENCH_BENCH_TYPES(BM_orc_read_varying_options, NVBENCH_TYPE_AXES(col_selections, @@ -141,11 +149,22 @@ NVBENCH_BENCH_TYPES(BM_orc_read_varying_options, {"column_selection", "row_selection", "uses_index", "uses_numpy_dtype", "timestamp_type"}) .set_min_samples(4); +NVBENCH_BENCH_TYPES(BM_orc_read_varying_options, + NVBENCH_TYPE_AXES(nvbench::enum_type_list, + row_selections, + nvbench::enum_type_list, + nvbench::enum_type_list, + nvbench::enum_type_list)) + .set_name("orc_read_row_selection") + .set_type_axes_names( + {"column_selection", "row_selection", "uses_index", "uses_numpy_dtype", "timestamp_type"}) + .set_min_samples(4); + NVBENCH_BENCH_TYPES( BM_orc_read_varying_options, NVBENCH_TYPE_AXES( nvbench::enum_type_list, - nvbench::enum_type_list, + nvbench::enum_type_list, nvbench::enum_type_list, nvbench::enum_type_list, nvbench::enum_type_list)) diff --git a/cpp/benchmarks/io/orc/orc_writer.cpp b/cpp/benchmarks/io/orc/orc_writer.cpp index ddf699b0eaa..545f8d10122 100644 --- a/cpp/benchmarks/io/orc/orc_writer.cpp +++ b/cpp/benchmarks/io/orc/orc_writer.cpp @@ -38,6 +38,8 @@ NVBENCH_DECLARE_ENUM_TYPE_STRINGS( }, [](auto) { return std::string{}; }) +// Size 
of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to +// run on most GPUs, but large enough to allow highest throughput constexpr int64_t data_size = 512 << 20; constexpr cudf::size_type num_cols = 64; @@ -61,7 +63,7 @@ void BM_orc_write_encode(nvbench::state& state, nvbench::type_list -// to enable, run cmake with -DBUILD_BENCHMARKS=ON - +// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to +// run on most GPUs, but large enough to allow highest throughput constexpr int64_t data_size = 512 << 20; void nvbench_orc_write(nvbench::state& state) @@ -58,7 +58,7 @@ void nvbench_orc_write(nvbench::state& state) size_t encoded_file_size = 0; - state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::default_stream_value.value())); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::timer | nvbench::exec_tag::sync, [&](nvbench::launch& launch, auto& timer) { cuio_source_sink_pair source_sink(io_type::VOID); @@ -112,7 +112,7 @@ void nvbench_orc_chunked_write(nvbench::state& state) size_t encoded_file_size = 0; - state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::default_stream_value.value())); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec( nvbench::exec_tag::timer | nvbench::exec_tag::sync, [&](nvbench::launch& launch, auto& timer) { cuio_source_sink_pair source_sink(io_type::VOID); diff --git a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp index 6477f611421..7a4e649d4fb 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp @@ -25,6 +25,8 @@ #include +// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to +// run on most GPUs, but large enough to allow highest throughput constexpr size_t 
data_size = 512 << 20; constexpr cudf::size_type num_cols = 64; @@ -38,7 +40,7 @@ void parquet_read_common(cudf::io::parquet_writer_options const& write_opts, cudf::io::parquet_reader_options::builder(source_sink.make_source_info()); auto mem_stats_logger = cudf::memory_stats_logger(); - state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::default_stream_value.value())); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { try_drop_l3_cache(); diff --git a/cpp/benchmarks/io/parquet/parquet_reader_options.cpp b/cpp/benchmarks/io/parquet/parquet_reader_options.cpp index 3c1e41c89b8..b5e4f6d8f2b 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_options.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_options.cpp @@ -25,6 +25,8 @@ #include +// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to +// run on most GPUs, but large enough to allow highest throughput constexpr std::size_t data_size = 512 << 20; constexpr std::size_t row_group_size = 128 << 20; @@ -86,7 +88,7 @@ void BM_parquet_read_options(nvbench::state& state, auto constexpr num_chunks = 1; auto mem_stats_logger = cudf::memory_stats_logger(); - state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::default_stream_value.value())); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec( nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { try_drop_l3_cache(); diff --git a/cpp/benchmarks/io/parquet/parquet_writer.cpp b/cpp/benchmarks/io/parquet/parquet_writer.cpp index 747dd5c086c..a0b076abfda 100644 --- a/cpp/benchmarks/io/parquet/parquet_writer.cpp +++ b/cpp/benchmarks/io/parquet/parquet_writer.cpp @@ -25,8 +25,6 @@ #include -// to enable, run cmake with -DBUILD_BENCHMARKS=ON - NVBENCH_DECLARE_ENUM_TYPE_STRINGS( 
cudf::io::statistics_freq, [](auto value) { @@ -34,11 +32,14 @@ NVBENCH_DECLARE_ENUM_TYPE_STRINGS( case cudf::io::statistics_freq::STATISTICS_NONE: return "STATISTICS_NONE"; case cudf::io::statistics_freq::STATISTICS_ROWGROUP: return "STATISTICS_ROWGROUP"; case cudf::io::statistics_freq::STATISTICS_PAGE: return "STATISTICS_PAGE"; + case cudf::io::statistics_freq::STATISTICS_COLUMN: return "STATISTICS_COLUMN"; default: return "Unknown"; } }, [](auto) { return std::string{}; }) +// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to +// run on most GPUs, but large enough to allow highest throughput constexpr size_t data_size = 512 << 20; constexpr cudf::size_type num_cols = 64; @@ -62,7 +63,7 @@ void BM_parq_write_encode(nvbench::state& state, nvbench::type_list; NVBENCH_BENCH_TYPES(BM_parq_write_encode, NVBENCH_TYPE_AXES(d_type_list)) diff --git a/cpp/benchmarks/io/parquet/parquet_writer_chunks.cpp b/cpp/benchmarks/io/parquet/parquet_writer_chunks.cpp index 6c8500a2a70..11b29cc2297 100644 --- a/cpp/benchmarks/io/parquet/parquet_writer_chunks.cpp +++ b/cpp/benchmarks/io/parquet/parquet_writer_chunks.cpp @@ -27,8 +27,8 @@ #include -// to enable, run cmake with -DBUILD_BENCHMARKS=ON - +// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to +// run on most GPUs, but large enough to allow highest throughput constexpr int64_t data_size = 512 << 20; void PQ_write(nvbench::state& state) @@ -44,7 +44,7 @@ void PQ_write(nvbench::state& state) std::size_t encoded_file_size = 0; auto const mem_stats_logger = cudf::memory_stats_logger(); - state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::default_stream_value.value())); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::timer | nvbench::exec_tag::sync, [&](nvbench::launch& launch, auto& timer) { cuio_source_sink_pair source_sink(io_type::VOID); @@ -81,7 +81,7 @@ 
void PQ_write_chunked(nvbench::state& state) auto const mem_stats_logger = cudf::memory_stats_logger(); std::size_t encoded_file_size = 0; - state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::default_stream_value.value())); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec( nvbench::exec_tag::timer | nvbench::exec_tag::sync, [&](nvbench::launch& launch, auto& timer) { cuio_source_sink_pair source_sink(io_type::VOID); diff --git a/cpp/benchmarks/io/text/multibyte_split.cpp b/cpp/benchmarks/io/text/multibyte_split.cpp index 4865d11ae8b..75db8e36689 100644 --- a/cpp/benchmarks/io/text/multibyte_split.cpp +++ b/cpp/benchmarks/io/text/multibyte_split.cpp @@ -23,8 +23,10 @@ #include #include +#include #include #include +#include #include #include #include @@ -32,7 +34,6 @@ #include #include -#include #include #include @@ -40,10 +41,26 @@ #include #include #include +#include temp_directory const temp_dir("cudf_nvbench"); -enum class data_chunk_source_type { device, file, host, host_pinned }; +enum class data_chunk_source_type { device, file, file_datasource, host, host_pinned, file_bgzip }; + +NVBENCH_DECLARE_ENUM_TYPE_STRINGS( + data_chunk_source_type, + [](auto value) { + switch (value) { + case data_chunk_source_type::device: return "device"; + case data_chunk_source_type::file: return "file"; + case data_chunk_source_type::file_datasource: return "file_datasource"; + case data_chunk_source_type::host: return "host"; + case data_chunk_source_type::host_pinned: return "host_pinned"; + case data_chunk_source_type::file_bgzip: return "file_bgzip"; + default: return "Unknown"; + } + }, + [](auto) { return std::string{}; }) static cudf::string_scalar create_random_input(int32_t num_chars, double delim_factor, @@ -78,15 +95,34 @@ static cudf::string_scalar create_random_input(int32_t num_chars, return cudf::string_scalar(std::move(*chars_buffer)); } -static void bench_multibyte_split(nvbench::state& state) +static 
void write_bgzip_file(cudf::host_span host_data, std::ostream& output_stream) +{ + // a bit of variability with a decent amount of padding so we don't overflow 16 bit block sizes + std::uniform_int_distribution chunk_size_dist{64000, 65000}; + std::default_random_engine rng{}; + std::size_t pos = 0; + while (pos < host_data.size()) { + auto const remainder = host_data.size() - pos; + auto const chunk_size = std::min(remainder, chunk_size_dist(rng)); + cudf::io::text::detail::bgzip::write_compressed_block(output_stream, + {host_data.data() + pos, chunk_size}); + pos += chunk_size; + } + // empty block denotes EOF + cudf::io::text::detail::bgzip::write_uncompressed_block(output_stream, {}); +} + +template +static void bench_multibyte_split(nvbench::state& state, + nvbench::type_list>) { cudf::rmm_pool_raii pool_raii; - auto const source_type = static_cast(state.get_int64("source_type")); - auto const delim_size = state.get_int64("delim_size"); - auto const delim_percent = state.get_int64("delim_percent"); - auto const file_size_approx = state.get_int64("size_approx"); + auto const delim_size = state.get_int64("delim_size"); + auto const delim_percent = state.get_int64("delim_percent"); + auto const file_size_approx = state.get_int64("size_approx"); auto const byte_range_percent = state.get_int64("byte_range_percent"); + auto const strip_delimiters = bool(state.get_int64("strip_delimiters")); auto const byte_range_factor = static_cast(byte_range_percent) / 100; CUDF_EXPECTS(delim_percent >= 1, "delimiter percent must be at least 1"); @@ -99,15 +135,16 @@ static void bench_multibyte_split(nvbench::state& state) std::iota(delim.begin(), delim.end(), '1'); auto const delim_factor = static_cast(delim_percent) / 100; - auto device_input = create_random_input(file_size_approx, delim_factor, 0.05, delim); - auto host_input = std::vector{}; - auto host_pinned_input = - thrust::host_vector>{}; + std::unique_ptr datasource; + auto device_input = 
create_random_input(file_size_approx, delim_factor, 0.05, delim); + auto host_input = std::vector{}; + auto host_pinned_input = thrust::host_vector>{}; - if (source_type == data_chunk_source_type::host || source_type == data_chunk_source_type::file) { + if (source_type != data_chunk_source_type::device && + source_type != data_chunk_source_type::host_pinned) { host_input = cudf::detail::make_std_vector_sync( {device_input.data(), static_cast(device_input.size())}, - cudf::default_stream_value); + cudf::get_default_stream()); } if (source_type == data_chunk_source_type::host_pinned) { host_pinned_input.resize(static_cast(device_input.size())); @@ -119,11 +156,17 @@ static void bench_multibyte_split(nvbench::state& state) auto source = [&] { switch (source_type) { - case data_chunk_source_type::file: { + case data_chunk_source_type::file: + case data_chunk_source_type::file_datasource: { auto const temp_file_name = random_file_in_dir(temp_dir.path()); std::ofstream(temp_file_name, std::ofstream::out) .write(host_input.data(), host_input.size()); - return cudf::io::text::make_source_from_file(temp_file_name); + if (source_type == data_chunk_source_type::file) { + return cudf::io::text::make_source_from_file(temp_file_name); + } else { + datasource = cudf::io::datasource::create(temp_file_name); + return cudf::io::text::make_source(*datasource); + } } case data_chunk_source_type::host: // return cudf::io::text::make_source(host_input); @@ -131,6 +174,14 @@ static void bench_multibyte_split(nvbench::state& state) return cudf::io::text::make_source(host_pinned_input); case data_chunk_source_type::device: // return cudf::io::text::make_source(device_input); + case data_chunk_source_type::file_bgzip: { + auto const temp_file_name = random_file_in_dir(temp_dir.path()); + { + std::ofstream output_stream(temp_file_name, std::ofstream::out); + write_bgzip_file(host_input, output_stream); + } + return cudf::io::text::make_source_from_bgzip_file(temp_file_name); + } default: 
CUDF_FAIL(); } }(); @@ -139,12 +190,13 @@ static void bench_multibyte_split(nvbench::state& state) auto const range_size = static_cast(device_input.size() * byte_range_factor); auto const range_offset = (device_input.size() - range_size) / 2; cudf::io::text::byte_range_info range{range_offset, range_size}; + cudf::io::text::parse_options options{range, strip_delimiters}; std::unique_ptr output; - state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::default_stream_value.value())); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { try_drop_l3_cache(); - output = cudf::io::text::multibyte_split(*source, delim, range); + output = cudf::io::text::multibyte_split(*source, delim, options); }); state.add_buffer_size(mem_stats_logger.peak_memory_usage(), "pmu", "Peak Memory Usage"); @@ -152,13 +204,16 @@ static void bench_multibyte_split(nvbench::state& state) state.add_buffer_size(range_size, "efs", "Encoded file size"); } -NVBENCH_BENCH(bench_multibyte_split) +using source_type_list = nvbench::enum_type_list; + +NVBENCH_BENCH_TYPES(bench_multibyte_split, NVBENCH_TYPE_AXES(source_type_list)) .set_name("multibyte_split") - .add_int64_axis("source_type", - {static_cast(data_chunk_source_type::device), - static_cast(data_chunk_source_type::file), - static_cast(data_chunk_source_type::host), - static_cast(data_chunk_source_type::host_pinned)}) + .add_int64_axis("strip_delimiters", {0, 1}) .add_int64_axis("delim_size", {1, 4, 7}) .add_int64_axis("delim_percent", {1, 25}) .add_int64_power_of_two_axis("size_approx", {15, 30}) diff --git a/cpp/benchmarks/iterator/iterator.cu b/cpp/benchmarks/iterator/iterator.cu index c121d070ca0..73060200d00 100644 --- a/cpp/benchmarks/iterator/iterator.cu +++ b/cpp/benchmarks/iterator/iterator.cu @@ -56,7 +56,7 @@ inline auto reduce_by_cub(OutputIterator result, InputIterator d_in, int num_ite nullptr, temp_storage_bytes, d_in, 
result, num_items, cudf::DeviceSum{}, init); // Allocate temporary storage - rmm::device_buffer d_temp_storage(temp_storage_bytes, cudf::default_stream_value); + rmm::device_buffer d_temp_storage(temp_storage_bytes, cudf::get_default_stream()); // Run reduction cub::DeviceReduce::Reduce( @@ -140,7 +140,8 @@ void BM_iterator(benchmark::State& state) cudf::column_view hasnull_F = wrap_hasnull_F; // Initialize dev_result to false - auto dev_result = cudf::detail::make_zeroed_device_uvector_sync(1); + auto dev_result = + cudf::detail::make_zeroed_device_uvector_sync(1, cudf::get_default_stream()); for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 if (cub_or_thrust) { @@ -208,7 +209,8 @@ void BM_pair_iterator(benchmark::State& state) cudf::column_view hasnull_T = wrap_hasnull_T; // Initialize dev_result to false - auto dev_result = cudf::detail::make_zeroed_device_uvector_sync>(1); + auto dev_result = cudf::detail::make_zeroed_device_uvector_sync>( + 1, cudf::get_default_stream()); for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 if (cub_or_thrust) { diff --git a/cpp/benchmarks/join/conditional_join.cu b/cpp/benchmarks/join/conditional_join.cu index 3c4208bf0fc..547367ffb69 100644 --- a/cpp/benchmarks/join/conditional_join.cu +++ b/cpp/benchmarks/join/conditional_join.cu @@ -70,7 +70,7 @@ CONDITIONAL_LEFT_JOIN_BENCHMARK_DEFINE(conditional_left_join_64bit_nulls, int64_ cudf::table_view const& right, \ cudf::ast::operation binary_pred, \ cudf::null_equality compare_nulls) { \ - return cudf::conditional_inner_join(left, right, binary_pred); \ + return cudf::conditional_full_join(left, right, binary_pred); \ }; \ constexpr bool is_conditional = true; \ BM_join(st, join); \ diff --git a/cpp/benchmarks/join/generate_input_tables.cuh b/cpp/benchmarks/join/generate_input_tables.cuh index 31cef581f22..c606cd8b4c0 100644 --- a/cpp/benchmarks/join/generate_input_tables.cuh +++ 
b/cpp/benchmarks/join/generate_input_tables.cuh @@ -154,7 +154,7 @@ void generate_input_tables(key_type* const build_tbl, const int num_states = num_sms * std::max(num_blocks_init_build_tbl, num_blocks_init_probe_tbl) * block_size; - rmm::device_uvector devStates(num_states, cudf::default_stream_value); + rmm::device_uvector devStates(num_states, cudf::get_default_stream()); init_curand<<<(num_states - 1) / block_size + 1, block_size>>>(devStates.data(), num_states); diff --git a/cpp/benchmarks/join/join_common.hpp b/cpp/benchmarks/join/join_common.hpp index 1a87c2d1158..ad288edb169 100644 --- a/cpp/benchmarks/join/join_common.hpp +++ b/cpp/benchmarks/join/join_common.hpp @@ -86,7 +86,9 @@ static void BM_join(state_type& state, Join JoinFunc) // roughly 75% nulls auto validity = thrust::make_transform_iterator(thrust::make_counting_iterator(0), null75_generator{}); - return cudf::detail::valid_if(validity, validity + size, thrust::identity{}).first; + return cudf::detail::valid_if( + validity, validity + size, thrust::identity{}, cudf::get_default_stream()) + .first; }; std::unique_ptr build_key_column0 = [&]() { @@ -142,7 +144,7 @@ static void BM_join(state_type& state, Join JoinFunc) // Benchmark the inner join operation if constexpr (std::is_same_v and (not is_conditional)) { for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::default_stream_value); + cuda_event_timer raii(state, true, cudf::get_default_stream()); auto result = JoinFunc(probe_table.select(columns_to_join), build_table.select(columns_to_join), @@ -168,7 +170,7 @@ static void BM_join(state_type& state, Join JoinFunc) cudf::ast::operation(cudf::ast::ast_operator::EQUAL, col_ref_left_0, col_ref_right_0); for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::default_stream_value); + cuda_event_timer raii(state, true, cudf::get_default_stream()); auto result = JoinFunc(probe_table, build_table, left_zero_eq_right_zero, cudf::null_equality::UNEQUAL); diff --git 
a/cpp/benchmarks/lists/copying/scatter_lists.cu b/cpp/benchmarks/lists/copying/scatter_lists.cu index 823693721a0..02ad97fee11 100644 --- a/cpp/benchmarks/lists/copying/scatter_lists.cu +++ b/cpp/benchmarks/lists/copying/scatter_lists.cu @@ -40,7 +40,7 @@ class ScatterLists : public cudf::benchmark { template void BM_lists_scatter(::benchmark::State& state) { - auto stream = cudf::default_stream_value; + auto stream = cudf::get_default_stream(); auto mr = rmm::mr::get_current_device_resource(); const size_type base_size{(size_type)state.range(0)}; @@ -108,7 +108,7 @@ void BM_lists_scatter(::benchmark::State& state) for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 - scatter(table_view{{*source}}, *scatter_map, table_view{{*target}}, false, mr); + scatter(table_view{{*source}}, *scatter_map, table_view{{*target}}, mr); } state.SetBytesProcessed(static_cast(state.iterations()) * state.range(0) * 2 * diff --git a/cpp/benchmarks/quantiles/quantiles.cpp b/cpp/benchmarks/quantiles/quantiles.cpp index 7c0a88584f8..599cff2bcda 100644 --- a/cpp/benchmarks/quantiles/quantiles.cpp +++ b/cpp/benchmarks/quantiles/quantiles.cpp @@ -50,7 +50,7 @@ static void BM_quantiles(benchmark::State& state, bool nulls) thrust::seq, q.begin(), q.end(), [n_quantiles](auto i) { return i * (1.0f / n_quantiles); }); for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::default_stream_value); + cuda_event_timer raii(state, true, cudf::get_default_stream()); auto result = cudf::quantiles(input, q); // auto result = (stable) ? 
cudf::stable_sorted_order(input) : cudf::sorted_order(input); diff --git a/cpp/benchmarks/reduction/anyall.cpp b/cpp/benchmarks/reduction/anyall.cpp index 80a85b0f217..755fa1ca2ad 100644 --- a/cpp/benchmarks/reduction/anyall.cpp +++ b/cpp/benchmarks/reduction/anyall.cpp @@ -41,7 +41,7 @@ void BM_reduction_anyall(benchmark::State& state, for (auto _ : state) { cuda_event_timer timer(state, true); - auto result = cudf::reduce(*values, agg, output_dtype); + auto result = cudf::reduce(*values, *agg, output_dtype); } } diff --git a/cpp/benchmarks/reduction/dictionary.cpp b/cpp/benchmarks/reduction/dictionary.cpp index 219564d6b5c..8f2f0be33ca 100644 --- a/cpp/benchmarks/reduction/dictionary.cpp +++ b/cpp/benchmarks/reduction/dictionary.cpp @@ -51,7 +51,7 @@ void BM_reduction_dictionary(benchmark::State& state, for (auto _ : state) { cuda_event_timer timer(state, true); - auto result = cudf::reduce(*values, agg, output_dtype); + auto result = cudf::reduce(*values, *agg, output_dtype); } } diff --git a/cpp/benchmarks/reduction/rank.cpp b/cpp/benchmarks/reduction/rank.cpp index c20f728e018..5022e029d97 100644 --- a/cpp/benchmarks/reduction/rank.cpp +++ b/cpp/benchmarks/reduction/rank.cpp @@ -61,4 +61,4 @@ NVBENCH_BENCH_TYPES(nvbench_reduction_scan, NVBENCH_TYPE_AXES(data_type)) 1000000, // 1M 10000000, // 10M 100000000, // 100M - }); \ No newline at end of file + }); diff --git a/cpp/benchmarks/reduction/reduce.cpp b/cpp/benchmarks/reduction/reduce.cpp index 4e354352c11..4dfa7f0bbdc 100644 --- a/cpp/benchmarks/reduction/reduce.cpp +++ b/cpp/benchmarks/reduction/reduce.cpp @@ -45,7 +45,7 @@ void BM_reduction(benchmark::State& state, std::unique_ptr(), cudf::scan_type::INCLUSIVE); + *column, *cudf::make_min_aggregation(), cudf::scan_type::INCLUSIVE); } } diff --git a/cpp/benchmarks/reduction/segment_reduce.cu b/cpp/benchmarks/reduction/segment_reduce.cu index d2c15c87c2b..e063adb25f9 100644 --- a/cpp/benchmarks/reduction/segment_reduce.cu +++ 
b/cpp/benchmarks/reduction/segment_reduce.cu @@ -109,7 +109,7 @@ void BM_Simple_Segmented_Reduction(nvbench::state& state, auto const input_view = input->view(); auto const offset_span = cudf::device_span{offsets}; - state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::default_stream_value.value())); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec( nvbench::exec_tag::sync, [input_view, output_type, offset_span, &agg](nvbench::launch& launch) { segmented_reduce(input_view, offset_span, *agg, output_type, cudf::null_policy::INCLUDE); diff --git a/cpp/benchmarks/sort/rank.cpp b/cpp/benchmarks/sort/rank.cpp index 66277443800..2c26f4fa15d 100644 --- a/cpp/benchmarks/sort/rank.cpp +++ b/cpp/benchmarks/sort/rank.cpp @@ -37,7 +37,7 @@ static void BM_rank(benchmark::State& state, bool nulls) auto keys = create_random_column(cudf::type_to_id(), row_count{n_rows}, profile); for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::default_stream_value); + cuda_event_timer raii(state, true, cudf::get_default_stream()); auto result = cudf::rank(keys->view(), cudf::rank_method::FIRST, diff --git a/cpp/benchmarks/sort/segmented_sort.cpp b/cpp/benchmarks/sort/segmented_sort.cpp new file mode 100644 index 00000000000..7162269853c --- /dev/null +++ b/cpp/benchmarks/sort/segmented_sort.cpp @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include +#include +#include + +#include + +void nvbench_segmented_sort(nvbench::state& state) +{ + cudf::rmm_pool_raii pool_raii; + + auto const dtype = cudf::type_to_id(); + auto const size_bytes = static_cast(state.get_int64("size_bytes")); + auto const null_freq = state.get_float64("null_frequency"); + auto const row_width = static_cast(state.get_int64("row_width")); + + data_profile const table_profile = + data_profile_builder().null_probability(null_freq).distribution( + dtype, distribution_id::UNIFORM, 0, 10); + auto const input = + create_random_table({cudf::type_id::INT32}, table_size_bytes{size_bytes}, table_profile); + auto const rows = input->num_rows(); + + auto const segments = cudf::sequence((rows / row_width) + 1, + cudf::numeric_scalar(0), + cudf::numeric_scalar(row_width)); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + state.add_element_count(size_bytes, "bytes"); + state.add_global_memory_reads(rows * row_width); + state.add_global_memory_writes(rows); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = cudf::segmented_sorted_order(*input, *segments); + }); +} + +NVBENCH_BENCH(nvbench_segmented_sort) + .set_name("segmented_sort") + .add_int64_power_of_two_axis("size_bytes", {16, 18, 20, 22, 24, 28}) + .add_float64_axis("null_frequency", {0, 0.1}) + .add_int64_axis("row_width", {16, 128, 1024}); diff --git a/cpp/benchmarks/sort/sort.cpp b/cpp/benchmarks/sort/sort.cpp index 13502ce0959..304bac06632 100644 --- a/cpp/benchmarks/sort/sort.cpp +++ b/cpp/benchmarks/sort/sort.cpp @@ -42,7 +42,7 @@ static void BM_sort(benchmark::State& state, bool nulls) cudf::table_view input{*input_table}; for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::default_stream_value); + cuda_event_timer raii(state, true, cudf::get_default_stream()); auto result = (stable) ? 
cudf::stable_sorted_order(input) : cudf::sorted_order(input); } diff --git a/cpp/benchmarks/sort/sort_strings.cpp b/cpp/benchmarks/sort/sort_strings.cpp index 701b392f80b..572c05d69cb 100644 --- a/cpp/benchmarks/sort/sort_strings.cpp +++ b/cpp/benchmarks/sort/sort_strings.cpp @@ -32,7 +32,7 @@ static void BM_sort(benchmark::State& state) auto const table = create_random_table({cudf::type_id::STRING}, row_count{n_rows}); for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::default_stream_value); + cuda_event_timer raii(state, true, cudf::get_default_stream()); cudf::sort(table->view()); } } diff --git a/cpp/benchmarks/stream_compaction/distinct.cpp b/cpp/benchmarks/stream_compaction/distinct.cpp index ad837bc4caa..512554ff1bc 100644 --- a/cpp/benchmarks/stream_compaction/distinct.cpp +++ b/cpp/benchmarks/stream_compaction/distinct.cpp @@ -18,8 +18,8 @@ #include #include -#include #include +#include #include #include @@ -41,14 +41,13 @@ void nvbench_distinct(nvbench::state& state, nvbench::type_list) auto input_column = source_column->view(); auto input_table = cudf::table_view({input_column, input_column, input_column, input_column}); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - rmm::cuda_stream_view stream_view{launch.get_stream()}; - auto result = cudf::detail::distinct(input_table, - {0}, - cudf::duplicate_keep_option::KEEP_ANY, - cudf::null_equality::EQUAL, - cudf::nan_equality::ALL_EQUAL, - stream_view); + auto result = cudf::distinct(input_table, + {0}, + cudf::duplicate_keep_option::KEEP_ANY, + cudf::null_equality::EQUAL, + cudf::nan_equality::ALL_EQUAL); }); } @@ -84,14 +83,13 @@ void nvbench_distinct_list(nvbench::state& state, nvbench::type_list) auto const table = create_random_table( {dtype}, table_size_bytes{static_cast(size)}, data_profile{builder}, 0); + 
state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - rmm::cuda_stream_view stream_view{launch.get_stream()}; - auto result = cudf::detail::distinct(*table, - {0}, - cudf::duplicate_keep_option::KEEP_ANY, - cudf::null_equality::EQUAL, - cudf::nan_equality::ALL_EQUAL, - stream_view); + auto result = cudf::distinct(*table, + {0}, + cudf::duplicate_keep_option::KEEP_ANY, + cudf::null_equality::EQUAL, + cudf::nan_equality::ALL_EQUAL); }); } diff --git a/cpp/benchmarks/stream_compaction/unique.cpp b/cpp/benchmarks/stream_compaction/unique.cpp index 6b586581408..652d55fb8ce 100644 --- a/cpp/benchmarks/stream_compaction/unique.cpp +++ b/cpp/benchmarks/stream_compaction/unique.cpp @@ -18,7 +18,7 @@ #include #include -#include +#include #include #include @@ -62,10 +62,9 @@ void nvbench_unique(nvbench::state& state, nvbench::type_listview(); auto input_table = cudf::table_view({input_column, input_column, input_column, input_column}); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - rmm::cuda_stream_view stream_view{launch.get_stream()}; - auto result = - cudf::detail::unique(input_table, {0}, Keep, cudf::null_equality::EQUAL, stream_view); + auto result = cudf::unique(input_table, {0}, Keep, cudf::null_equality::EQUAL); }); } diff --git a/cpp/benchmarks/string/case.cpp b/cpp/benchmarks/string/case.cpp index 1c43fa0f077..72b6fcaff0e 100644 --- a/cpp/benchmarks/string/case.cpp +++ b/cpp/benchmarks/string/case.cpp @@ -32,7 +32,7 @@ static void BM_case(benchmark::State& state) cudf::strings_column_view input(column->view()); for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::default_stream_value); + cuda_event_timer raii(state, true, cudf::get_default_stream()); cudf::strings::to_lower(input); } diff --git a/cpp/benchmarks/string/combine.cpp 
b/cpp/benchmarks/string/combine.cpp index a8d0224916b..46bcda9ae92 100644 --- a/cpp/benchmarks/string/combine.cpp +++ b/cpp/benchmarks/string/combine.cpp @@ -41,7 +41,7 @@ static void BM_combine(benchmark::State& state) cudf::string_scalar separator("+"); for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::default_stream_value); + cuda_event_timer raii(state, true, cudf::get_default_stream()); cudf::strings::concatenate(table->view(), separator); } diff --git a/cpp/benchmarks/string/contains.cpp b/cpp/benchmarks/string/contains.cpp index fd04d599e5e..f7f394ea048 100644 --- a/cpp/benchmarks/string/contains.cpp +++ b/cpp/benchmarks/string/contains.cpp @@ -85,7 +85,7 @@ static void BM_contains(benchmark::State& state, contains_type ct) auto pattern = patterns[pattern_index]; for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::default_stream_value); + cuda_event_timer raii(state, true, cudf::get_default_stream()); switch (ct) { case contains_type::contains: // contains_re and matches_re use the same main logic cudf::strings::contains_re(input, pattern); diff --git a/cpp/benchmarks/string/copy.cu b/cpp/benchmarks/string/copy.cu index 318d2d524a3..669b12aa56b 100644 --- a/cpp/benchmarks/string/copy.cu +++ b/cpp/benchmarks/string/copy.cu @@ -58,7 +58,7 @@ static void BM_copy(benchmark::State& state, copy_type ct) thrust::default_random_engine()); for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::default_stream_value); + cuda_event_timer raii(state, true, cudf::get_default_stream()); switch (ct) { case gather: cudf::gather(source->view(), index_map); break; case scatter: cudf::scatter(source->view(), index_map, target->view()); break; diff --git a/cpp/benchmarks/string/factory.cu b/cpp/benchmarks/string/factory.cu index 0e937b91e98..b75de16e901 100644 --- a/cpp/benchmarks/string/factory.cu +++ b/cpp/benchmarks/string/factory.cu @@ -55,7 +55,7 @@ static void BM_factory(benchmark::State& state) cudf::type_id::STRING, 
distribution_id::NORMAL, 0, max_str_length); auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile); auto d_column = cudf::column_device_view::create(column->view()); - rmm::device_uvector pairs(d_column->size(), cudf::default_stream_value); + rmm::device_uvector pairs(d_column->size(), cudf::get_default_stream()); thrust::transform(thrust::device, d_column->pair_begin(), d_column->pair_end(), @@ -63,7 +63,7 @@ static void BM_factory(benchmark::State& state) string_view_to_pair{}); for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::default_stream_value); + cuda_event_timer raii(state, true, cudf::get_default_stream()); cudf::make_strings_column(pairs); } diff --git a/cpp/benchmarks/string/filter.cpp b/cpp/benchmarks/string/filter.cpp index 4001fef5da6..cb805539651 100644 --- a/cpp/benchmarks/string/filter.cpp +++ b/cpp/benchmarks/string/filter.cpp @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "string_bench_args.hpp" + #include #include #include @@ -27,7 +29,6 @@ #include #include -#include #include enum FilterAPI { filter, filter_chars, strip }; @@ -49,7 +50,7 @@ static void BM_filter_chars(benchmark::State& state, FilterAPI api) {cudf::char_utf8{'a'}, cudf::char_utf8{'c'}}}; for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::default_stream_value); + cuda_event_timer raii(state, true, cudf::get_default_stream()); switch (api) { case filter: cudf::strings::filter_characters_of_type(input, types); break; case filter_chars: cudf::strings::filter_characters(input, filter_table); break; @@ -62,21 +63,14 @@ static void BM_filter_chars(benchmark::State& state, FilterAPI api) static void generate_bench_args(benchmark::internal::Benchmark* b) { - int const min_rows = 1 << 12; - int const max_rows = 1 << 24; - int const row_mult = 8; - int const min_rowlen = 1 << 5; - int const max_rowlen = 1 << 13; - int const len_mult = 4; - for (int row_count = min_rows; row_count <= max_rows; row_count 
*= row_mult) { - for (int rowlen = min_rowlen; rowlen <= max_rowlen; rowlen *= len_mult) { - // avoid generating combinations that exceed the cudf column limit - size_t total_chars = static_cast(row_count) * rowlen; - if (total_chars < static_cast(std::numeric_limits::max())) { - b->Args({row_count, rowlen}); - } - } - } + int const min_rows = 1 << 12; + int const max_rows = 1 << 24; + int const row_multiplier = 8; + int const min_length = 1 << 5; + int const max_length = 1 << 13; + int const length_multiplier = 2; + generate_string_bench_args( + b, min_rows, max_rows, row_multiplier, min_length, max_length, length_multiplier); } #define STRINGS_BENCHMARK_DEFINE(name) \ diff --git a/cpp/benchmarks/string/find.cpp b/cpp/benchmarks/string/find.cpp index 62c76d18e1a..4ff3b59a491 100644 --- a/cpp/benchmarks/string/find.cpp +++ b/cpp/benchmarks/string/find.cpp @@ -45,7 +45,7 @@ static void BM_find_scalar(benchmark::State& state, FindAPI find_api) cudf::test::strings_column_wrapper targets({"+", "-"}); for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::default_stream_value); + cuda_event_timer raii(state, true, cudf::get_default_stream()); switch (find_api) { case find: cudf::strings::find(input, target); break; case find_multi: diff --git a/cpp/benchmarks/string/json.cu b/cpp/benchmarks/string/json.cu index 5ee56c3cdae..d7c0066eb33 100644 --- a/cpp/benchmarks/string/json.cu +++ b/cpp/benchmarks/string/json.cu @@ -177,7 +177,8 @@ auto build_json_string_column(int desired_bytes, int num_rows) auto d_store_order = cudf::column_device_view::create(float_2bool_columns->get_column(2)); json_benchmark_row_builder jb{ desired_bytes, num_rows, {*d_books, *d_bicycles}, *d_book_pct, *d_misc_order, *d_store_order}; - auto children = cudf::strings::detail::make_strings_children(jb, num_rows); + auto children = cudf::strings::detail::make_strings_children( + jb, num_rows, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); return 
cudf::make_strings_column( num_rows, std::move(children.first), std::move(children.second), 0, {}); } diff --git a/cpp/benchmarks/string/like.cpp b/cpp/benchmarks/string/like.cpp index f6649b186a4..de7382f5a75 100644 --- a/cpp/benchmarks/string/like.cpp +++ b/cpp/benchmarks/string/like.cpp @@ -81,7 +81,7 @@ static void bench_like(nvbench::state& state) // This pattern forces reading the entire target string (when matched expected) auto pattern = std::string("% 5W4_"); // regex equivalent: ".* 5W4." - state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::default_stream_value.value())); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); // gather some throughput statistics as well auto chars_size = input.chars_size(); state.add_element_count(chars_size, "chars_size"); // number of bytes; diff --git a/cpp/benchmarks/string/repeat_strings.cpp b/cpp/benchmarks/string/repeat_strings.cpp index db02fec13c2..1844e93bc53 100644 --- a/cpp/benchmarks/string/repeat_strings.cpp +++ b/cpp/benchmarks/string/repeat_strings.cpp @@ -55,7 +55,7 @@ static void BM_repeat_strings_scalar_times(benchmark::State& state) auto const strings_col = cudf::strings_column_view(table->view().column(0)); for ([[maybe_unused]] auto _ : state) { - [[maybe_unused]] cuda_event_timer raii(state, true, cudf::default_stream_value); + [[maybe_unused]] cuda_event_timer raii(state, true, cudf::get_default_stream()); cudf::strings::repeat_strings(strings_col, default_repeat_times); } @@ -71,7 +71,7 @@ static void BM_repeat_strings_column_times(benchmark::State& state) auto const repeat_times_col = table->view().column(1); for ([[maybe_unused]] auto _ : state) { - [[maybe_unused]] cuda_event_timer raii(state, true, cudf::default_stream_value); + [[maybe_unused]] cuda_event_timer raii(state, true, cudf::get_default_stream()); cudf::strings::repeat_strings(strings_col, repeat_times_col); } @@ -88,7 +88,7 @@ static void 
BM_compute_output_strings_sizes(benchmark::State& state) auto const repeat_times_col = table->view().column(1); for ([[maybe_unused]] auto _ : state) { - [[maybe_unused]] cuda_event_timer raii(state, true, cudf::default_stream_value); + [[maybe_unused]] cuda_event_timer raii(state, true, cudf::get_default_stream()); cudf::strings::repeat_strings_output_sizes(strings_col, repeat_times_col); } @@ -107,7 +107,7 @@ static void BM_repeat_strings_column_times_precomputed_sizes(benchmark::State& s cudf::strings::repeat_strings_output_sizes(strings_col, repeat_times_col); for ([[maybe_unused]] auto _ : state) { - [[maybe_unused]] cuda_event_timer raii(state, true, cudf::default_stream_value); + [[maybe_unused]] cuda_event_timer raii(state, true, cudf::get_default_stream()); cudf::strings::repeat_strings(strings_col, repeat_times_col, *sizes); } diff --git a/cpp/benchmarks/string/replace.cpp b/cpp/benchmarks/string/replace.cpp index e25bf679dbc..b25af14ec2a 100644 --- a/cpp/benchmarks/string/replace.cpp +++ b/cpp/benchmarks/string/replace.cpp @@ -48,7 +48,7 @@ static void BM_replace(benchmark::State& state, replace_type rt) cudf::test::strings_column_wrapper repls({"", ""}); for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::default_stream_value); + cuda_event_timer raii(state, true, cudf::get_default_stream()); switch (rt) { case scalar: cudf::strings::replace(input, target, repl); break; case slice: cudf::strings::replace_slice(input, repl, 1, 10); break; diff --git a/cpp/benchmarks/string/replace_re.cpp b/cpp/benchmarks/string/replace_re.cpp index f8b03daa338..7e9d6036750 100644 --- a/cpp/benchmarks/string/replace_re.cpp +++ b/cpp/benchmarks/string/replace_re.cpp @@ -42,7 +42,7 @@ static void BM_replace(benchmark::State& state, replace_type rt) cudf::test::strings_column_wrapper repls({"#", ""}); for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::default_stream_value); + cuda_event_timer raii(state, true, cudf::get_default_stream()); switch 
(rt) { case replace_type::replace_re: // contains_re and matches_re use the same main logic cudf::strings::replace_re(input, "\\d+"); diff --git a/cpp/benchmarks/string/split.cpp b/cpp/benchmarks/string/split.cpp index 3a7a96b025d..0f005c462cc 100644 --- a/cpp/benchmarks/string/split.cpp +++ b/cpp/benchmarks/string/split.cpp @@ -43,7 +43,7 @@ static void BM_split(benchmark::State& state, split_type rt) cudf::string_scalar target("+"); for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::default_stream_value); + cuda_event_timer raii(state, true, cudf::get_default_stream()); switch (rt) { case split: cudf::strings::split(input, target); break; case split_ws: cudf::strings::split(input); break; diff --git a/cpp/benchmarks/string/substring.cpp b/cpp/benchmarks/string/substring.cpp index 7ae5ad6f581..1201b240013 100644 --- a/cpp/benchmarks/string/substring.cpp +++ b/cpp/benchmarks/string/substring.cpp @@ -52,7 +52,7 @@ static void BM_substring(benchmark::State& state, substring_type rt) cudf::test::strings_column_wrapper delimiters(delim_itr, delim_itr + n_rows); for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::default_stream_value); + cuda_event_timer raii(state, true, cudf::get_default_stream()); switch (rt) { case position: cudf::strings::slice_strings(input, 1, max_str_length / 2); break; case multi_position: cudf::strings::slice_strings(input, starts, stops); break; diff --git a/cpp/benchmarks/string/translate.cpp b/cpp/benchmarks/string/translate.cpp index 359a3756ef2..efc2fa3154b 100644 --- a/cpp/benchmarks/string/translate.cpp +++ b/cpp/benchmarks/string/translate.cpp @@ -53,7 +53,7 @@ static void BM_translate(benchmark::State& state, int entry_count) }); for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::default_stream_value); + cuda_event_timer raii(state, true, cudf::get_default_stream()); cudf::strings::translate(input, entries); } diff --git a/cpp/benchmarks/string/url_decode.cu 
b/cpp/benchmarks/string/url_decode.cu index a884bc8b587..44681c924d0 100644 --- a/cpp/benchmarks/string/url_decode.cu +++ b/cpp/benchmarks/string/url_decode.cu @@ -91,7 +91,7 @@ void BM_url_decode(benchmark::State& state, int esc_seq_pct) auto strings_view = cudf::strings_column_view(column->view()); for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::default_stream_value); + cuda_event_timer raii(state, true, cudf::get_default_stream()); auto result = cudf::strings::url_decode(strings_view); } diff --git a/cpp/benchmarks/synchronization/synchronization.hpp b/cpp/benchmarks/synchronization/synchronization.hpp index e5882ff1c16..e56d881d459 100644 --- a/cpp/benchmarks/synchronization/synchronization.hpp +++ b/cpp/benchmarks/synchronization/synchronization.hpp @@ -35,7 +35,7 @@ for (auto _ : state){ // default stream, could be another stream - rmm::cuda_stream_view stream{cudf::default_stream_value}; + rmm::cuda_stream_view stream{cudf::get_default_stream()}; // Create (Construct) an object of this class. You HAVE to pass in the // benchmark::State object you are using. It measures the time from its @@ -58,8 +58,7 @@ */ -#ifndef CUDF_BENCH_SYNCHRONIZATION_H -#define CUDF_BENCH_SYNCHRONIZATION_H +#pragma once // Google Benchmark library #include @@ -85,7 +84,7 @@ class cuda_event_timer { */ cuda_event_timer(benchmark::State& state, bool flush_l2_cache, - rmm::cuda_stream_view stream = cudf::default_stream_value); + rmm::cuda_stream_view stream = cudf::get_default_stream()); // The user must provide a benchmark::State object to set // the timer so we disable the default c'tor. 
@@ -102,5 +101,3 @@ class cuda_event_timer { rmm::cuda_stream_view stream; benchmark::State* p_state; }; - -#endif diff --git a/cpp/benchmarks/text/normalize.cpp b/cpp/benchmarks/text/normalize.cpp index e5a0a1a95f4..91d873224d3 100644 --- a/cpp/benchmarks/text/normalize.cpp +++ b/cpp/benchmarks/text/normalize.cpp @@ -37,7 +37,7 @@ static void BM_normalize(benchmark::State& state, bool to_lower) cudf::strings_column_view input(column->view()); for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::default_stream_value); + cuda_event_timer raii(state, true, cudf::get_default_stream()); nvtext::normalize_characters(input, to_lower); } diff --git a/cpp/benchmarks/text/normalize_spaces.cpp b/cpp/benchmarks/text/normalize_spaces.cpp index 414cd119575..85eaf54d4ea 100644 --- a/cpp/benchmarks/text/normalize_spaces.cpp +++ b/cpp/benchmarks/text/normalize_spaces.cpp @@ -38,7 +38,7 @@ static void BM_normalize(benchmark::State& state) cudf::strings_column_view input(column->view()); for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::default_stream_value); + cuda_event_timer raii(state, true, cudf::get_default_stream()); nvtext::normalize_spaces(input); } diff --git a/cpp/benchmarks/text/tokenize.cpp b/cpp/benchmarks/text/tokenize.cpp index 4d8df6ae37c..4695a62f1c0 100644 --- a/cpp/benchmarks/text/tokenize.cpp +++ b/cpp/benchmarks/text/tokenize.cpp @@ -44,7 +44,7 @@ static void BM_tokenize(benchmark::State& state, tokenize_type tt) cudf::test::strings_column_wrapper delimiters({" ", "+", "-"}); for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::default_stream_value); + cuda_event_timer raii(state, true, cudf::get_default_stream()); switch (tt) { case tokenize_type::single: // single whitespace delimiter diff --git a/cpp/benchmarks/type_dispatcher/type_dispatcher.cu b/cpp/benchmarks/type_dispatcher/type_dispatcher.cu index b1d2498f0e6..34b1e0254dd 100644 --- a/cpp/benchmarks/type_dispatcher/type_dispatcher.cu +++ 
b/cpp/benchmarks/type_dispatcher/type_dispatcher.cu @@ -188,10 +188,10 @@ void type_dispatcher_benchmark(::benchmark::State& state) std::vector h_vec(n_cols); std::vector h_vec_p(n_cols); std::transform(h_vec.begin(), h_vec.end(), h_vec_p.begin(), [source_size](auto& col) { - col.resize(source_size * sizeof(TypeParam), cudf::default_stream_value); + col.resize(source_size * sizeof(TypeParam), cudf::get_default_stream()); return static_cast(col.data()); }); - rmm::device_uvector d_vec(n_cols, cudf::default_stream_value); + rmm::device_uvector d_vec(n_cols, cudf::get_default_stream()); if (dispatching_type == NO_DISPATCHING) { CUDF_CUDA_TRY(cudaMemcpy( diff --git a/cpp/cmake/Modules/ConfigureCUDA.cmake b/cpp/cmake/Modules/ConfigureCUDA.cmake index 198435e739d..f79e4c37228 100644 --- a/cpp/cmake/Modules/ConfigureCUDA.cmake +++ b/cpp/cmake/Modules/ConfigureCUDA.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. 
You may obtain a copy of the License at @@ -19,10 +19,14 @@ endif() list(APPEND CUDF_CUDA_FLAGS --expt-extended-lambda --expt-relaxed-constexpr) # set warnings as errors -list(APPEND CUDF_CUDA_FLAGS -Werror=cross-execution-space-call) +if(CUDA_WARNINGS_AS_ERRORS) + list(APPEND CUDF_CUDA_FLAGS -Werror=all-warnings) +else() + list(APPEND CUDF_CUDA_FLAGS -Werror=cross-execution-space-call) +endif() list(APPEND CUDF_CUDA_FLAGS -Xcompiler=-Wall,-Werror,-Wno-error=deprecated-declarations) -if(DISABLE_DEPRECATION_WARNING) +if(DISABLE_DEPRECATION_WARNINGS) list(APPEND CUDF_CXX_FLAGS -Wno-deprecated-declarations) list(APPEND CUDF_CUDA_FLAGS -Xcompiler=-Wno-deprecated-declarations) endif() diff --git a/cpp/cmake/Modules/JitifyPreprocessKernels.cmake b/cpp/cmake/Modules/JitifyPreprocessKernels.cmake index 8ce98c6d582..df285bdea55 100644 --- a/cpp/cmake/Modules/JitifyPreprocessKernels.cmake +++ b/cpp/cmake/Modules/JitifyPreprocessKernels.cmake @@ -39,7 +39,8 @@ function(jit_preprocess_files) VERBATIM COMMAND ${CMAKE_COMMAND} -E make_directory "${jit_output_directory}" COMMAND - jitify_preprocess ${ARG_FILE} -o + "${CMAKE_COMMAND}" -E env LD_LIBRARY_PATH=${CUDAToolkit_LIBRARY_DIR} + $ ${ARG_FILE} -o ${CUDF_GENERATED_INCLUDE_DIR}/include/jit_preprocessed_files -i -m -std=c++17 -remove-unused-globals -D_FILE_OFFSET_BITS=64 -D__CUDACC_RTC__ -I${CUDF_SOURCE_DIR}/include -I${CUDF_SOURCE_DIR}/src ${libcudacxx_includes} -I${CUDAToolkit_INCLUDE_DIRS} diff --git a/cpp/cmake/config.json b/cpp/cmake/config.json index 4f287499503..f7d7b001856 100644 --- a/cpp/cmake/config.json +++ b/cpp/cmake/config.json @@ -9,7 +9,7 @@ "VERSION": "?", "GIT_SHALLOW": "?", "OPTIONS": "*", - "FIND_PACKAGE_ARGUMENTS": "*" + "FIND_PACKAGE_ARGUMENTS": "*" } }, "ConfigureTest": { diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index 9fa5b9d1658..94dcdcb5bc2 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -20,43 +20,98 @@ # 
cmake-lint: disable=R0912,R0913,R0915 +include_guard(GLOBAL) + +# Generate a FindArrow module for the case where we need to search for arrow within a pip install +# pyarrow. +function(find_libarrow_in_python_wheel PYARROW_VERSION) + string(REPLACE "." "" PYARROW_SO_VER "${PYARROW_VERSION}") + set(PYARROW_LIB libarrow.so.${PYARROW_SO_VER}) + + find_package(Python REQUIRED) + execute_process( + COMMAND "${Python_EXECUTABLE}" -c "import pyarrow; print(pyarrow.get_library_dirs()[0])" + OUTPUT_VARIABLE CUDF_PYARROW_WHEEL_DIR + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + list(APPEND CMAKE_PREFIX_PATH "${CUDF_PYARROW_WHEEL_DIR}") + rapids_find_generate_module( + Arrow NO_CONFIG + VERSION "${PYARROW_VERSION}" + LIBRARY_NAMES "${PYARROW_LIB}" + BUILD_EXPORT_SET cudf-exports + INSTALL_EXPORT_SET cudf-exports + HEADER_NAMES arrow/python/arrow_to_pandas.h + ) + + find_package(Arrow ${PYARROW_VERSION} MODULE REQUIRED GLOBAL) + add_library(arrow_shared ALIAS Arrow::Arrow) + + # When using the libarrow inside a wheel we must build libcudf with the old ABI because pyarrow's + # `libarrow.so` is compiled for manylinux2014 (centos7 toolchain) which uses the old ABI. Note + # that these flags will often be redundant because we build wheels in manylinux containers that + # actually have the old libc++ anyway, but setting them explicitly ensures correct and consistent + # behavior in all other cases such as aarch builds on newer manylinux or testing builds in newer + # containers. Note that tests will not build successfully without also propagating these options + # to builds of GTest. Similarly, benchmarks will not work without updating GBench (and possibly + # NVBench) builds. We are currently ignoring these limitations since we don't anticipate using + # this feature except for building wheels. 
+ target_compile_options( + Arrow::Arrow INTERFACE "$<$:-D_GLIBCXX_USE_CXX11_ABI=0>" + "$<$:-Xcompiler=-D_GLIBCXX_USE_CXX11_ABI=0>" + ) + + rapids_export_package(BUILD Arrow cudf-exports) + rapids_export_package(INSTALL Arrow cudf-exports) + + list(POP_BACK CMAKE_PREFIX_PATH) +endfunction() + # This function finds arrow and sets any additional necessary environment variables. function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENABLE_PYTHON ENABLE_PARQUET ) + if(USE_LIBARROW_FROM_PYARROW) + # Generate a FindArrow.cmake to find pyarrow's libarrow.so + find_libarrow_in_python_wheel(${VERSION}) + set(ARROW_FOUND + TRUE + PARENT_SCOPE + ) + set(ARROW_LIBRARIES + arrow_shared + PARENT_SCOPE + ) + return() + endif() + if(BUILD_STATIC) if(TARGET arrow_static) - list(APPEND ARROW_LIBRARIES arrow_static) set(ARROW_FOUND TRUE PARENT_SCOPE ) set(ARROW_LIBRARIES - ${ARROW_LIBRARIES} + arrow_static PARENT_SCOPE ) return() endif() else() if(TARGET arrow_shared) - list(APPEND ARROW_LIBRARIES arrow_shared) set(ARROW_FOUND TRUE PARENT_SCOPE ) set(ARROW_LIBRARIES - ${ARROW_LIBRARIES} + arrow_shared PARENT_SCOPE ) return() endif() endif() - set(ARROW_BUILD_SHARED ON) - set(ARROW_BUILD_STATIC OFF) - set(CPMAddOrFindPackage CPMFindPackage) - if(NOT ARROW_ARMV8_ARCH) set(ARROW_ARMV8_ARCH "armv8-a") endif() @@ -69,8 +124,11 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB set(ARROW_BUILD_STATIC ON) set(ARROW_BUILD_SHARED OFF) # Turn off CPM using `find_package` so we always download and make sure we get proper static - # library - set(CPM_DOWNLOAD_ALL TRUE) + # library. 
+ set(CPM_DOWNLOAD_Arrow TRUE) + else() + set(ARROW_BUILD_SHARED ON) + set(ARROW_BUILD_STATIC OFF) endif() set(ARROW_PYTHON_OPTIONS "") @@ -91,7 +149,8 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB rapids_cpm_find( Arrow ${VERSION} - GLOBAL_TARGETS arrow_shared parquet_shared arrow_dataset_shared + GLOBAL_TARGETS arrow_shared parquet_shared arrow_dataset_shared arrow_static parquet_static + arrow_dataset_static CPM_ARGS GIT_REPOSITORY https://github.com/apache/arrow.git GIT_TAG apache-arrow-${VERSION} @@ -125,61 +184,65 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB "xsimd_SOURCE AUTO" ) - set(ARROW_FOUND TRUE) - set(ARROW_LIBRARIES "") + set(ARROW_FOUND + TRUE + PARENT_SCOPE + ) - # Arrow_ADDED: set if CPM downloaded Arrow from Github Arrow_DIR: set if CPM found Arrow on the - # system/conda/etc. - if(Arrow_ADDED OR Arrow_DIR) - if(BUILD_STATIC) - list(APPEND ARROW_LIBRARIES arrow_static) - else() - list(APPEND ARROW_LIBRARIES arrow_shared) - endif() + if(BUILD_STATIC) + set(ARROW_LIBRARIES arrow_static) + else() + set(ARROW_LIBRARIES arrow_shared) + endif() - if(Arrow_DIR) - find_package(Arrow REQUIRED QUIET) - if(ENABLE_PARQUET) - if(NOT Parquet_DIR) - # Set this to enable `find_package(Parquet)` - set(Parquet_DIR "${Arrow_DIR}") - endif() - # Set this to enable `find_package(ArrowDataset)` - set(ArrowDataset_DIR "${Arrow_DIR}") - find_package(ArrowDataset REQUIRED QUIET) + # Arrow_DIR: set if CPM found Arrow on the system/conda/etc. + if(Arrow_DIR) + # This extra find_package is necessary because rapids_cpm_find does not propagate all the + # variables from find_package that we might need. This is especially problematic when + # rapids_cpm_find builds from source. + find_package(Arrow REQUIRED QUIET) + if(ENABLE_PARQUET) + # Setting Parquet_DIR is conditional because parquet may be installed independently of arrow. 
+ if(NOT Parquet_DIR) + # Set this to enable `find_package(Parquet)` + set(Parquet_DIR "${Arrow_DIR}") endif() - elseif(Arrow_ADDED) - # Copy these files so we can avoid adding paths in Arrow_BINARY_DIR to - # target_include_directories. That defeats ccache. - file(INSTALL "${Arrow_BINARY_DIR}/src/arrow/util/config.h" - DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/arrow/util" + # Set this to enable `find_package(ArrowDataset)` + set(ArrowDataset_DIR "${Arrow_DIR}") + find_package(ArrowDataset REQUIRED QUIET) + endif() + # Arrow_ADDED: set if CPM downloaded Arrow from Github + elseif(Arrow_ADDED) + # Copy these files so we can avoid adding paths in Arrow_BINARY_DIR to + # target_include_directories. That defeats ccache. + file(INSTALL "${Arrow_BINARY_DIR}/src/arrow/util/config.h" + DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/arrow/util" + ) + if(ENABLE_PARQUET) + file(INSTALL "${Arrow_BINARY_DIR}/src/parquet/parquet_version.h" + DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/parquet" ) - if(ENABLE_PARQUET) - file(INSTALL "${Arrow_BINARY_DIR}/src/parquet/parquet_version.h" - DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/parquet" - ) - endif() - # - # This shouldn't be necessary! - # - # Arrow populates INTERFACE_INCLUDE_DIRECTORIES for the `arrow_static` and `arrow_shared` - # targets in FindArrow, so for static source-builds, we have to do it after-the-fact. - # - # This only works because we know exactly which components we're using. Don't forget to update - # this list if we add more! - # - foreach(ARROW_LIBRARY ${ARROW_LIBRARIES}) - target_include_directories( - ${ARROW_LIBRARY} - INTERFACE "$" - "$" - "$" - "$" - ) - endforeach() endif() + # Arrow populates INTERFACE_INCLUDE_DIRECTORIES for the `arrow_static` and `arrow_shared` + # targets in FindArrow, so for static source-builds, we have to do it after-the-fact. + # + # This only works because we know exactly which components we're using. Don't forget to update + # this list if we add more! 
+ # + foreach(ARROW_LIBRARY ${ARROW_LIBRARIES}) + target_include_directories( + ${ARROW_LIBRARY} + INTERFACE "$" + "$" + "$" + "$" + ) + endforeach() else() - set(ARROW_FOUND FALSE) + set(ARROW_FOUND + FALSE + PARENT_SCOPE + ) message(FATAL_ERROR "CUDF: Arrow library not found or downloaded.") endif() @@ -294,15 +357,10 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB rapids_export_find_package_root(BUILD ArrowDataset [=[${CMAKE_CURRENT_LIST_DIR}]=] cudf-exports) endif() - set(ARROW_FOUND - "${ARROW_FOUND}" - PARENT_SCOPE - ) set(ARROW_LIBRARIES "${ARROW_LIBRARIES}" PARENT_SCOPE ) - endfunction() if(NOT DEFINED CUDF_VERSION_Arrow) diff --git a/cpp/cmake/thirdparty/get_dlpack.cmake b/cpp/cmake/thirdparty/get_dlpack.cmake index 252d50c7af8..65b5f4ff2eb 100644 --- a/cpp/cmake/thirdparty/get_dlpack.cmake +++ b/cpp/cmake/thirdparty/get_dlpack.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -15,6 +15,7 @@ # This function finds dlpack and sets any additional necessary environment variables. 
function(find_and_configure_dlpack VERSION) + include(${rapids-cmake-dir}/find/generate_module.cmake) rapids_find_generate_module(DLPACK HEADER_NAMES dlpack.h) rapids_cpm_find( diff --git a/cpp/cmake/thirdparty/get_jitify.cmake b/cpp/cmake/thirdparty/get_jitify.cmake index b7c90952c95..d98abdf8824 100644 --- a/cpp/cmake/thirdparty/get_jitify.cmake +++ b/cpp/cmake/thirdparty/get_jitify.cmake @@ -18,7 +18,7 @@ function(find_and_configure_jitify) rapids_cpm_find( jitify 2.0.0 - GIT_REPOSITORY https://github.com/NVIDIA/jitify.git + GIT_REPOSITORY https://github.com/rapidsai/jitify.git GIT_TAG jitify2 GIT_SHALLOW TRUE DOWNLOAD_ONLY TRUE diff --git a/cpp/cmake/thirdparty/get_thrust.cmake b/cpp/cmake/thirdparty/get_thrust.cmake index 379b1521bf0..25a4c9dd3ba 100644 --- a/cpp/cmake/thirdparty/get_thrust.cmake +++ b/cpp/cmake/thirdparty/get_thrust.cmake @@ -13,73 +13,34 @@ # ============================================================================= # This function finds thrust and sets any additional necessary environment variables. -function(find_and_configure_thrust VERSION) - # We only want to set `UPDATE_DISCONNECTED` while the GIT tag hasn't moved from the last time we - # cloned - set(cpm_thrust_disconnect_update "UPDATE_DISCONNECTED TRUE") - set(CPM_THRUST_CURRENT_VERSION - ${VERSION} - CACHE STRING "version of thrust we checked out" - ) - if(NOT VERSION VERSION_EQUAL CPM_THRUST_CURRENT_VERSION) - set(CPM_THRUST_CURRENT_VERSION - ${VERSION} - CACHE STRING "version of thrust we checked out" FORCE - ) - set(cpm_thrust_disconnect_update "") - endif() +function(find_and_configure_thrust) - # We currently require cuDF to always build with a custom version of thrust. 
This is needed so - # that build times of of cudf are kept reasonable, without this CI builds of cudf will be killed - # as some source file can take over 45 minutes to build - # - set(CPM_DOWNLOAD_ALL TRUE) - rapids_cpm_find( - Thrust ${VERSION} - BUILD_EXPORT_SET cudf-exports - INSTALL_EXPORT_SET cudf-exports - CPM_ARGS - GIT_REPOSITORY https://github.com/NVIDIA/thrust.git - GIT_TAG ${VERSION} - GIT_SHALLOW TRUE ${cpm_thrust_disconnect_update} - PATCH_COMMAND patch --reject-file=- -p1 -N < ${CUDF_SOURCE_DIR}/cmake/thrust.patch || true - OPTIONS "THRUST_INSTALL TRUE" - ) + include(${rapids-cmake-dir}/cpm/thrust.cmake) + include(${rapids-cmake-dir}/cpm/package_override.cmake) - if(NOT TARGET cudf::Thrust) - thrust_create_target(cudf::Thrust FROM_OPTIONS) - endif() + set(cudf_patch_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/patches") + rapids_cpm_package_override("${cudf_patch_dir}/thrust_override.json") - if(Thrust_SOURCE_DIR) # only install thrust when we have an in-source version - include(GNUInstallDirs) - install( - DIRECTORY "${Thrust_SOURCE_DIR}/thrust" - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/libcudf/Thrust/" - FILES_MATCHING - REGEX "\\.(h|inl)$" - ) - install( - DIRECTORY "${Thrust_SOURCE_DIR}/dependencies/cub/cub" - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/libcudf/Thrust/dependencies/" - FILES_MATCHING - PATTERN "*.cuh" - ) + # Make sure we install thrust into the `include/libcudf` subdirectory instead of the default + include(GNUInstallDirs) + set(CMAKE_INSTALL_INCLUDEDIR "${CMAKE_INSTALL_INCLUDEDIR}/libcudf") + set(CMAKE_INSTALL_LIBDIR "${CMAKE_INSTALL_INCLUDEDIR}/lib") - install(DIRECTORY "${Thrust_SOURCE_DIR}/thrust/cmake" - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/libcudf/Thrust/thrust/" - ) - install(DIRECTORY "${Thrust_SOURCE_DIR}/dependencies/cub/cub/cmake" - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/libcudf/Thrust/dependencies/cub/" - ) + # Find or install Thrust with our custom set of patches + rapids_cpm_thrust( + NAMESPACE cudf + 
BUILD_EXPORT_SET cudf-exports + INSTALL_EXPORT_SET cudf-exports + ) + if(Thrust_SOURCE_DIR) # Store where CMake can find our custom Thrust install include("${rapids-cmake-dir}/export/find_package_root.cmake") rapids_export_find_package_root( - INSTALL Thrust [=[${CMAKE_CURRENT_LIST_DIR}/../../../include/libcudf/Thrust/]=] cudf-exports + INSTALL Thrust [=[${CMAKE_CURRENT_LIST_DIR}/../../../include/libcudf/lib/cmake/thrust]=] + cudf-exports ) endif() endfunction() -set(CUDF_MIN_VERSION_Thrust 1.17.2) - -find_and_configure_thrust(${CUDF_MIN_VERSION_Thrust}) +find_and_configure_thrust() diff --git a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff b/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff new file mode 100644 index 00000000000..382f7dca1b0 --- /dev/null +++ b/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff @@ -0,0 +1,29 @@ +diff --git a/thrust/system/cuda/detail/dispatch.h b/thrust/system/cuda/detail/dispatch.h +index d0e3f94..76774b0 100644 +--- a/thrust/system/cuda/detail/dispatch.h ++++ b/thrust/system/cuda/detail/dispatch.h +@@ -32,9 +32,8 @@ + status = call arguments; \ + } \ + else { \ +- auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ +- status = call arguments; \ +- } ++ throw std::runtime_error("THRUST_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ ++ } + + /** + * Dispatch between 32-bit and 64-bit index based versions of the same algorithm +@@ -52,10 +51,8 @@ + status = call arguments; \ + } \ + else { \ +- auto THRUST_PP_CAT2(count1, _fixed) = static_cast(count1); \ +- auto THRUST_PP_CAT2(count2, _fixed) = static_cast(count2); \ +- status = call arguments; \ +- } ++ throw std::runtime_error("THRUST_DOUBLE_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ ++ } + /** + * Dispatch between 32-bit and 64-bit index based versions of the same algorithm + * implementation. 
This version allows using different token sequences for callables diff --git a/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times.diff b/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times.diff new file mode 100644 index 00000000000..6bf165805cc --- /dev/null +++ b/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times.diff @@ -0,0 +1,39 @@ +diff --git a/dependencies/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/dependencies/cub/cub/device/dispatch/dispatch_radix_sort.cuh +index b188c75f..3f36656f 100644 +--- a/dependencies/cub/cub/device/dispatch/dispatch_radix_sort.cuh ++++ b/dependencies/cub/cub/device/dispatch/dispatch_radix_sort.cuh +@@ -736,7 +736,7 @@ struct DeviceRadixSortPolicy + + + /// SM60 (GP100) +- struct Policy600 : ChainedPolicy<600, Policy600, Policy500> ++ struct Policy600 : ChainedPolicy<600, Policy600, Policy600> + { + enum { + PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 6.9B 32b keys/s (Quadro P100) +diff --git a/dependencies/cub/cub/device/dispatch/dispatch_reduce.cuh b/dependencies/cub/cub/device/dispatch/dispatch_reduce.cuh +index e0470ccb..6a0c2ed6 100644 +--- a/dependencies/cub/cub/device/dispatch/dispatch_reduce.cuh ++++ b/dependencies/cub/cub/device/dispatch/dispatch_reduce.cuh +@@ -280,7 +280,7 @@ struct DeviceReducePolicy + }; + + /// SM60 +- struct Policy600 : ChainedPolicy<600, Policy600, Policy350> ++ struct Policy600 : ChainedPolicy<600, Policy600, Policy600> + { + // ReducePolicy (P100: 591 GB/s @ 64M 4B items; 583 GB/s @ 256M 1B items) + typedef AgentReducePolicy< +diff --git a/dependencies/cub/cub/device/dispatch/dispatch_scan.cuh b/dependencies/cub/cub/device/dispatch/dispatch_scan.cuh +index c2d04588..ac2d10e0 100644 +--- a/dependencies/cub/cub/device/dispatch/dispatch_scan.cuh ++++ b/dependencies/cub/cub/device/dispatch/dispatch_scan.cuh +@@ -177,7 +177,7 @@ struct DeviceScanPolicy + }; + + /// SM600 +- struct Policy600 : ChainedPolicy<600, Policy600, Policy520> ++ struct Policy600 : 
ChainedPolicy<600, Policy600, Policy600> + { + typedef AgentScanPolicy< + 128, 15, ///< Threads per block, items per thread diff --git a/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff b/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff new file mode 100644 index 00000000000..864c89d4504 --- /dev/null +++ b/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff @@ -0,0 +1,48 @@ +diff --git a/dependencies/cub/cub/block/block_merge_sort.cuh b/dependencies/cub/cub/block/block_merge_sort.cuh +index 4769df36..d86d6342 100644 +--- a/dependencies/cub/cub/block/block_merge_sort.cuh ++++ b/dependencies/cub/cub/block/block_merge_sort.cuh +@@ -91,7 +91,7 @@ __device__ __forceinline__ void SerialMerge(KeyT *keys_shared, + KeyT key1 = keys_shared[keys1_beg]; + KeyT key2 = keys_shared[keys2_beg]; + +-#pragma unroll ++#pragma unroll 1 + for (int item = 0; item < ITEMS_PER_THREAD; ++item) + { + bool p = (keys2_beg < keys2_end) && +@@ -383,7 +383,7 @@ public: + // + KeyT max_key = oob_default; + +- #pragma unroll ++ #pragma unroll 1 + for (int item = 1; item < ITEMS_PER_THREAD; ++item) + { + if (ITEMS_PER_THREAD * linear_tid + item < valid_items) +@@ -407,7 +407,7 @@ public: + // each thread has sorted keys + // merge sort keys in shared memory + // +- #pragma unroll ++ #pragma unroll 1 + for (int target_merged_threads_number = 2; + target_merged_threads_number <= NUM_THREADS; + target_merged_threads_number *= 2) +diff --git a/dependencies/cub/cub/thread/thread_sort.cuh b/dependencies/cub/cub/thread/thread_sort.cuh +index 5d486789..b42fb5f0 100644 +--- a/dependencies/cub/cub/thread/thread_sort.cuh ++++ b/dependencies/cub/cub/thread/thread_sort.cuh +@@ -83,10 +83,10 @@ StableOddEvenSort(KeyT (&keys)[ITEMS_PER_THREAD], + { + constexpr bool KEYS_ONLY = std::is_same::value; + +- #pragma unroll ++ #pragma unroll 1 + for (int i = 0; i < ITEMS_PER_THREAD; ++i) + { +- #pragma unroll ++ #pragma unroll 1 + for (int j = 1 & i; j < 
ITEMS_PER_THREAD - 1; j += 2) + { + if (compare_op(keys[j + 1], keys[j])) diff --git a/cpp/cmake/thirdparty/patches/thrust_override.json b/cpp/cmake/thirdparty/patches/thrust_override.json new file mode 100644 index 00000000000..f1908a64719 --- /dev/null +++ b/cpp/cmake/thirdparty/patches/thrust_override.json @@ -0,0 +1,34 @@ + +{ + "packages" : { + "Thrust" : { + "patches" : [ + { + "file" : "Thrust/install_rules.diff", + "issue" : "Thrust 1.X installs incorrect files [https://github.com/NVIDIA/thrust/issues/1790]", + "fixed_in" : "2.0.0" + }, + { + "file" : "${current_json_dir}/thrust_transform_iter_with_reduce_by_key.diff", + "issue" : "Support transform_output_iterator as output of reduce by key [https://github.com/NVIDIA/thrust/pull/1805]", + "fixed_in" : "2.1" + }, + { + "file" : "${current_json_dir}/thrust_disable_64bit_dispatching.diff", + "issue" : "Remove 64bit dispatching as not needed by libcudf and results in compiling twice as many kernels [https://github.com/rapidsai/cudf/pull/11437]", + "fixed_in" : "" + }, + { + "file" : "${current_json_dir}/thrust_faster_sort_compile_times.diff", + "issue" : "Improve Thrust sort compile times by not unrolling loops for inlined comparators [https://github.com/rapidsai/cudf/pull/10577]", + "fixed_in" : "" + }, + { + "file" : "${current_json_dir}/thrust_faster_scan_compile_times.diff", + "issue" : "Improve Thrust scan compile times by reducing the number of kernels generated [https://github.com/rapidsai/cudf/pull/8183]", + "fixed_in" : "" + } + ] + } + } +} diff --git a/cpp/cmake/thirdparty/patches/thrust_transform_iter_with_reduce_by_key.diff b/cpp/cmake/thirdparty/patches/thrust_transform_iter_with_reduce_by_key.diff new file mode 100644 index 00000000000..6a56af90d0d --- /dev/null +++ b/cpp/cmake/thirdparty/patches/thrust_transform_iter_with_reduce_by_key.diff @@ -0,0 +1,26 @@ +diff --git a/thrust/iterator/transform_input_output_iterator.h b/thrust/iterator/transform_input_output_iterator.h +index f512a36..a5f725d 
100644 +--- a/thrust/iterator/transform_input_output_iterator.h ++++ b/thrust/iterator/transform_input_output_iterator.h +@@ -102,6 +102,8 @@ template + /*! \endcond + */ + ++ transform_input_output_iterator() = default; ++ + /*! This constructor takes as argument a \c Iterator an \c InputFunction and an + * \c OutputFunction and copies them to a new \p transform_input_output_iterator + * +diff --git a/thrust/iterator/transform_output_iterator.h b/thrust/iterator/transform_output_iterator.h +index 66fb46a..4a68cb5 100644 +--- a/thrust/iterator/transform_output_iterator.h ++++ b/thrust/iterator/transform_output_iterator.h +@@ -104,6 +104,8 @@ template + /*! \endcond + */ + ++ transform_output_iterator() = default; ++ + /*! This constructor takes as argument an \c OutputIterator and an \c + * UnaryFunction and copies them to a new \p transform_output_iterator + * diff --git a/cpp/cmake/thrust.patch b/cpp/cmake/thrust.patch deleted file mode 100644 index ae1962e4738..00000000000 --- a/cpp/cmake/thrust.patch +++ /dev/null @@ -1,116 +0,0 @@ -diff --git a/cub/block/block_merge_sort.cuh b/cub/block/block_merge_sort.cuh -index 4769df36..d86d6342 100644 ---- a/cub/block/block_merge_sort.cuh -+++ b/cub/block/block_merge_sort.cuh -@@ -91,7 +91,7 @@ __device__ __forceinline__ void SerialMerge(KeyT *keys_shared, - KeyT key1 = keys_shared[keys1_beg]; - KeyT key2 = keys_shared[keys2_beg]; - --#pragma unroll -+#pragma unroll 1 - for (int item = 0; item < ITEMS_PER_THREAD; ++item) - { - bool p = (keys2_beg < keys2_end) && -@@ -383,7 +383,7 @@ public: - // - KeyT max_key = oob_default; - -- #pragma unroll -+ #pragma unroll 1 - for (int item = 1; item < ITEMS_PER_THREAD; ++item) - { - if (ITEMS_PER_THREAD * linear_tid + item < valid_items) -@@ -407,7 +407,7 @@ public: - // each thread has sorted keys - // merge sort keys in shared memory - // -- #pragma unroll -+ #pragma unroll 1 - for (int target_merged_threads_number = 2; - target_merged_threads_number <= NUM_THREADS; - 
target_merged_threads_number *= 2) -diff --git a/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/device/dispatch/dispatch_radix_sort.cuh -index b188c75f..3f36656f 100644 ---- a/cub/device/dispatch/dispatch_radix_sort.cuh -+++ b/cub/device/dispatch/dispatch_radix_sort.cuh -@@ -736,7 +736,7 @@ struct DeviceRadixSortPolicy - - - /// SM60 (GP100) -- struct Policy600 : ChainedPolicy<600, Policy600, Policy500> -+ struct Policy600 : ChainedPolicy<600, Policy600, Policy600> - { - enum { - PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 6.9B 32b keys/s (Quadro P100) -diff --git a/cub/device/dispatch/dispatch_reduce.cuh b/cub/device/dispatch/dispatch_reduce.cuh -index e0470ccb..6a0c2ed6 100644 ---- a/cub/device/dispatch/dispatch_reduce.cuh -+++ b/cub/device/dispatch/dispatch_reduce.cuh -@@ -280,7 +280,7 @@ struct DeviceReducePolicy - }; - - /// SM60 -- struct Policy600 : ChainedPolicy<600, Policy600, Policy350> -+ struct Policy600 : ChainedPolicy<600, Policy600, Policy600> - { - // ReducePolicy (P100: 591 GB/s @ 64M 4B items; 583 GB/s @ 256M 1B items) - typedef AgentReducePolicy< -diff --git a/cub/device/dispatch/dispatch_scan.cuh b/cub/device/dispatch/dispatch_scan.cuh -index c2d04588..ac2d10e0 100644 ---- a/cub/device/dispatch/dispatch_scan.cuh -+++ b/cub/device/dispatch/dispatch_scan.cuh -@@ -177,7 +177,7 @@ struct DeviceScanPolicy - }; - - /// SM600 -- struct Policy600 : ChainedPolicy<600, Policy600, Policy520> -+ struct Policy600 : ChainedPolicy<600, Policy600, Policy600> - { - typedef AgentScanPolicy< - 128, 15, ///< Threads per block, items per thread -diff --git a/cub/thread/thread_sort.cuh b/cub/thread/thread_sort.cuh -index 5d486789..b42fb5f0 100644 ---- a/cub/thread/thread_sort.cuh -+++ b/cub/thread/thread_sort.cuh -@@ -83,10 +83,10 @@ StableOddEvenSort(KeyT (&keys)[ITEMS_PER_THREAD], - { - constexpr bool KEYS_ONLY = std::is_same::value; - -- #pragma unroll -+ #pragma unroll 1 - for (int i = 0; i < ITEMS_PER_THREAD; ++i) - { -- #pragma unroll -+ #pragma 
unroll 1 - for (int j = 1 & i; j < ITEMS_PER_THREAD - 1; j += 2) - { - if (compare_op(keys[j + 1], keys[j])) -diff --git a/thrust/system/cuda/detail/dispatch.h b/thrust/system/cuda/detail/dispatch.h -index d0e3f94..76774b0 100644 ---- a/thrust/system/cuda/detail/dispatch.h -+++ b/thrust/system/cuda/detail/dispatch.h -@@ -32,9 +32,8 @@ - status = call arguments; \ - } \ - else { \ -- auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ -- status = call arguments; \ -- } -+ throw std::runtime_error("THRUST_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ -+ } - - /** - * Dispatch between 32-bit and 64-bit index based versions of the same algorithm -@@ -52,10 +51,8 @@ - status = call arguments; \ - } \ - else { \ -- auto THRUST_PP_CAT2(count1, _fixed) = static_cast(count1); \ -- auto THRUST_PP_CAT2(count2, _fixed) = static_cast(count2); \ -- status = call arguments; \ -- } -+ throw std::runtime_error("THRUST_DOUBLE_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ -+ } - /** - * Dispatch between 32-bit and 64-bit index based versions of the same algorithm - * implementation. This version allows using different token sequences for callables diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile index 871632b053d..4684e180f00 100644 --- a/cpp/doxygen/Doxyfile +++ b/cpp/doxygen/Doxyfile @@ -38,7 +38,7 @@ PROJECT_NAME = "libcudf" # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = 22.10.00 +PROJECT_NUMBER = 22.12.00 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a @@ -2162,7 +2162,7 @@ SKIP_FUNCTION_MACROS = YES # the path). If a tag file is not located in the directory in which doxygen is # run, you must also specify the path to the tagfile here. 
-TAGFILES = rmm.tag=https://docs.rapids.ai/api/librmm/22.10 +TAGFILES = rmm.tag=https://docs.rapids.ai/api/librmm/22.12 # When a file name is specified after GENERATE_TAGFILE, doxygen will create a # tag file that is based on the input files it reads. See section "Linking to diff --git a/cpp/doxygen/DoxygenLayout.xml b/cpp/doxygen/DoxygenLayout.xml index a78a1cb701f..ded88dfe531 100644 --- a/cpp/doxygen/DoxygenLayout.xml +++ b/cpp/doxygen/DoxygenLayout.xml @@ -12,29 +12,29 @@ - + - + - + - + - + diff --git a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md index b3774aeda38..3c085984a0e 100644 --- a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md +++ b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md @@ -176,7 +176,7 @@ Resource ownership is an essential concept in libcudf. In short, an "owning" obj resource (such as device memory). It acquires that resource during construction and releases the resource in destruction ([RAII](https://en.cppreference.com/w/cpp/language/raii)). A "non-owning" object does not own resources. Any class in libcudf with the `*_view` suffix is non-owning. For more -detail see the [`libcudf++` presentation.](https://docs.google.com/presentation/d/1zKzAtc1AWFKfMhiUlV5yRZxSiPLwsObxMlWRWz_f5hA/edit?usp=sharing) +detail see the [`libcudf` presentation.](https://docs.google.com/presentation/d/1zKzAtc1AWFKfMhiUlV5yRZxSiPLwsObxMlWRWz_f5hA/edit?usp=sharing) libcudf functions typically take views as input (`column_view` or `table_view`) and produce `unique_ptr`s to owning objects as output. For example, @@ -346,7 +346,72 @@ the device view can be obtained via function `column_device_view::create(column_ data, a specialized device view for list columns can be constructed via `lists_column_device_view(column_device_view)`. 
-# libcudf++ API and Implementation +# libcudf Policies and Design Principles + +`libcudf` is designed to provide thread-safe, single-GPU accelerated algorithm primitives for solving a wide variety of problems that arise in data science. +APIs are written to execute on the default GPU, which can be controlled by the caller through standard CUDA device APIs or environment variables like `CUDA_VISIBLE_DEVICES`. +Our goal is to enable diverse use cases like Spark or Pandas to benefit from the performance of GPUs, and libcudf relies on these higher-level layers like Spark or Dask to orchestrate multi-GPU tasks. + +To best satisfy these use-cases, libcudf prioritizes performance and flexibility, which sometimes may come at the cost of convenience. +While we welcome users to use libcudf directly, we design with the expectation that most users will be consuming libcudf through higher-level layers like Spark or cuDF Python that handle some of the details that direct users of libcudf must handle on their own. +We document these policies and the reasons behind them here. + +## libcudf does not introspect data + +libcudf APIs generally do not perform deep introspection and validation of input data. +There are numerous reasons for this: +1. It violates the single responsibility principle: validation is separate from execution. +2. Since libcudf data structures store data on the GPU, any validation incurs _at minimum_ the overhead of a kernel launch, and may in general be prohibitively expensive. +3. API promises around data introspection often significantly complicate implementation. + +Users are therefore responsible for passing valid data into such APIs. +_Note that this policy does not mean that libcudf performs no validation whatsoever_. +libcudf APIs should still perform any validation that does not require introspection. +To give some idea of what should or should not be validated, here are (non-exhaustive) lists of examples.
+ +**Things that libcudf should validate**: +- Input column/table sizes or dtypes + +**Things that libcudf should not validate**: +- Integer overflow +- Ensuring that outputs will not exceed the 2GB size limit for a given set of inputs + + +## libcudf expects nested types to have sanitized null masks + +Various libcudf APIs accepting columns of nested dtypes (such as `LIST` or `STRUCT`) may assume that these columns have been sanitized. +In this context, sanitization refers to ensuring that the null elements in a column with a nested dtype are compatible with the elements of nested columns. +Specifically: +- Null elements of list columns should also be empty. The starting offset of a null element should be equal to the ending offset. +- Null elements of struct columns should also be null elements in the underlying structs. +- For compound columns, nulls should only be present at the level of the parent column. Child columns should not contain nulls. +- Slice operations on nested columns do not propagate offsets to child columns. + +libcudf APIs _should_ promise to never return "dirty" columns, i.e. columns containing unsanitized data. +Therefore, the only problem is if users construct input columns that are not correctly sanitized and then pass those into libcudf APIs. + +## Treat libcudf APIs as if they were asynchronous + +libcudf APIs called on the host do not guarantee that the stream is synchronized before returning. +Work in libcudf occurs on `cudf::get_default_stream().value()`, which defaults to the CUDA default stream (stream 0). +Note that the stream 0 behavior differs if [per-thread default stream is enabled](https://docs.nvidia.com/cuda/cuda-runtime-api/stream-sync-behavior.html) via `CUDF_USE_PER_THREAD_DEFAULT_STREAM`. +Any data provided to or returned by libcudf that uses a separate non-blocking stream requires synchronization with the default libcudf stream to ensure stream safety.
+ +## libcudf generally does not make ordering guarantees + +Functions like merge or groupby in libcudf make no guarantees about the order of entries in the output. +Promising deterministic ordering is not, in general, conducive to fast parallel algorithms. +Calling code is responsible for performing sorts after the fact if sorted outputs are needed. + +## libcudf does not promise specific exception messages + +libcudf documents the exceptions that will be thrown by an API for different kinds of invalid inputs. +The types of those exceptions (e.g. `cudf::logic_error`) are part of the public API. +However, the explanatory string returned by the `what` method of those exceptions is not part of the API and is subject to change. +Calling code should not rely on the contents of libcudf error messages to determine the nature of the error. +For information on the types of exceptions that libcudf throws under different circumstances, see the [section on error handling](#errors). + +# libcudf API and Implementation ## Streams @@ -359,7 +424,7 @@ internal API in the `detail` namespace. The internal `detail` API has the same p public API, plus a `rmm::cuda_stream_view` parameter at the end with no default value. If the detail API also accepts a memory resource parameter, the stream parameter should be ideally placed just *before* the memory resource. The public API will call the detail API and provide -`cudf::default_stream_value`. The implementation should be wholly contained in the `detail` API +`cudf::get_default_stream()`. The implementation should be wholly contained in the `detail` API definition and use only asynchronous versions of CUDA APIs with the stream parameter. In order to make the `detail` API callable from other libcudf functions, it should be exposed in a @@ -390,7 +455,7 @@ namespace detail{ void external_function(...){ CUDF_FUNC_RANGE(); // Generates an NVTX range for the lifetime of this function. 
- detail::external_function(..., cudf::default_stream_value); + detail::external_function(..., cudf::get_default_stream()); } ``` @@ -780,7 +845,7 @@ description of what has broken from the past release. Label pull requests that c with the "non-breaking" tag. -# Error Handling +# Error Handling {#errors} libcudf follows conventions (and provides utilities) enforcing compile-time and run-time conditions and detecting and handling CUDA errors. Communication of errors is always via C++ diff --git a/cpp/doxygen/developer_guide/DOCUMENTATION.md b/cpp/doxygen/developer_guide/DOCUMENTATION.md index 8a7d89c8dbd..b86f7db82b0 100644 --- a/cpp/doxygen/developer_guide/DOCUMENTATION.md +++ b/cpp/doxygen/developer_guide/DOCUMENTATION.md @@ -1,4 +1,4 @@ -# libcudf++ C++ Documentation Guide +# libcudf C++ Documentation Guide These guidelines apply to documenting all libcudf C++ source files using doxygen style formatting although only public APIs and classes are actually [published](https://docs.rapids.ai/api/libcudf/stable/index.html). @@ -224,7 +224,7 @@ Also, \@copydoc is useful when documenting a `detail` function that differs only */ std::vector segmented_count_set_bits(bitmask_type const* bitmask, std::vector const& indices, - rmm::cuda_stream_view stream = cudf::default_stream_value); + rmm::cuda_stream_view stream = cudf::get_default_stream()); Note, you must specify the whole signature of the function, including optional parameters, so that doxygen will be able to locate it. diff --git a/cpp/doxygen/developer_guide/TESTING.md b/cpp/doxygen/developer_guide/TESTING.md index 31747e31ccb..198590bb35c 100644 --- a/cpp/doxygen/developer_guide/TESTING.md +++ b/cpp/doxygen/developer_guide/TESTING.md @@ -6,6 +6,13 @@ Unit tests in libcudf are written using **Important:** Instead of including `gtest/gtest.h` directly, use `#include `. +Also, write test code in the global namespace. 
That is, +do not write test code in the `cudf` or the `cudf::test` namespace or their +sub-namespaces. +Likewise, do not use `using namespace cudf;` or `using namespace cudf::test;` +in the global namespace. + + ## Best Practices: What Should We Test? In general we should test to make sure all code paths are covered. This is not always easy or @@ -38,8 +45,8 @@ groupby). Here are some other guidelines. does happen); columns with zero size but that somehow have non-null data pointers; and struct columns with no children. - * Decimal types are not included in the `NumericTypes` type list, but are included in - `FixedWidthTypes`, so be careful that tests either include or exclude decimal types as + * Decimal types are not included in the `cudf::test::NumericTypes` type list, but are included in + `cudf::test::FixedWidthTypes`, so be careful that tests either include or exclude decimal types as appropriate. @@ -99,8 +106,8 @@ list defined in `TestTypes` (`int, float, double`). The list of types that are used in tests should be consistent across all tests. To ensure consistency, several sets of common type lists are provided in -`include/cudf_test/type_lists.hpp`. For example, `NumericTypes` is a type list of all numeric types, -`FixedWidthTypes` is a list of all fixed-width element types, and `AllTypes` is a list of every +`include/cudf_test/type_lists.hpp`. For example, `cudf::test::NumericTypes` is a type list of all numeric types, +`cudf::test::FixedWidthTypes` is a list of all fixed-width element types, and `cudf::test::AllTypes` is a list of every element type that libcudf supports. ```c++ @@ -126,9 +133,8 @@ the `N`th type within the nested list, use `GetType`. Imagine testing all possible two-type combinations of ``.
This could be done manually: ```c++ -using namespace cudf::test; template -TwoTypesFixture : BaseFixture{...}; +TwoTypesFixture : cudf::test::BaseFixture{...}; using TwoTypesList = Types< Types, Types, Types, Types >; TYPED_TEST_SUITE(TwoTypesFixture, TwoTypesList); @@ -178,9 +184,9 @@ transparently passed to any API expecting a `column_view` or `mutable_column_vie #### fixed_width_column_wrapper -The `fixed_width_column_wrapper` class should be used for constructing and initializing columns of +The `cudf::test::fixed_width_column_wrapper` class should be used for constructing and initializing columns of any fixed-width element type, e.g., numeric types, timestamp types, Boolean, etc. -`fixed_width_column_wrapper` provides constructors that accept an iterator range to generate each +`cudf::test::fixed_width_column_wrapper` provides constructors that accept an iterator range to generate each element in the column. For nullable columns, an additional iterator can be provided to indicate the validity of each element. There are also constructors that accept a `std::initializer_list` for the column elements and optionally for the validity of each element. 
@@ -189,25 +195,25 @@ Example: ```c++ // Creates a non-nullable column of INT32 elements with 5 elements: {0, 1, 2, 3, 4} -auto elements = make_counting_transform_iterator(0, [](auto i){return i;}); -fixed_width_column_wrapper w(elements, elements + 5); +auto elements = cudf::detail::make_counting_transform_iterator(0, [](auto i){return i;}); +cudf::test::fixed_width_column_wrapper w(elements, elements + 5); // Creates a nullable column of INT32 elements with 5 elements: {null, 1, null, 3, null} -auto elements = make_counting_transform_iterator(0, [](auto i){return i;}); -auto validity = make_counting_transform_iterator(0, [](auto i){return i % 2;}) -fixed_width_column_wrapper w(elements, elements + 5, validity); +auto elements = cudf::detail::make_counting_transform_iterator(0, [](auto i){return i;}); +auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i){return i % 2;}) +cudf::test::fixed_width_column_wrapper w(elements, elements + 5, validity); // Creates a non-nullable INT32 column with 4 elements: {1, 2, 3, 4} -fixed_width_column_wrapper w{{1, 2, 3, 4}}; +cudf::test::fixed_width_column_wrapper w{{1, 2, 3, 4}}; // Creates a nullable INT32 column with 4 elements: {1, NULL, 3, NULL} -fixed_width_column_wrapper w{ {1,2,3,4}, {1, 0, 1, 0}}; +cudf::test::fixed_width_column_wrapper w{ {1,2,3,4}, {1, 0, 1, 0}}; ``` #### fixed_point_column_wrapper -The `fixed_point_column_wrapper` class should be used for constructing and initializing columns of -any fixed-point element type (DECIMAL32 or DECIMAL64). `fixed_point_column_wrapper` provides +The `cudf::test::fixed_point_column_wrapper` class should be used for constructing and initializing columns of +any fixed-point element type (DECIMAL32 or DECIMAL64). `cudf::test::fixed_point_column_wrapper` provides constructors that accept an iterator range to generate each element in the column. For nullable columns, an additional iterator can be provided to indicate the validity of each element. 
Constructors also take the scale of the fixed-point values to create. @@ -215,20 +221,20 @@ Constructors also take the scale of the fixed-point values to create. Example: ```c++ - // Creates a non-nullable column of 4 DECIMAL32 elements of scale 3: {1000, 2000, 3000, 4000} - auto elements = make_counting_transform_iterator(0, [](auto i){ return i; }); - fixed_point_column_wrapper w(elements, elements + 4, 3); - - // Creates a nullable column of 5 DECIMAL32 elements of scale 2: {null, 100, null, 300, null} - auto elements = make_counting_transform_iterator(0, [](auto i){ return i; }); - auto validity = make_counting_transform_iterator(0, [](auto i){ return i % 2; }); - fixed_point_column_wrapper w(elements, elements + 5, validity, 2); +// Creates a non-nullable column of 4 DECIMAL32 elements of scale 3: {1000, 2000, 3000, 4000} +auto elements = cudf::detail::make_counting_transform_iterator(0, [](auto i){ return i; }); +cudf::test::fixed_point_column_wrapper w(elements, elements + 4, 3); + +// Creates a nullable column of 5 DECIMAL32 elements of scale 2: {null, 100, null, 300, null} +auto elements = cudf::detail::make_counting_transform_iterator(0, [](auto i){ return i; }); +auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i){ return i % 2; }); +cudf::test::fixed_point_column_wrapper w(elements, elements + 5, validity, 2); ``` #### dictionary_column_wrapper -The `dictionary_column_wrapper` class should be used to create dictionary columns. -`dictionary_column_wrapper` provides constructors that accept an iterator range to generate each +The `cudf::test::dictionary_column_wrapper` class should be used to create dictionary columns. +`cudf::test::dictionary_column_wrapper` provides constructors that accept an iterator range to generate each element in the column. For nullable columns, an additional iterator can be provided to indicate the validity of each element. 
There are also constructors that accept a `std::initializer_list` for the column elements and optionally for the validity of each element. @@ -239,43 +245,43 @@ Example: // Creates a non-nullable dictionary column of INT32 elements with 5 elements // keys = {0, 2, 6}, indices = {0, 1, 1, 2, 2} std::vector elements{0, 2, 2, 6, 6}; -dictionary_column_wrapper w(element.begin(), elements.end()); +cudf::test::dictionary_column_wrapper w(elements.begin(), elements.end()); // Creates a nullable dictionary column with 5 elements and a validity iterator. std::vector elements{0, 2, 0, 6, 0}; // Validity iterator here sets even rows to null. -auto validity = make_counting_transform_iterator(0, [](auto i){return i % 2;}) +auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i){return i % 2;}); // keys = {2, 6}, indices = {NULL, 0, NULL, 1, NULL} -dictionary_column_wrapper w(elements, elements + 5, validity); +cudf::test::dictionary_column_wrapper w(elements, elements + 5, validity); // Creates a non-nullable dictionary column with 4 elements. // keys = {1, 2, 3}, indices = {0, 1, 2, 0} -dictionary_column_wrapper w{{1, 2, 3, 1}}; +cudf::test::dictionary_column_wrapper w{{1, 2, 3, 1}}; // Creates a nullable dictionary column with 4 elements and validity initializer. // keys = {1, 3}, indices = {0, NULL, 1, NULL} -dictionary_column_wrapper w{ {1, 0, 3, 0}, {1, 0, 1, 0}}; +cudf::test::dictionary_column_wrapper w{ {1, 0, 3, 0}, {1, 0, 1, 0}}; // Creates a nullable column of dictionary elements with 5 elements and validity initializer. 
std::vector elements{0, 2, 2, 6, 6}; // keys = {2, 6}, indices = {NULL, 0, NULL, 1, NULL} -dictionary_width_column_wrapper w(elements, elements + 5, {0, 1, 0, 1, 0}); +cudf::test::dictionary_column_wrapper w(elements, elements + 5, {0, 1, 0, 1, 0}); // Creates a non-nullable dictionary column with 7 string elements std::vector strings{"", "aaa", "bbb", "aaa", "bbb", "ccc", "bbb"}; // keys = {"","aaa","bbb","ccc"}, indices = {0, 1, 2, 1, 2, 3, 2} -dictionary_column_wrapper d(strings.begin(), strings.end()); +cudf::test::dictionary_column_wrapper d(strings.begin(), strings.end()); // Creates a nullable dictionary column with 7 string elements and a validity iterator. // Validity iterator here sets even rows to null. // keys = {"a", "bb"}, indices = {NULL, 1, NULL, 1, NULL, 0, NULL} -auto validity = make_counting_transform_iterator(0, [](auto i){return i % 2;}); -dictionary_column_wrapper d({"", "bb", "", "bb", "", "a", ""}, validity); +auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i){return i % 2;}); +cudf::test::dictionary_column_wrapper d({"", "bb", "", "bb", "", "a", ""}, validity); ``` #### strings_column_wrapper -The `strings_column_wrapper` class should be used to create columns of strings. It provides +The `cudf::test::strings_column_wrapper` class should be used to create columns of strings. It provides constructors that accept an iterator range to generate each string in the column. For nullable columns, an additional iterator can be provided to indicate the validity of each string. 
There are also constructors that accept a `std::initializer_list` for the column's strings and @@ -287,27 +293,27 @@ Example: // Creates a non-nullable STRING column with 7 string elements: // {"", "this", "is", "a", "column", "of", "strings"} std::vector strings{"", "this", "is", "a", "column", "of", "strings"}; -strings_column_wrapper s(strings.begin(), strings.end()); +cudf::test::strings_column_wrapper s(strings.begin(), strings.end()); // Creates a nullable STRING column with 7 string elements: // {NULL, "this", NULL, "a", NULL, "of", NULL} std::vector strings{"", "this", "is", "a", "column", "of", "strings"}; -auto validity = make_counting_transform_iterator(0, [](auto i){return i % 2;}); -strings_column_wrapper s(strings.begin(), strings.end(), validity); +auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i){return i % 2;}); +cudf::test::strings_column_wrapper s(strings.begin(), strings.end(), validity); // Creates a non-nullable STRING column with 7 string elements: // {"", "this", "is", "a", "column", "of", "strings"} -strings_column_wrapper s({"", "this", "is", "a", "column", "of", "strings"}); +cudf::test::strings_column_wrapper s({"", "this", "is", "a", "column", "of", "strings"}); // Creates a nullable STRING column with 7 string elements: // {NULL, "this", NULL, "a", NULL, "of", NULL} -auto validity = make_counting_transform_iterator(0, [](auto i){return i % 2;}); -strings_column_wrapper s({"", "this", "is", "a", "column", "of", "strings"}, validity); +auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i){return i % 2;}); +cudf::test::strings_column_wrapper s({"", "this", "is", "a", "column", "of", "strings"}, validity); ``` #### lists_column_wrapper -The `lists_column_wrapper` class should be used to create columns of lists. It provides +The `cudf::test::lists_column_wrapper` class should be used to create columns of lists. 
It provides constructors that accept an iterator range to generate each list in the column. For nullable columns, an additional iterator can be provided to indicate the validity of each list. There are also constructors that accept a `std::initializer_list` for the column's lists and @@ -318,50 +324,50 @@ Example: ```c++ // Creates an empty LIST column // [] -lists_column_wrapper l{}; +cudf::test::lists_column_wrapper l{}; // Creates a LIST column with 1 list composed of 2 total integers // [{0, 1}] -lists_column_wrapper l{0, 1}; +cudf::test::lists_column_wrapper l{0, 1}; // Creates a LIST column with 3 lists // [{0, 1}, {2, 3}, {4, 5}] -lists_column_wrapper l{ {0, 1}, {2, 3}, {4, 5} }; +cudf::test::lists_column_wrapper l{ {0, 1}, {2, 3}, {4, 5} }; // Creates a LIST of LIST columns with 2 lists on the top level and // 4 below // [ {{0, 1}, {2, 3}}, {{4, 5}, {6, 7}} ] -lists_column_wrapper l{ {{0, 1}, {2, 3}}, {{4, 5}, {6, 7}} }; +cudf::test::lists_column_wrapper l{ {{0, 1}, {2, 3}}, {{4, 5}, {6, 7}} }; // Creates a LIST column with 1 list composed of 5 total integers // [{0, 1, 2, 3, 4}] -auto elements = make_counting_transform_iterator(0, [](auto i){return i*2;}); -lists_column_wrapper l(elements, elements+5); +auto elements = cudf::detail::make_counting_transform_iterator(0, [](auto i){return i*2;}); +cudf::test::lists_column_wrapper l(elements, elements+5); // Creates a LIST column with 1 lists composed of 2 total integers // [{0, NULL}] -auto validity = make_counting_transform_iterator(0, [](auto i){return i % 2;}); -lists_column_wrapper l{{0, 1}, validity}; +auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i){return i % 2;}); +cudf::test::lists_column_wrapper l{{0, 1}, validity}; // Creates a LIST column with 1 lists composed of 5 total integers // [{0, NULL, 2, NULL, 4}] -auto elements = make_counting_transform_iterator(0, [](auto i){return i*2;}); -auto validity = make_counting_transform_iterator(0, [](auto i){return i % 2;}); 
-lists_column_wrapper l(elements, elements+5, validity); +auto elements = cudf::detail::make_counting_transform_iterator(0, [](auto i){return i*2;}); +auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i){return i % 2;}); +cudf::test::lists_column_wrapper l(elements, elements+5, validity); // Creates a LIST column with 1 list composed of 2 total strings // [{"abc", "def"}] -lists_column_wrapper l{"abc", "def"}; +cudf::test::lists_column_wrapper l{"abc", "def"}; // Creates a LIST of LIST columns with 2 lists on the top level and 4 below // [ {{0, 1}, NULL}, {{4, 5}, NULL} ] -auto validity = make_counting_transform_iterator(0, [](auto i){return i % 2;}); -lists_column_wrapper l{ {{{0, 1}, {2, 3}}, validity}, {{{4, 5}, {6, 7}}, validity} }; +auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i){return i % 2;}); +cudf::test::lists_column_wrapper l{ {{{0, 1}, {2, 3}}, validity}, {{{4, 5}, {6, 7}}, validity} }; ``` #### structs_column_wrapper -The `structs_column_wrapper` class should be used to create columns of structs. It provides +The `cudf::test::structs_column_wrapper` class should be used to create columns of structs. It provides constructors that accept a vector or initializer list of pre-constructed columns or column wrappers for child columns. For nullable columns, an additional iterator can be provided to indicate the validity of each struct. @@ -370,41 +376,41 @@ Examples: ```c++ // The following constructs a column for struct< int, string >. 
-auto child_int_col = fixed_width_column_wrapper{ 1, 2, 3, 4, 5 }.release(); -auto child_string_col = string_column_wrapper {"All", "the", "leaves", "are", "brown"}.release(); +auto child_int_col = cudf::test::fixed_width_column_wrapper{ 1, 2, 3, 4, 5 }.release(); +auto child_string_col = cudf::test::string_column_wrapper {"All", "the", "leaves", "are", "brown"}.release(); -std::vector> child_columns; +std::vector> child_columns; child_columns.push_back(std::move(child_int_col)); child_columns.push_back(std::move(child_string_col)); -struct_column_wrapper struct_column_wrapper{ +cudf::test::struct_column_wrapper wrapper{ child_cols, {1,0,1,0,1} // Validity }; -auto struct_col {struct_column_wrapper.release()}; +auto struct_col {wrapper.release()}; // The following constructs a column for struct< int, string >. -fixed_width_column_wrapper child_int_col_wrapper{ 1, 2, 3, 4, 5 }; -string_column_wrapper child_string_col_wrapper {"All", "the", "leaves", "are", "brown"}; +cudf::test::fixed_width_column_wrapper child_int_col_wrapper{ 1, 2, 3, 4, 5 }; +cudf::test::string_column_wrapper child_string_col_wrapper {"All", "the", "leaves", "are", "brown"}; -struct_column_wrapper struct_column_wrapper{ +cudf::test::struct_column_wrapper wrapper{ {child_int_col_wrapper, child_string_col_wrapper} {1,0,1,0,1} // Validity }; -auto struct_col {struct_column_wrapper.release()}; +auto struct_col {wrapper.release()}; // The following constructs a column for struct< int, string >. 
-fixed_width_column_wrapper child_int_col_wrapper{ 1, 2, 3, 4, 5 }; -string_column_wrapper child_string_col_wrapper {"All", "the", "leaves", "are", "brown"}; +cudf::test::fixed_width_column_wrapper child_int_col_wrapper{ 1, 2, 3, 4, 5 }; +cudf::test::string_column_wrapper child_string_col_wrapper {"All", "the", "leaves", "are", "brown"}; -struct_column_wrapper struct_column_wrapper{ +cudf::test::struct_column_wrapper wrapper{ {child_int_col_wrapper, child_string_col_wrapper} cudf::detail::make_counting_transform_iterator(0, [](auto i){ return i % 2; }) // Validity }; -auto struct_col {struct_column_wrapper.release()}; +auto struct_col {wrapper.release()}; ``` ### Column Comparison Utilities diff --git a/cpp/doxygen/main_page.md b/cpp/doxygen/main_page.md index 85b7888b066..308d10601af 100644 --- a/cpp/doxygen/main_page.md +++ b/cpp/doxygen/main_page.md @@ -1,5 +1,5 @@ -# libcudf +# libcudf -libcudf is a C++ GPU DataFrame library for loading, joining, aggregating, filtering, and otherwise +libcudf is a C++ GPU DataFrame library for loading, joining, aggregating, filtering, and otherwise manipulating data. A GPU DataFrame is a column-oriented tabular data structure, so libcudf provides two core data structures: cudf::column, and cudf::table. diff --git a/cpp/doxygen/unicode.md b/cpp/doxygen/unicode.md index d20a18ba34c..1ab09e110c1 100644 --- a/cpp/doxygen/unicode.md +++ b/cpp/doxygen/unicode.md @@ -2,7 +2,7 @@ The strings column currently supports only UTF-8 characters internally. For functions that require character testing (e.g. cudf::strings::all_characters_of_type()) or -case conversion (e.g. cudf::strings::capitalize(), etc) only the 16-bit [Unicode 13.0](http://www.unicode.org/versions/Unicode13.0.0) +case conversion (e.g. cudf::strings::capitalize(), etc) only the 16-bit [Unicode 13.0](http://www.unicode.org/versions/Unicode13.0.0) character code-points (0-65535) values are supported. 
Case conversion and character testing on characters above code-point 65535 are not supported. diff --git a/cpp/examples/README.md b/cpp/examples/README.md index 30b291d38f4..b2e8dd399d0 100644 --- a/cpp/examples/README.md +++ b/cpp/examples/README.md @@ -5,4 +5,5 @@ libcudf examples. Current examples: -- Basic: example that demonstrates basic use case with libcudf and building a custom application with libcudf. +- Basic: demonstrates a basic use case with libcudf and building a custom application with libcudf +- Strings: demonstrates using libcudf for accessing and creating strings columns and for building custom kernels for strings diff --git a/cpp/examples/basic/CMakeLists.txt b/cpp/examples/basic/CMakeLists.txt index f4bc205d4ba..7e7c6b191b5 100644 --- a/cpp/examples/basic/CMakeLists.txt +++ b/cpp/examples/basic/CMakeLists.txt @@ -1,6 +1,6 @@ # Copyright (c) 2020-2022, NVIDIA CORPORATION. -cmake_minimum_required(VERSION 3.18) +cmake_minimum_required(VERSION 3.23.1) project( basic_example @@ -16,7 +16,7 @@ file( ) include(${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake) -set(CUDF_TAG branch-22.10) +set(CUDF_TAG branch-22.12) CPMFindPackage( NAME cudf GIT_REPOSITORY https://github.com/rapidsai/cudf GIT_TAG ${CUDF_TAG} diff --git a/cpp/examples/basic/README.md b/cpp/examples/basic/README.md index 75f16e54033..471dcf6694f 100644 --- a/cpp/examples/basic/README.md +++ b/cpp/examples/basic/README.md @@ -15,7 +15,7 @@ cmake -S . 
-B build/ # Build cmake --build build/ --parallel $PARALLEL_LEVEL # Execute -build/libcudf_example +build/basic_example ``` If your machine does not come with a pre-built libcudf binary, expect the diff --git a/cpp/examples/basic/src/process_csv.cpp b/cpp/examples/basic/src/process_csv.cpp index 5a3914da453..edd14d9ee5f 100644 --- a/cpp/examples/basic/src/process_csv.cpp +++ b/cpp/examples/basic/src/process_csv.cpp @@ -19,6 +19,10 @@ #include #include +#include +#include +#include + #include #include #include @@ -72,6 +76,21 @@ std::unique_ptr average_closing_price(cudf::table_view stock_info_t int main(int argc, char** argv) { + // Construct a CUDA memory resource using RAPIDS Memory Manager (RMM) + // This is the default memory resource for libcudf for allocating device memory. + rmm::mr::cuda_memory_resource cuda_mr{}; + // Construct a memory pool using the CUDA memory resource + // Using a memory pool for device memory allocations is important for good performance in libcudf. + // The pool defaults to allocating half of the available GPU memory. + rmm::mr::pool_memory_resource mr{&cuda_mr}; + + // Set the pool resource to be used by default for all device memory allocations + // Note: It is the user's responsibility to ensure the `mr` object stays alive for the duration of + // it being set as the default + // Also, call this before the first libcudf API call to ensure all data is allocated by the same + // memory resource. 
+ rmm::mr::set_current_device_resource(&mr); + // Read data auto stock_table_with_metadata = read_csv("4stock_5day.csv"); diff --git a/cpp/examples/build.sh b/cpp/examples/build.sh index 079f7358872..7d389cd318d 100755 --- a/cpp/examples/build.sh +++ b/cpp/examples/build.sh @@ -17,8 +17,15 @@ LIB_BUILD_DIR=${LIB_BUILD_DIR:-$(readlink -f "${EXAMPLES_DIR}/../build")} # Basic example BASIC_EXAMPLE_DIR=${EXAMPLES_DIR}/basic BASIC_EXAMPLE_BUILD_DIR=${BASIC_EXAMPLE_DIR}/build - # Configure cmake -S ${BASIC_EXAMPLE_DIR} -B ${BASIC_EXAMPLE_BUILD_DIR} -Dcudf_ROOT="${LIB_BUILD_DIR}" # Build cmake --build ${BASIC_EXAMPLE_BUILD_DIR} -j${PARALLEL_LEVEL} + +# Strings example +STRINGS_EXAMPLE_DIR=${EXAMPLES_DIR}/strings +STRINGS_EXAMPLE_BUILD_DIR=${STRINGS_EXAMPLE_DIR}/build +# Configure +cmake -S ${STRINGS_EXAMPLE_DIR} -B ${STRINGS_EXAMPLE_BUILD_DIR} -Dcudf_ROOT="${LIB_BUILD_DIR}" +# Build +cmake --build ${STRINGS_EXAMPLE_BUILD_DIR} -j${PARALLEL_LEVEL} diff --git a/cpp/examples/strings/CMakeLists.txt b/cpp/examples/strings/CMakeLists.txt new file mode 100644 index 00000000000..1a16b2bc8fd --- /dev/null +++ b/cpp/examples/strings/CMakeLists.txt @@ -0,0 +1,49 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. 
+ +cmake_minimum_required(VERSION 3.23.1) + +project( + strings_examples + VERSION 0.0.1 + LANGUAGES CXX CUDA +) + +set(CPM_DOWNLOAD_VERSION v0.35.3) +file( + DOWNLOAD + https://github.com/cpm-cmake/CPM.cmake/releases/download/${CPM_DOWNLOAD_VERSION}/get_cpm.cmake + ${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake +) +include(${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake) + +set(CUDF_TAG branch-22.12) +CPMFindPackage( + NAME cudf GIT_REPOSITORY https://github.com/rapidsai/cudf + GIT_TAG ${CUDF_TAG} + GIT_SHALLOW + TRUE + SOURCE_SUBDIR + cpp +) + +list(APPEND CUDF_CUDA_FLAGS --expt-extended-lambda --expt-relaxed-constexpr) + +# +add_executable(libcudf_apis libcudf_apis.cpp) +target_compile_features(libcudf_apis PRIVATE cxx_std_17) +target_link_libraries(libcudf_apis PRIVATE cudf::cudf nvToolsExt) + +add_executable(custom_with_malloc custom_with_malloc.cu) +target_compile_features(custom_with_malloc PRIVATE cxx_std_17) +target_compile_options(custom_with_malloc PRIVATE "$<$:${CUDF_CUDA_FLAGS}>") +target_link_libraries(custom_with_malloc PRIVATE cudf::cudf nvToolsExt) + +add_executable(custom_prealloc custom_prealloc.cu) +target_compile_features(custom_prealloc PRIVATE cxx_std_17) +target_compile_options(custom_prealloc PRIVATE "$<$:${CUDF_CUDA_FLAGS}>") +target_link_libraries(custom_prealloc PRIVATE cudf::cudf nvToolsExt) + +add_executable(custom_optimized custom_optimized.cu) +target_compile_features(custom_optimized PRIVATE cxx_std_17) +target_compile_options(custom_optimized PRIVATE "$<$:${CUDF_CUDA_FLAGS}>") +target_link_libraries(custom_optimized PRIVATE cudf::cudf nvToolsExt) diff --git a/cpp/examples/strings/README.md b/cpp/examples/strings/README.md new file mode 100644 index 00000000000..241aa064bcc --- /dev/null +++ b/cpp/examples/strings/README.md @@ -0,0 +1,37 @@ +# libcudf C++ examples using strings columns + +This C++ example demonstrates using libcudf APIs to access and create +strings columns. 
+ +The example source code loads a csv file and produces a redacted strings +column from the names column using the values from the visibilities column. + +Four examples are included: +1. Using libcudf APIs to build the output +2. Using a simple custom kernel with dynamic memory +3. Using a custom kernel with pre-allocated device memory +4. Using a two-pass approach to improve performance + +These examples are described in more detail in +https://developer.nvidia.com/blog/mastering-string-transformations-in-rapids-libcudf/ + +## Compile and execute + +```bash +# Configure project +cmake -S . -B build/ +# Build +cmake --build build/ --parallel $PARALLEL_LEVEL +# Execute +build/libcudf_apis names.csv +--OR-- +build/custom_with_malloc names.csv +--OR-- +build/custom_prealloc names.csv +--OR-- +build/custom_optimized names.csv +``` + +If your machine does not come with a pre-built libcudf binary, expect the +first build to take some time, as it would build libcudf on the host machine. +It may be sped up by configuring the proper `PARALLEL_LEVEL` number. diff --git a/cpp/examples/strings/common.hpp b/cpp/examples/strings/common.hpp new file mode 100644 index 00000000000..dbd3c4dbd1b --- /dev/null +++ b/cpp/examples/strings/common.hpp @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include + +/** + * @brief Main example function returns redacted strings column. + * + * This function returns a redacted version of the input `names` column + * using the `visibilities` column as in the following example + * ``` + * names visibility --> redacted + * John Doe public D John + * Bobby Joe private X X + * ``` + * + * @param names First and last names separated with a single space + * @param visibilities String values `public` or `private` only + * @return Redacted strings column + */ +std::unique_ptr redact_strings(cudf::column_view const& names, + cudf::column_view const& visibilities); + +/** + * @brief Create CUDA memory resource + */ +auto make_cuda_mr() { return std::make_shared(); } + +/** + * @brief Create a pool device memory resource + */ +auto make_pool_mr() +{ + return rmm::mr::make_owning_wrapper(make_cuda_mr()); +} + +/** + * @brief Create memory resource for libcudf functions + */ +std::shared_ptr create_memory_resource(std::string const& name) +{ + if (name == "pool") { return make_pool_mr(); } + return make_cuda_mr(); +} + +/** + * @brief Main for strings examples + * + * Command line parameters: + * 1. CSV file name/path + * 2. Memory resource (optional): 'pool' or 'cuda' + * + * The stdout includes the number of rows in the input and the output size in bytes. + */ +int main(int argc, char const** argv) +{ + if (argc < 2) { + std::cout << "required parameter: csv-file-path\n"; + return 1; + } + + auto const mr_name = std::string{argc > 2 ? 
std::string(argv[2]) : std::string("cuda")}; + auto resource = create_memory_resource(mr_name); + rmm::mr::set_current_device_resource(resource.get()); + + auto const csv_file = std::string{argv[1]}; + auto const csv_result = [csv_file] { + cudf::io::csv_reader_options in_opts = + cudf::io::csv_reader_options::builder(cudf::io::source_info{csv_file}).header(-1); + return cudf::io::read_csv(in_opts).tbl; + }(); + auto const csv_table = csv_result->view(); + + std::cout << "table: " << csv_table.num_rows() << " rows " << csv_table.num_columns() + << " columns\n"; + + auto st = std::chrono::steady_clock::now(); + auto result = redact_strings(csv_table.column(0), csv_table.column(1)); + + std::chrono::duration elapsed = std::chrono::steady_clock::now() - st; + std::cout << "Wall time: " << elapsed.count() << " seconds\n"; + std::cout << "Output size " << result->view().child(1).size() << " bytes\n"; + + return 0; +} diff --git a/cpp/examples/strings/custom_optimized.cu b/cpp/examples/strings/custom_optimized.cu new file mode 100644 index 00000000000..bfe650daa93 --- /dev/null +++ b/cpp/examples/strings/custom_optimized.cu @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "common.hpp" + +#include +#include + +#include +#include +#include + +#include + +#include +#include + +/** + * @brief Computes the size of each output row + * + * This thread is called once per row in d_names. 
+ * + * @param d_names Column of names + * @param d_visibilities Column of visibilities + * @param d_sizes Output sizes for each row + */ +__global__ void sizes_kernel(cudf::column_device_view const d_names, + cudf::column_device_view const d_visibilities, + cudf::size_type* d_sizes) +{ + // The row index is resolved from the CUDA thread/block objects + auto index = threadIdx.x + blockIdx.x * blockDim.x; + // There may be more threads than actual rows + if (index >= d_names.size()) return; + + auto const visible = cudf::string_view("public", 6); + auto const redaction = cudf::string_view("X X", 3); + + auto const name = d_names.element(index); + auto const vis = d_visibilities.element(index); + + cudf::size_type result = redaction.size_bytes(); // init to redaction size + if (vis == visible) { + auto const space_idx = name.find(' '); + auto const first = name.substr(0, space_idx); + auto const last_initial = name.substr(space_idx + 1, 1); + + result = first.size_bytes() + last_initial.size_bytes() + 1; + } + + d_sizes[index] = result; +} + +/** + * @brief Builds the output for each row + * + * This thread is called once per row in d_names. 
+ * + * @param d_names Column of names + * @param d_visibilities Column of visibilities + * @param d_offsets Byte offset in `d_chars` for each row + * @param d_chars Output memory for all rows + */ +__global__ void redact_kernel(cudf::column_device_view const d_names, + cudf::column_device_view const d_visibilities, + cudf::size_type const* d_offsets, + char* d_chars) +{ + // The row index is resolved from the CUDA thread/block objects + auto index = threadIdx.x + blockIdx.x * blockDim.x; + // There may be more threads than actual rows + if (index >= d_names.size()) return; + + auto const visible = cudf::string_view("public", 6); + auto const redaction = cudf::string_view("X X", 3); + + // resolve output_ptr using the offsets vector + char* output_ptr = d_chars + d_offsets[index]; + + auto const name = d_names.element(index); + auto const vis = d_visibilities.element(index); + + if (vis == visible) { + auto const space_idx = name.find(' '); + auto const first = name.substr(0, space_idx); + auto const last_initial = name.substr(space_idx + 1, 1); + auto const output_size = first.size_bytes() + last_initial.size_bytes() + 1; + + // build output string + memcpy(output_ptr, last_initial.data(), last_initial.size_bytes()); + output_ptr += last_initial.size_bytes(); + *output_ptr++ = ' '; + memcpy(output_ptr, first.data(), first.size_bytes()); + } else { + memcpy(output_ptr, redaction.data(), redaction.size_bytes()); + } +} + +/** + * @brief Redacts each name per the corresponding visibility entry + * + * This implementation builds the strings column children (offsets and chars) + * directly into device memory for libcudf. 
+ * + * @param names Column of names + * @param visibilities Column of visibilities + * @return Redacted column of names + */ +std::unique_ptr redact_strings(cudf::column_view const& names, + cudf::column_view const& visibilities) +{ + // all device memory operations and kernel functions will run on this stream + auto stream = rmm::cuda_stream_default; + + auto const d_names = cudf::column_device_view::create(names, stream); + auto const d_visibilities = cudf::column_device_view::create(visibilities, stream); + + constexpr int block_size = 128; // this arbitrary size should be a power of 2 + int const blocks = (names.size() + block_size - 1) / block_size; + + nvtxRangePushA("redact_strings"); + + // create offsets vector + auto offsets = rmm::device_uvector(names.size() + 1, stream); + + // compute output sizes + sizes_kernel<<>>( + *d_names, *d_visibilities, offsets.data()); + + // convert sizes to offsets (in place) + thrust::exclusive_scan(rmm::exec_policy(stream), offsets.begin(), offsets.end(), offsets.begin()); + + // last element is the total output size + // (device-to-host copy of 1 integer -- includes synching the stream) + cudf::size_type output_size = offsets.back_element(stream); + + // create chars vector + auto chars = rmm::device_uvector(output_size, stream); + + // build chars output + redact_kernel<<>>( + *d_names, *d_visibilities, offsets.data(), chars.data()); + + // create column from offsets and chars vectors (no copy is performed) + auto result = cudf::make_strings_column(names.size(), std::move(offsets), std::move(chars)); + + // wait for all of the above to finish + stream.synchronize(); + + nvtxRangePop(); + return result; +} diff --git a/cpp/examples/strings/custom_prealloc.cu b/cpp/examples/strings/custom_prealloc.cu new file mode 100644 index 00000000000..c0bae03af5c --- /dev/null +++ b/cpp/examples/strings/custom_prealloc.cu @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. 
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "common.hpp"
+
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+
+/**
+ * @brief Builds the output for each row
+ *
+ * Each row in d_names is handled by one thread.
+ *
+ * @param d_names Column of names
+ * @param d_visibilities Column of visibilities
+ * @param redaction Redacted string replacement
+ * @param working_memory Output memory for all rows
+ * @param d_offsets Byte offset in `working_memory` for each row
+ * @param d_output Output array of string_view objects
+ */
+__global__ void redact_kernel(cudf::column_device_view const d_names,
+                              cudf::column_device_view const d_visibilities,
+                              cudf::string_view redaction,
+                              char* working_memory,
+                              cudf::offset_type const* d_offsets,
+                              cudf::string_view* d_output)
+{
+  // The row index is resolved from the CUDA thread/block objects
+  auto index = threadIdx.x + blockIdx.x * blockDim.x;
+  // The launch may contain more threads than rows; surplus threads exit here
+  if (index >= d_names.size()) return;
+
+  auto const visible = cudf::string_view("public", 6);
+
+  auto const name = d_names.element<cudf::string_view>(index);
+  auto const vis = d_visibilities.element<cudf::string_view>(index);
+  if (vis == visible) {
+    auto const space_idx = name.find(' ');
+    auto const first = name.substr(0, space_idx);
+    auto const last_initial = name.substr(space_idx + 1, 1);
+    auto const output_size = first.size_bytes() + last_initial.size_bytes() + 1;
+
+    char* output_ptr = working_memory + d_offsets[index];
+    d_output[index] = cudf::string_view{output_ptr, output_size};
+
+    // build output string: last initial, a space, then the first name
+    memcpy(output_ptr, last_initial.data(), last_initial.size_bytes());
+    output_ptr += last_initial.size_bytes();
+    *output_ptr++ = ' ';
+    memcpy(output_ptr, first.data(), first.size_bytes());
+  } else {
+    d_output[index] = cudf::string_view{redaction.data(), redaction.size_bytes()};
+  }
+}
+
+/**
+ * @brief Redacts each name per the corresponding visibility entry
+ *
+ * This implementation builds the individual strings into a fixed memory buffer
+ * and then calls a factory function to gather them into a strings column.
+ *
+ * @param names Column of names
+ * @param visibilities Column of visibilities
+ * @return Redacted column of names
+ */
+std::unique_ptr<cudf::column> redact_strings(cudf::column_view const& names,
+                                             cudf::column_view const& visibilities)
+{
+  // all device memory operations and kernel functions will run on this stream
+  auto stream = rmm::cuda_stream_default;
+
+  auto const d_names = cudf::column_device_view::create(names, stream);
+  auto const d_visibilities = cudf::column_device_view::create(visibilities, stream);
+  auto const d_redaction = cudf::string_scalar(std::string("X X"), true, stream);
+
+  constexpr int block_size = 128;  // this arbitrary size should be a power of 2
+  auto const blocks = (names.size() + block_size - 1) / block_size;
+
+  nvtxRangePushA("redact_strings");
+
+  auto const scv = cudf::strings_column_view(names);
+  auto const offsets = scv.offsets_begin();
+
+  // create working memory to hold the output of each string
+  auto working_memory = rmm::device_uvector<char>(scv.chars_size(), stream);
+  // create a vector for the output strings' pointers
+  auto str_ptrs = rmm::device_uvector<cudf::string_view>(names.size(), stream);
+
+  // build the output strings
+  redact_kernel<<<blocks, block_size, 0, stream.value()>>>(*d_names,
+                                                           *d_visibilities,
+                                                           d_redaction.value(),
+                                                           working_memory.data(),
+                                                           offsets,
+                                                           str_ptrs.data());
+
+  // create strings column from the string_view vector;
+  // this copies all the individual strings into a single output column
+  auto result = cudf::make_strings_column(str_ptrs, cudf::string_view{nullptr, 0}, stream);
+  // temporary memory cleanup cost here for str_ptrs and working_memory
+
+  // wait for all of the above to finish
+  stream.synchronize();
+
+  nvtxRangePop();
+  return result;
+}
diff --git a/cpp/examples/strings/custom_with_malloc.cu b/cpp/examples/strings/custom_with_malloc.cu
new file mode 100644
index 00000000000..f1d397ef007
--- /dev/null
+++ b/cpp/examples/strings/custom_with_malloc.cu
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "common.hpp"
+
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+
+/**
+ * @brief Reserve CUDA malloc heap size
+ *
+ * Call this function to change the CUDA malloc heap size limit.
+ * This value depends on the total size of all the malloc()
+ * calls needed for redact_kernel.
+ *
+ * @param heap_size Number of bytes to reserve
+ *                  Default is 1GB
+ */
+void set_malloc_heap_size(size_t heap_size = 1073741824)  // 1GB
+{
+  size_t max_malloc_heap_size = 0;
+  cudaDeviceGetLimit(&max_malloc_heap_size, cudaLimitMallocHeapSize);
+  if (max_malloc_heap_size < heap_size) {
+    max_malloc_heap_size = heap_size;
+    if (cudaDeviceSetLimit(cudaLimitMallocHeapSize, max_malloc_heap_size) != cudaSuccess) {
+      fprintf(stderr, "could not set malloc heap size to %zuMB\n", (heap_size / (1024 * 1024)));
+      throw std::runtime_error("could not set CUDA malloc heap size");
+    }
+  }
+}
+
+/**
+ * @brief Builds the output for each row
+ *
+ * Each row in d_names is handled by one thread.
+ *
+ * Note: This uses malloc() in a device kernel, which is convenient for
+ * prototyping but not very efficient. The returned pointer is not
+ * checked here, so the malloc heap must be large enough for all rows.
+ * All calls to malloc() must have a corresponding free() call.
+ * The separate free_kernel is launched for this purpose.
+ *
+ * @param d_names Column of names
+ * @param d_visibilities Column of visibilities
+ * @param redaction Redacted string replacement
+ * @param d_output Output array of string_view objects
+ */
+__global__ void redact_kernel(cudf::column_device_view const d_names,
+                              cudf::column_device_view const d_visibilities,
+                              cudf::string_view redaction,
+                              cudf::string_view* d_output)
+{
+  // The row index is resolved from the CUDA thread/block objects
+  auto index = threadIdx.x + blockIdx.x * blockDim.x;
+  // The launch may contain more threads than rows; surplus threads exit here
+  if (index >= d_names.size()) return;
+
+  auto const visible = cudf::string_view("public", 6);
+
+  auto const name = d_names.element<cudf::string_view>(index);
+  auto const vis = d_visibilities.element<cudf::string_view>(index);
+  if (vis == visible) {
+    auto const space_idx = name.find(' ');
+    auto const first = name.substr(0, space_idx);
+    auto const last_initial = name.substr(space_idx + 1, 1);
+    auto const output_size = first.size_bytes() + last_initial.size_bytes() + 1;
+
+    char* output_ptr = static_cast<char*>(malloc(output_size));
+    d_output[index] = cudf::string_view{output_ptr, output_size};
+
+    // build output string: last initial, a space, then the first name
+    memcpy(output_ptr, last_initial.data(), last_initial.size_bytes());
+    output_ptr += last_initial.size_bytes();
+    *output_ptr++ = ' ';
+    memcpy(output_ptr, first.data(), first.size_bytes());
+  } else {
+    d_output[index] = cudf::string_view{redaction.data(), redaction.size_bytes()};
+  }
+}
+
+/**
+ * @brief Frees the temporary individual strings created in redact_kernel
+ *
+ * Like malloc(), free() is not very efficient but must be called for
+ * each malloc() to return the memory to the CUDA malloc heap.
+ *
+ * @param redaction Redacted string replacement (not to be freed)
+ * @param d_output Output array of string_view objects to free
+ * @param count Number of entries in d_output
+ */
+__global__ void free_kernel(cudf::string_view redaction, cudf::string_view* d_output, int count)
+{
+  auto index = threadIdx.x + blockIdx.x * blockDim.x;
+  if (index >= count) return;
+
+  auto ptr = const_cast<char*>(d_output[index].data());
+  if (ptr != redaction.data()) { free(ptr); }
+}
+
+std::unique_ptr<cudf::column> redact_strings(cudf::column_view const& names,
+                                             cudf::column_view const& visibilities)
+{
+  // all device memory operations and kernel functions will run on this stream
+  auto stream = rmm::cuda_stream_default;
+
+  set_malloc_heap_size();  // to illustrate adjusting the malloc heap
+
+  auto const d_names = cudf::column_device_view::create(names, stream);
+  auto const d_visibilities = cudf::column_device_view::create(visibilities, stream);
+  auto const d_redaction = cudf::string_scalar(std::string("X X"), true, stream);
+
+  constexpr int block_size = 128;  // this arbitrary size should be a power of 2
+  auto const blocks = (names.size() + block_size - 1) / block_size;
+
+  nvtxRangePushA("redact_strings");
+
+  // create a vector for the output strings' pointers; owned so it is released if a call throws
+  auto str_ptrs = std::make_unique<rmm::device_uvector<cudf::string_view>>(names.size(), stream);
+
+  auto result = [&] {
+    // build the output strings
+    redact_kernel<<<blocks, block_size, 0, stream.value()>>>(
+      *d_names, *d_visibilities, d_redaction.value(), str_ptrs->data());
+    // create strings column from the string_view vector
+    // this copies all the individual strings into a single output column
+    return cudf::make_strings_column(*str_ptrs, cudf::string_view{nullptr, 0}, stream);
+  }();
+
+  // free the individual temporary memory pointers
+  free_kernel<<<blocks, block_size, 0, stream.value()>>>(
+    d_redaction.value(), str_ptrs->data(), names.size());
+  str_ptrs.reset();  // release the pointer vector before the final synchronize
+
+  // wait for all of the above to finish
+  stream.synchronize();
+
+  nvtxRangePop();
+  return result;
+}
diff --git a/cpp/examples/strings/libcudf_apis.cpp b/cpp/examples/strings/libcudf_apis.cpp
new file mode 100644
index 00000000000..009e92d8a0d
--- /dev/null
+++ b/cpp/examples/strings/libcudf_apis.cpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "common.hpp"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+/**
+ * @brief Redacts each name per the corresponding visibility entry
+ *
+ * This implementation uses libcudf APIs to create the output result.
+ *
+ * @param names Column of names
+ * @param visibilities Column of visibilities
+ * @return Redacted column of names
+ */
+std::unique_ptr<cudf::column> redact_strings(cudf::column_view const& names,
+                                             cudf::column_view const& visibilities)
+{
+  auto const visible = cudf::string_scalar(std::string("public"));
+  auto const redaction = cudf::string_scalar(std::string("X X"));
+
+  nvtxRangePushA("redact_strings");
+
+  auto const allowed = cudf::strings::contains(visibilities, visible);
+  auto const redacted = cudf::copy_if_else(names, redaction, allowed->view());
+  auto const first_last = cudf::strings::split(redacted->view());
+  auto const first = first_last->view().column(0);
+  auto const last = first_last->view().column(1);
+  auto const last_initial = cudf::strings::slice_strings(last, 0, 1);
+
+  auto const last_initial_first = cudf::table_view({last_initial->view(), first});
+
+  auto result = cudf::strings::concatenate(last_initial_first, std::string(" "));
+
+  cudaStreamSynchronize(0);  // wait for work queued on stream 0 to finish
+
+  nvtxRangePop();
+  return result;
+}
diff --git a/cpp/examples/strings/names.csv b/cpp/examples/strings/names.csv
new file mode 100644
index 00000000000..77dca3e02af
--- /dev/null
+++ b/cpp/examples/strings/names.csv
@@ -0,0 +1,20 @@
+John Doe,public
+Jane Doe,private
+Billy Joe,private
+James James,public
+Michael Frederick,public
+Christopher Cheryl,public
+Jessica Autumn,public
+Matthew Tyrone,public
+Ashley Martha,public
+Jennifer Omar,public
+Joshua Lydia,public
+Amanda Jerome,public
+Daniel Theodore,public
+David Abby,public
+James Neil,public
+Robert Shawna,private
+John Sierra,private
+Joseph Nina,private
+Andrew Tammy,private
+Ryan Nikki,public
diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp
index a26a0c7947b..d319041f8b1 100644
--- a/cpp/include/cudf/aggregation.hpp
+++ b/cpp/include/cudf/aggregation.hpp
@@ -515,9 +515,10 @@ std::unique_ptr make_collect_list_aggregation(
  * @return A COLLECT_SET aggregation object
  */
 template
-std::unique_ptr
make_collect_set_aggregation(null_policy null_handling = null_policy::INCLUDE, - null_equality nulls_equal = null_equality::EQUAL, - nan_equality nans_equal = nan_equality::UNEQUAL); +std::unique_ptr make_collect_set_aggregation( + null_policy null_handling = null_policy::INCLUDE, + null_equality nulls_equal = null_equality::EQUAL, + nan_equality nans_equal = nan_equality::ALL_EQUAL); /** * @brief Factory to create a LAG aggregation @@ -588,8 +589,9 @@ std::unique_ptr make_merge_lists_aggregation(); * @return A MERGE_SETS aggregation object */ template -std::unique_ptr make_merge_sets_aggregation(null_equality nulls_equal = null_equality::EQUAL, - nan_equality nans_equal = nan_equality::UNEQUAL); +std::unique_ptr make_merge_sets_aggregation( + null_equality nulls_equal = null_equality::EQUAL, + nan_equality nans_equal = nan_equality::ALL_EQUAL); /** * @brief Factory to create a MERGE_M2 aggregation diff --git a/cpp/include/cudf/binaryop.hpp b/cpp/include/cudf/binaryop.hpp index c82fd1b52a1..fabe0d86fc4 100644 --- a/cpp/include/cudf/binaryop.hpp +++ b/cpp/include/cudf/binaryop.hpp @@ -232,7 +232,7 @@ namespace binops { std::pair scalar_col_valid_mask_and( column_view const& col, scalar const& s, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); namespace compiled { @@ -255,7 +255,7 @@ void apply_sorting_struct_binary_op(mutable_column_view& out, bool is_lhs_scalar, bool is_rhs_scalar, binary_operator op, - rmm::cuda_stream_view stream = cudf::default_stream_value); + rmm::cuda_stream_view stream); } // namespace detail } // namespace compiled } // namespace binops diff --git a/cpp/include/cudf/column/column.hpp b/cpp/include/cudf/column/column.hpp index c5f6d339ae9..c02991051d9 100644 --- a/cpp/include/cudf/column/column.hpp +++ b/cpp/include/cudf/column/column.hpp @@ -64,7 +64,7 @@ class column { * @param mr Device 
memory resource to use for all device memory allocations */ column(column const& other, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -109,6 +109,8 @@ class column { * @note This constructor is primarily intended for use in column factory * functions. * + * @throws cudf::logic_error if `size < 0` + * * @param[in] dtype The element type * @param[in] size The number of elements in the column * @param[in] data The column's data @@ -133,6 +135,7 @@ class column { _null_count{null_count}, _children{std::move(children)} { + CUDF_EXPECTS(size >= 0, "Column size cannot be negative."); } /** @@ -146,7 +149,7 @@ class column { * @param mr Device memory resource to use for all device memory allocations */ explicit column(column_view view, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -208,7 +211,7 @@ class column { */ void set_null_mask(rmm::device_buffer const& new_null_mask, size_type new_null_count = UNKNOWN_NULL_COUNT, - rmm::cuda_stream_view stream = cudf::default_stream_value); + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Updates the count of null elements. diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index 4f9a09fb621..1361866d0aa 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -821,7 +821,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base { *`source_view` available in device memory. 
*/ static std::unique_ptr> create( - column_view source_view, rmm::cuda_stream_view stream = cudf::default_stream_value); + column_view source_view, rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Destroy the `column_device_view` object. @@ -974,7 +974,7 @@ class alignas(16) mutable_column_device_view : public detail::column_device_view static std::unique_ptr> create(mutable_column_view source_view, - rmm::cuda_stream_view stream = cudf::default_stream_value); + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Returns pointer to the base device memory allocation casted to diff --git a/cpp/include/cudf/column/column_factories.hpp b/cpp/include/cudf/column/column_factories.hpp index 5c691d866bd..725faeae626 100644 --- a/cpp/include/cudf/column/column_factories.hpp +++ b/cpp/include/cudf/column/column_factories.hpp @@ -62,6 +62,7 @@ std::unique_ptr make_empty_column(type_id id); * * @throws std::bad_alloc if device memory allocation fails * @throws cudf::logic_error if `type` is not a numeric type + * @throws cudf::logic_error if `size < 0` * * @param[in] type The desired numeric element type * @param[in] size The number of elements in the column @@ -75,7 +76,7 @@ std::unique_ptr make_numeric_column( data_type type, size_type size, mask_state state = mask_state::UNALLOCATED, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -101,7 +102,7 @@ std::unique_ptr make_numeric_column( size_type size, B&& null_mask, size_type null_count = cudf::UNKNOWN_NULL_COUNT, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(is_numeric(type), "Invalid, non-numeric type."); @@ -119,6 +120,7 @@ std::unique_ptr 
make_numeric_column( * @note The column's null count is determined by the requested null mask `state`. * * @throws cudf::logic_error if `type` is not a `fixed_point` type. + * @throws cudf::logic_error if `size < 0` * * @param[in] type The desired `fixed_point` element type. * @param[in] size The number of elements in the column. @@ -132,7 +134,7 @@ std::unique_ptr make_fixed_point_column( data_type type, size_type size, mask_state state = mask_state::UNALLOCATED, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -157,7 +159,7 @@ std::unique_ptr make_fixed_point_column( size_type size, B&& null_mask, size_type null_count = cudf::UNKNOWN_NULL_COUNT, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(is_fixed_point(type), "Invalid, non-fixed_point type."); @@ -176,6 +178,7 @@ std::unique_ptr make_fixed_point_column( * * @throws std::bad_alloc if device memory allocation fails * @throws cudf::logic_error if `type` is not a timestamp type + * @throws cudf::logic_error if `size < 0` * * @param[in] type The desired timestamp element type * @param[in] size The number of elements in the column @@ -189,7 +192,7 @@ std::unique_ptr make_timestamp_column( data_type type, size_type size, mask_state state = mask_state::UNALLOCATED, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -215,7 +218,7 @@ std::unique_ptr make_timestamp_column( size_type size, B&& null_mask, size_type null_count = cudf::UNKNOWN_NULL_COUNT, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = 
cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(is_timestamp(type), "Invalid, non-timestamp type."); @@ -234,6 +237,7 @@ std::unique_ptr make_timestamp_column( * * @throws std::bad_alloc if device memory allocation fails * @throws cudf::logic_error if `type` is not a duration type + * @throws cudf::logic_error if `size < 0` * * @param[in] type The desired duration element type * @param[in] size The number of elements in the column @@ -247,7 +251,7 @@ std::unique_ptr make_duration_column( data_type type, size_type size, mask_state state = mask_state::UNALLOCATED, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -273,7 +277,7 @@ std::unique_ptr make_duration_column( size_type size, B&& null_mask, size_type null_count = cudf::UNKNOWN_NULL_COUNT, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(is_duration(type), "Invalid, non-duration type."); @@ -292,6 +296,7 @@ std::unique_ptr make_duration_column( * * @throws std::bad_alloc if device memory allocation fails * @throws cudf::logic_error if `type` is not a fixed width type + * @throws cudf::logic_error if `size < 0` * * @param[in] type The desired fixed width type * @param[in] size The number of elements in the column @@ -305,7 +310,7 @@ std::unique_ptr make_fixed_width_column( data_type type, size_type size, mask_state state = mask_state::UNALLOCATED, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -331,7 +336,7 @@ std::unique_ptr make_fixed_width_column( size_type 
size, B&& null_mask, size_type null_count = cudf::UNKNOWN_NULL_COUNT, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(is_fixed_width(type), "Invalid, non-fixed-width type."); @@ -366,11 +371,11 @@ std::unique_ptr make_fixed_width_column( * @param[in] stream CUDA stream used for device memory operations and kernel launches. * @param[in] mr Device memory resource used for allocation of the column's `null_mask` and children * columns' device memory. - * @return Constructed strings column + * @return Constructed strings column */ std::unique_ptr make_strings_column( cudf::device_span const> strings, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -402,7 +407,7 @@ std::unique_ptr make_strings_column( std::unique_ptr make_strings_column( cudf::device_span string_views, const string_view null_placeholder, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -438,7 +443,7 @@ std::unique_ptr make_strings_column( cudf::device_span offsets, cudf::device_span null_mask = {}, size_type null_count = cudf::UNKNOWN_NULL_COUNT, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -547,7 +552,7 @@ std::unique_ptr make_lists_column( std::unique_ptr child_column, size_type null_count, rmm::device_buffer&& null_mask, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), 
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -578,7 +583,7 @@ std::unique_ptr make_structs_column( std::vector>&& child_columns, size_type null_count, rmm::device_buffer&& null_mask, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -598,7 +603,7 @@ std::unique_ptr make_structs_column( std::unique_ptr make_column_from_scalar( scalar const& s, size_type size, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -618,7 +623,7 @@ std::unique_ptr make_column_from_scalar( std::unique_ptr make_dictionary_from_scalar( scalar const& s, size_type size, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/copying.hpp b/cpp/include/cudf/copying.hpp index 1c3ca179d17..63c66335d2d 100644 --- a/cpp/include/cudf/copying.hpp +++ b/cpp/include/cudf/copying.hpp @@ -140,13 +140,12 @@ std::unique_ptr reverse( * If the same index appears more than once in the scatter map, the result is * undefined. * + * If any values in `scatter_map` are outside of the interval [-n, n) where `n` + * is the number of rows in the `target` table, behavior is undefined. + * * A negative value `i` in the `scatter_map` is interpreted as `i+n`, where `n` * is the number of rows in the `target` table. * - * @throws cudf::logic_error if `check_bounds == true` and an index exists in - * `scatter_map` outside the range `[-n, n)`, where `n` is the number of rows in - * the target table. If `check_bounds == false`, the behavior is undefined. 
- * * @param source The input columns containing values to be scattered into the * target columns * @param scatter_map A non-nullable column of integral indices that maps the @@ -154,8 +153,6 @@ std::unique_ptr reverse( * to or less than the number of elements in the source columns. * @param target The set of columns into which values from the source_table * are to be scattered - * @param check_bounds Optionally perform bounds checking on the values of - * `scatter_map` and throw an error if any of its values are out of bounds. * @param mr Device memory resource used to allocate the returned table's device memory * @return Result of scattering values from source to target */ @@ -163,7 +160,6 @@ std::unique_ptr scatter( table_view const& source, column_view const& scatter_map, table_view const& target, - bool check_bounds = false, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -184,9 +180,8 @@ std::unique_ptr
scatter( * If the same index appears more than once in the scatter map, the result is * undefined. * - * @throws cudf::logic_error if `check_bounds == true` and an index exists in - * `scatter_map` outside the range `[-n, n)`, where `n` is the number of rows in - * the target table. If `check_bounds == false`, the behavior is undefined. + * If any values in `scatter_map` are outside of the interval [-n, n) where `n` + * is the number of rows in the `target` table, behavior is undefined. * * @param source The input scalars containing values to be scattered into the * target columns @@ -194,8 +189,6 @@ std::unique_ptr
scatter( * the rows in the target table to be replaced by source. * @param target The set of columns into which values from the source_table * are to be scattered - * @param check_bounds Optionally perform bounds checking on the values of - * `scatter_map` and throw an error if any of its values are out of bounds. * @param mr Device memory resource used to allocate the returned table's device memory * @return Result of scattering values from source to target */ @@ -203,7 +196,6 @@ std::unique_ptr
scatter( std::vector> const& source, column_view const& indices, table_view const& target, - bool check_bounds = false, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -1020,12 +1012,19 @@ bool has_nonempty_nulls(column_view const& input); bool may_have_nonempty_nulls(column_view const& input); /** - * @brief Copies `input`, purging any non-empty null rows in the column or its descendants + * @brief Copy `input` into output while purging any non-empty null rows in the column or its + * descendants. * - * LIST columns may have non-empty null rows. - * For example: - * @code{.pseudo} + * If the input column is not of compound type (LIST/STRING/STRUCT/DICTIONARY), the output will be + * the same as input. + * + * The purge operation only applies directly to LIST and STRING columns, but it applies indirectly + * to STRUCT/DICTIONARY columns as well, since these columns may have child columns that + * are LIST or STRING. * + * Examples: + * + * @code{.pseudo} * auto const lists = lists_column_wrapper{ {0,1}, {2,3}, {4,5} }.release(); * cudf::detail::set_null_mask(lists->null_mask(), 1, 2, false); * @@ -1035,33 +1034,13 @@ bool may_have_nonempty_nulls(column_view const& input); * Offsets: [0, 2, 4, 6] * Child: [0, 1, 2, 3, 4, 5] * - * After purging the contents of the list's null rows, the column's contents - * will be: + * After purging the contents of the list's null rows, the column's contents will be: * Validity: 101 * Offsets: [0, 2, 2, 4] * Child: [0, 1, 4, 5] * @endcode * - * The purge operation only applies directly to LIST and STRING columns, but it - * applies indirectly to STRUCT columns as well, since LIST and STRUCT columns - * may have child/descendant columns that are LIST or STRING. 
- * - * @param input The column whose null rows are to be checked and purged - * @param mr Device memory resource used to allocate the returned column's device memory - * @return std::unique_ptr Column with equivalent contents to `input`, but with - * the contents of null rows purged - */ -std::unique_ptr purge_nonempty_nulls( - lists_column_view const& input, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Copies `input`, purging any non-empty null rows in the column or its descendants - * - * STRING columns may have non-empty null rows. - * For example: * @code{.pseudo} - * * auto const strings = strings_column_wrapper{ "AB", "CD", "EF" }.release(); * cudf::detail::set_null_mask(strings->null_mask(), 1, 2, false); * @@ -1078,26 +1057,7 @@ std::unique_ptr purge_nonempty_nulls( * Child: [A, B, E, F] * @endcode * - * The purge operation only applies directly to LIST and STRING columns, but it - * applies indirectly to STRUCT columns as well, since LIST and STRUCT columns - * may have child/descendant columns that are LIST or STRING. - * - * @param input The column whose null rows are to be checked and purged - * @param mr Device memory resource used to allocate the returned column's device memory - * @return std::unique_ptr Column with equivalent contents to `input`, but with - * the contents of null rows purged - */ -std::unique_ptr purge_nonempty_nulls( - strings_column_view const& input, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Copies `input`, purging any non-empty null rows in the column or its descendants - * - * STRUCTS columns may have null rows, with non-empty child rows. 
- * For example: * @code{.pseudo} - * * auto const lists = lists_column_wrapper{ {0,1}, {2,3}, {4,5} }; * auto const structs = structs_column_wrapper{ {lists}, null_at(1) }; * @@ -1114,17 +1074,12 @@ std::unique_ptr purge_nonempty_nulls( * Child: [0, 1, 4, 5] * @endcode * - * The purge operation only applies directly to LIST and STRING columns, but it - * applies indirectly to STRUCT columns as well, since LIST and STRUCT columns - * may have child/descendant columns that are LIST or STRING. - * * @param input The column whose null rows are to be checked and purged * @param mr Device memory resource used to allocate the returned column's device memory - * @return std::unique_ptr Column with equivalent contents to `input`, but with - * the contents of null rows purged + * @return A new column with equivalent contents to `input`, but with null rows purged */ std::unique_ptr purge_nonempty_nulls( - structs_column_view const& input, + column_view const& input, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ diff --git a/cpp/include/cudf/datetime.hpp b/cpp/include/cudf/datetime.hpp index a8955ffb17c..fb04336871f 100644 --- a/cpp/include/cudf/datetime.hpp +++ b/cpp/include/cudf/datetime.hpp @@ -36,7 +36,7 @@ namespace datetime { */ /** - * @brief Extracts year from any date time type and returns an int16_t + * @brief Extracts year from any datetime type and returns an int16_t * cudf::column. * * @param column cudf::column_view of the input datetime values @@ -50,7 +50,7 @@ std::unique_ptr extract_year( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Extracts month from any date time type and returns an int16_t + * @brief Extracts month from any datetime type and returns an int16_t * cudf::column. 
* * @param column cudf::column_view of the input datetime values @@ -64,7 +64,7 @@ std::unique_ptr extract_month( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Extracts day from any date time type and returns an int16_t + * @brief Extracts day from any datetime type and returns an int16_t * cudf::column. * * @param column cudf::column_view of the input datetime values @@ -78,7 +78,7 @@ std::unique_ptr extract_day( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Extracts day from any date time type and returns an int16_t + * @brief Extracts day from any datetime type and returns an int16_t * cudf::column. * * @param column cudf::column_view of the input datetime values @@ -92,7 +92,7 @@ std::unique_ptr extract_weekday( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Extracts hour from any date time type and returns an int16_t + * @brief Extracts hour from any datetime type and returns an int16_t * cudf::column. * * @param column cudf::column_view of the input datetime values @@ -106,7 +106,7 @@ std::unique_ptr extract_hour( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Extracts minute from any date time type and returns an int16_t + * @brief Extracts minute from any datetime type and returns an int16_t * cudf::column. * * @param column cudf::column_view of the input datetime values @@ -120,7 +120,7 @@ std::unique_ptr extract_minute( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Extracts second from any date time type and returns an int16_t + * @brief Extracts second from any datetime type and returns an int16_t * cudf::column. 
* * @param column cudf::column_view of the input datetime values @@ -133,6 +133,57 @@ std::unique_ptr extract_second( cudf::column_view const& column, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Extracts millisecond fraction from any datetime type and returns an int16_t + * cudf::column. + * + * A millisecond fraction is only the 3 digits that make up the millisecond portion of a duration. + * For example, the millisecond fraction of 1.234567890 seconds is 234. + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column + * + * @returns cudf::column of the extracted int16_t milliseconds + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + */ +std::unique_ptr extract_millisecond_fraction( + cudf::column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Extracts microsecond fraction from any datetime type and returns an int16_t + * cudf::column. + * + * A microsecond fraction is only the 3 digits that make up the microsecond portion of a duration. + * For example, the microsecond fraction of 1.234567890 seconds is 567. + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column + * + * @returns cudf::column of the extracted int16_t microseconds + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + */ +std::unique_ptr extract_microsecond_fraction( + cudf::column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Extracts nanosecond fraction from any datetime type and returns an int16_t + * cudf::column. + * + * A nanosecond fraction is only the 3 digits that make up the nanosecond portion of a duration. 
+ * For example, the nanosecond fraction of 1.234567890 seconds is 890. + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column + * + * @returns cudf::column of the extracted int16_t nanoseconds + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + */ +std::unique_ptr extract_nanosecond_fraction( + cudf::column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of group /** * @addtogroup datetime_compute @@ -141,7 +192,7 @@ std::unique_ptr extract_second( */ /** - * @brief Computes the last day of the month in date time type and returns a TIMESTAMP_DAYS + * @brief Computes the last day of the month in datetime type and returns a TIMESTAMP_DAYS * cudf::column. * * @param column cudf::column_view of the input datetime values @@ -169,7 +220,7 @@ std::unique_ptr day_of_year( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Adds or subtracts a number of months from the date time type and returns a + * @brief Adds or subtracts a number of months from the datetime type and returns a * timestamp column that is of the same type as the input `timestamps` column. * * For a given row, if the `timestamps` or the `months` column value is null, @@ -204,7 +255,7 @@ std::unique_ptr add_calendrical_months( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Adds or subtracts a number of months from the date time type and returns a + * @brief Adds or subtracts a number of months from the datetime type and returns a * timestamp column that is of the same type as the input `timestamps` column. * * For a given row, if the `timestamps` value is null, the output for that row is null. 
diff --git a/cpp/include/cudf/detail/binaryop.hpp b/cpp/include/cudf/detail/binaryop.hpp index 8deac88a645..ffd8be971ab 100644 --- a/cpp/include/cudf/detail/binaryop.hpp +++ b/cpp/include/cudf/detail/binaryop.hpp @@ -35,7 +35,7 @@ std::unique_ptr binary_operation( column_view const& rhs, std::string const& ptx, data_type output_type, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -49,7 +49,7 @@ std::unique_ptr binary_operation( column_view const& rhs, binary_operator op, data_type output_type, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -63,7 +63,7 @@ std::unique_ptr binary_operation( scalar const& rhs, binary_operator op, data_type output_type, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -77,7 +77,7 @@ std::unique_ptr binary_operation( column_view const& rhs, binary_operator op, data_type output_type, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/concatenate.hpp b/cpp/include/cudf/detail/concatenate.hpp index 08a37acead2..925029597a6 100644 --- a/cpp/include/cudf/detail/concatenate.hpp +++ b/cpp/include/cudf/detail/concatenate.hpp @@ -35,7 +35,7 @@ namespace detail { */ std::unique_ptr concatenate( host_span columns_to_concat, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -45,7 +45,7 @@ std::unique_ptr concatenate( */ std::unique_ptr
concatenate( host_span tables_to_concat, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/include/cudf/detail/copy.cuh b/cpp/include/cudf/detail/copy.cuh deleted file mode 100644 index 348f629a51a..00000000000 --- a/cpp/include/cudf/detail/copy.cuh +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include - -#include - -namespace cudf::detail { - -/** - * @copydoc cudf::purge_nonempty_nulls(structs_column_view const&, rmm::mr::device_memory_resource*) - * - * @tparam ColumnViewT View type (lists_column_view, strings_column_view, or strings_column_view) - * @param stream CUDA stream used for device memory operations and kernel launches - */ -template -std::unique_ptr purge_nonempty_nulls(ColumnViewT const& input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - // Implement via identity gather. 
- auto const input_column = input.parent(); - auto const gather_begin = thrust::counting_iterator(0); - auto const gather_end = gather_begin + input_column.size(); - - auto gathered_table = cudf::detail::gather(table_view{{input_column}}, - gather_begin, - gather_end, - out_of_bounds_policy::DONT_CHECK, - stream, - mr); - return std::move(gathered_table->release()[0]); -} - -} // namespace cudf::detail diff --git a/cpp/include/cudf/detail/copy.hpp b/cpp/include/cudf/detail/copy.hpp index a2cbe8c5238..8c3f315284d 100644 --- a/cpp/include/cudf/detail/copy.hpp +++ b/cpp/include/cudf/detail/copy.hpp @@ -77,7 +77,7 @@ ColumnView slice(ColumnView const& input, cudf::size_type begin, cudf::size_type */ std::vector slice(column_view const& input, host_span indices, - rmm::cuda_stream_view stream = cudf::default_stream_value); + rmm::cuda_stream_view stream); /** * @copydoc cudf::slice(column_view const&, std::initializer_list) * @@ -85,7 +85,7 @@ std::vector slice(column_view const& input, */ std::vector slice(column_view const& input, std::initializer_list indices, - rmm::cuda_stream_view stream = cudf::default_stream_value); + rmm::cuda_stream_view stream); /** * @copydoc cudf::slice(table_view const&, host_span) @@ -94,7 +94,7 @@ std::vector slice(column_view const& input, */ std::vector slice(table_view const& input, host_span indices, - rmm::cuda_stream_view stream = cudf::default_stream_value); + rmm::cuda_stream_view stream); /** * @copydoc cudf::slice(table_view const&, std::initializer_list) * @@ -102,7 +102,7 @@ std::vector slice(table_view const& input, */ std::vector slice(table_view const& input, std::initializer_list indices, - rmm::cuda_stream_view stream = cudf::default_stream_value); + rmm::cuda_stream_view stream); /** * @copydoc cudf::split(column_view const&, host_span) @@ -111,7 +111,7 @@ std::vector slice(table_view const& input, */ std::vector split(column_view const& input, host_span splits, - rmm::cuda_stream_view stream = 
cudf::default_stream_value); + rmm::cuda_stream_view stream); /** * @copydoc cudf::split(column_view const&, std::initializer_list) * @@ -119,7 +119,7 @@ std::vector split(column_view const& input, */ std::vector split(column_view const& input, std::initializer_list splits, - rmm::cuda_stream_view stream = cudf::default_stream_value); + rmm::cuda_stream_view stream); /** * @copydoc cudf::split(table_view const&, host_span) @@ -128,7 +128,7 @@ std::vector split(column_view const& input, */ std::vector split(table_view const& input, host_span splits, - rmm::cuda_stream_view stream = cudf::default_stream_value); + rmm::cuda_stream_view stream); /** * @copydoc cudf::split(table_view const&, std::initializer_list) * @@ -136,7 +136,7 @@ std::vector split(table_view const& input, */ std::vector split(table_view const& input, std::initializer_list splits, - rmm::cuda_stream_view stream = cudf::default_stream_value); + rmm::cuda_stream_view stream); /** * @copydoc cudf::shift(column_view const&,size_type,scalar const&, @@ -148,7 +148,7 @@ std::unique_ptr shift( column_view const& input, size_type offset, scalar const& fill_value, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -189,7 +189,7 @@ std::unique_ptr segmented_shift( device_span segment_offsets, size_type offset, scalar const& fill_value, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -200,7 +200,7 @@ std::unique_ptr segmented_shift( std::vector contiguous_split( cudf::table_view const& input, std::vector const& splits, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -209,7 +209,7 @@ std::vector contiguous_split( * @param stream 
Optional CUDA stream on which to execute kernels **/ packed_columns pack(cudf::table_view const& input, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -222,7 +222,7 @@ std::unique_ptr allocate_like( column_view const& input, size_type size, mask_allocation_policy mask_alloc = mask_allocation_policy::RETAIN, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -235,7 +235,7 @@ std::unique_ptr copy_if_else( column_view const& lhs, column_view const& rhs, column_view const& boolean_mask, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -248,7 +248,7 @@ std::unique_ptr copy_if_else( scalar const& lhs, column_view const& rhs, column_view const& boolean_mask, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -261,7 +261,7 @@ std::unique_ptr copy_if_else( column_view const& lhs, scalar const& rhs, column_view const& boolean_mask, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -274,7 +274,7 @@ std::unique_ptr copy_if_else( scalar const& lhs, scalar const& rhs, column_view const& boolean_mask, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -287,7 +287,7 @@ std::unique_ptr
sample( size_type const n, sample_with_replacement replacement = sample_with_replacement::FALSE, int64_t const seed = 0, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -298,7 +298,7 @@ std::unique_ptr
sample( std::unique_ptr get_element( column_view const& input, size_type index, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -306,16 +306,24 @@ std::unique_ptr get_element( * * @param stream CUDA stream used for device memory operations and kernel launches. */ -bool has_nonempty_nulls(column_view const& input, - rmm::cuda_stream_view stream = cudf::default_stream_value); +bool has_nonempty_nulls(column_view const& input, rmm::cuda_stream_view stream); /** * @copydoc cudf::may_have_nonempty_nulls * * @param stream CUDA stream used for device memory operations and kernel launches. */ -bool may_have_nonempty_nulls(column_view const& input, - rmm::cuda_stream_view stream = cudf::default_stream_value); +bool may_have_nonempty_nulls(column_view const& input, rmm::cuda_stream_view stream); + +/** + * @copydoc cudf::purge_nonempty_nulls + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr purge_nonempty_nulls( + column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/copy_if.cuh b/cpp/include/cudf/detail/copy_if.cuh index 99d9f5181c7..6eea72a1e0d 100644 --- a/cpp/include/cudf/detail/copy_if.cuh +++ b/cpp/include/cudf/detail/copy_if.cuh @@ -323,7 +323,7 @@ template std::unique_ptr
copy_if( table_view const& input, Filter filter, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_FUNC_RANGE(); diff --git a/cpp/include/cudf/detail/copy_range.cuh b/cpp/include/cudf/detail/copy_range.cuh index aaba729f2f2..22714e97dfa 100644 --- a/cpp/include/cudf/detail/copy_range.cuh +++ b/cpp/include/cudf/detail/copy_range.cuh @@ -135,7 +135,7 @@ void copy_range(SourceValueIterator source_value_begin, mutable_column_view& target, size_type target_begin, size_type target_end, - rmm::cuda_stream_view stream = cudf::default_stream_value) + rmm::cuda_stream_view stream) { CUDF_EXPECTS((target_begin <= target_end) && (target_begin >= 0) && (target_begin < target.size()) && (target_end <= target.size()), @@ -196,7 +196,7 @@ void copy_range_in_place(column_view const& source, size_type source_begin, size_type source_end, size_type target_begin, - rmm::cuda_stream_view stream = cudf::default_stream_value); + rmm::cuda_stream_view stream); /** * @copydoc cudf::copy_range @@ -209,7 +209,7 @@ std::unique_ptr copy_range( size_type source_begin, size_type source_end, size_type target_begin, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/include/cudf/detail/datetime.hpp b/cpp/include/cudf/detail/datetime.hpp index 7a2545fbdcf..c2e3c32b65f 100644 --- a/cpp/include/cudf/detail/datetime.hpp +++ b/cpp/include/cudf/detail/datetime.hpp @@ -31,7 +31,7 @@ namespace detail { */ std::unique_ptr extract_year( cudf::column_view const& column, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -41,7 +41,7 @@ std::unique_ptr extract_year( */ std::unique_ptr extract_month( 
cudf::column_view const& column, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -51,7 +51,7 @@ std::unique_ptr extract_month( */ std::unique_ptr extract_day( cudf::column_view const& column, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -61,7 +61,7 @@ std::unique_ptr extract_day( */ std::unique_ptr extract_weekday( cudf::column_view const& column, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -71,7 +71,7 @@ std::unique_ptr extract_weekday( */ std::unique_ptr extract_hour( cudf::column_view const& column, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -81,7 +81,7 @@ std::unique_ptr extract_hour( */ std::unique_ptr extract_minute( cudf::column_view const& column, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -91,7 +91,40 @@ std::unique_ptr extract_minute( */ std::unique_ptr extract_second( cudf::column_view const& column, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @copydoc cudf::extract_millisecond_fraction(cudf::column_view const&, + * rmm::mr::device_memory_resource *) + * + * @param stream CUDA stream used for device memory operations and kernel launches. 
+ */ +std::unique_ptr extract_millisecond_fraction( + cudf::column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @copydoc cudf::extract_microsecond_fraction(cudf::column_view const&, + * rmm::mr::device_memory_resource *) + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr extract_microsecond_fraction( + cudf::column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @copydoc cudf::extract_nanosecond_fraction(cudf::column_view const&, + * rmm::mr::device_memory_resource *) + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr extract_nanosecond_fraction( + cudf::column_view const& column, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -101,7 +134,7 @@ std::unique_ptr extract_second( */ std::unique_ptr last_day_of_month( cudf::column_view const& column, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -111,7 +144,7 @@ std::unique_ptr last_day_of_month( */ std::unique_ptr day_of_year( cudf::column_view const& column, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -123,7 +156,7 @@ std::unique_ptr day_of_year( std::unique_ptr add_calendrical_months( cudf::column_view const& timestamps, cudf::column_view const& months, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -135,7 +168,7 @@ std::unique_ptr 
add_calendrical_months( std::unique_ptr add_calendrical_months( cudf::column_view const& timestamps, cudf::scalar const& months, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -145,12 +178,12 @@ std::unique_ptr add_calendrical_months( */ std::unique_ptr is_leap_year( cudf::column_view const& column, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); std::unique_ptr extract_quarter( cudf::column_view const& column, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/include/cudf/detail/fill.hpp b/cpp/include/cudf/detail/fill.hpp index f236fa7fd43..e34acfff6b9 100644 --- a/cpp/include/cudf/detail/fill.hpp +++ b/cpp/include/cudf/detail/fill.hpp @@ -36,7 +36,7 @@ void fill_in_place(mutable_column_view& destination, size_type begin, size_type end, scalar const& value, - rmm::cuda_stream_view stream = cudf::default_stream_value); + rmm::cuda_stream_view stream); /** * @copydoc cudf::fill @@ -48,7 +48,7 @@ std::unique_ptr fill( size_type begin, size_type end, scalar const& value, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh index 8bb117c3dd0..57d834e6277 100644 --- a/cpp/include/cudf/detail/gather.cuh +++ b/cpp/include/cudf/detail/gather.cuh @@ -128,7 +128,7 @@ void gather_helper(InputItr source_itr, { using map_type = typename std::iterator_traits::value_type; if (nullify_out_of_bounds) { - thrust::gather_if(rmm::exec_policy(stream), + 
thrust::gather_if(rmm::exec_policy_nosync(stream), gather_map_begin, gather_map_end, gather_map_begin, @@ -137,7 +137,7 @@ void gather_helper(InputItr source_itr, bounds_checker{0, source_size}); } else { thrust::gather( - rmm::exec_policy(stream), gather_map_begin, gather_map_end, source_itr, target_itr); + rmm::exec_policy_nosync(stream), gather_map_begin, gather_map_end, source_itr, target_itr); } } @@ -652,7 +652,7 @@ std::unique_ptr
gather( MapIterator gather_map_begin, MapIterator gather_map_end, out_of_bounds_policy bounds_policy = out_of_bounds_policy::DONT_CHECK, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { std::vector> destination_columns; diff --git a/cpp/include/cudf/detail/gather.hpp b/cpp/include/cudf/detail/gather.hpp index fccad73591e..9d61a8de184 100644 --- a/cpp/include/cudf/detail/gather.hpp +++ b/cpp/include/cudf/detail/gather.hpp @@ -66,7 +66,7 @@ std::unique_ptr
gather( column_view const& gather_map, out_of_bounds_policy bounds_policy, negative_index_policy neg_indices, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -81,7 +81,7 @@ std::unique_ptr
gather( device_span const gather_map, out_of_bounds_policy bounds_policy, negative_index_policy neg_indices, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/include/cudf/detail/groupby/group_replace_nulls.hpp b/cpp/include/cudf/detail/groupby/group_replace_nulls.hpp index faf92c996d1..9e64048b7b4 100644 --- a/cpp/include/cudf/detail/groupby/group_replace_nulls.hpp +++ b/cpp/include/cudf/detail/groupby/group_replace_nulls.hpp @@ -40,7 +40,7 @@ std::unique_ptr group_replace_nulls( cudf::column_view const& grouped_value, device_span group_labels, cudf::replace_policy replace_policy, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/include/cudf/detail/groupby/sort_helper.hpp b/cpp/include/cudf/detail/groupby/sort_helper.hpp index 8705bbd29cb..a5060cd3d36 100644 --- a/cpp/include/cudf/detail/groupby/sort_helper.hpp +++ b/cpp/include/cudf/detail/groupby/sort_helper.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -218,8 +218,6 @@ struct sort_groupby_helper { column_ptr _unsorted_keys_labels; ///< Group labels for unsorted _keys column_ptr _keys_bitmask_column; ///< Column representing rows with one or more nulls values table_view _keys; ///< Input keys to sort by - table_view _unflattened_keys; ///< Input keys, unflattened and possibly nested - structs::detail::flattened_table _flattened; ///< Support datastructures for _keys index_vector_ptr _group_offsets; ///< Indices into sorted _keys indicating starting index of each groups diff --git a/cpp/include/cudf/detail/hashing.hpp b/cpp/include/cudf/detail/hashing.hpp index 66cbf24e607..b7469d80a8d 100644 --- a/cpp/include/cudf/detail/hashing.hpp +++ b/cpp/include/cudf/detail/hashing.hpp @@ -35,24 +35,24 @@ std::unique_ptr hash( table_view const& input, hash_id hash_function = hash_id::HASH_MURMUR3, uint32_t seed = cudf::DEFAULT_HASH_SEED, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); std::unique_ptr murmur_hash3_32( table_view const& input, uint32_t seed = cudf::DEFAULT_HASH_SEED, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); std::unique_ptr spark_murmur_hash3_32( table_view const& input, uint32_t seed = cudf::DEFAULT_HASH_SEED, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); std::unique_ptr md5_hash( table_view const& input, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /* Copyright 2005-2014 Daniel James. 
diff --git a/cpp/include/cudf/detail/interop.hpp b/cpp/include/cudf/detail/interop.hpp index 1417be358de..5a5bbe7f683 100644 --- a/cpp/include/cudf/detail/interop.hpp +++ b/cpp/include/cudf/detail/interop.hpp @@ -16,7 +16,13 @@ #pragma once +// We disable warning 611 because the `arrow::TableBatchReader` only partially +// override the `ReadNext` method of `arrow::RecordBatchReader::ReadNext` +// triggering warning 611-D from nvcc. +#pragma nv_diag_suppress 611 #include +#pragma nv_diag_default 611 + #include #include #include @@ -34,7 +40,7 @@ namespace detail { */ std::unique_ptr
from_dlpack( DLManagedTensor const* managed_tensor, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -44,7 +50,7 @@ std::unique_ptr
from_dlpack( */ DLManagedTensor* to_dlpack( table_view const& input, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); // Creating arrow as per given type_id and buffer arguments @@ -104,7 +110,7 @@ data_type arrow_to_cudf_type(arrow::DataType const& arrow_type); */ std::shared_ptr to_arrow(table_view input, std::vector const& metadata = {}, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), arrow::MemoryPool* ar_mr = arrow::default_memory_pool()); /** @@ -114,7 +120,7 @@ std::shared_ptr to_arrow(table_view input, */ std::unique_ptr
from_arrow( arrow::Table const& input_table, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/include/cudf/detail/is_element_valid.hpp b/cpp/include/cudf/detail/is_element_valid.hpp index f9f42bdae1d..72a85d42eb3 100644 --- a/cpp/include/cudf/detail/is_element_valid.hpp +++ b/cpp/include/cudf/detail/is_element_valid.hpp @@ -41,7 +41,7 @@ namespace detail { bool is_element_valid_sync(column_view const& col_view, size_type element_index, - rmm::cuda_stream_view stream = cudf::default_stream_value); + rmm::cuda_stream_view stream); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/join.hpp b/cpp/include/cudf/detail/join.hpp index a0385674f36..2dfe31091ac 100644 --- a/cpp/include/cudf/detail/join.hpp +++ b/cpp/include/cudf/detail/join.hpp @@ -91,7 +91,7 @@ struct hash_join { */ hash_join(cudf::table_view const& build, cudf::null_equality compare_nulls, - rmm::cuda_stream_view stream = cudf::default_stream_value); + rmm::cuda_stream_view stream); /** * @copydoc cudf::hash_join::inner_join diff --git a/cpp/include/cudf/detail/label_bins.hpp b/cpp/include/cudf/detail/label_bins.hpp index 846893b70f6..f556c81c371 100644 --- a/cpp/include/cudf/detail/label_bins.hpp +++ b/cpp/include/cudf/detail/label_bins.hpp @@ -51,7 +51,7 @@ std::unique_ptr label_bins( inclusive left_inclusive, column_view const& right_edges, inclusive right_inclusive, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/detail/null_mask.hpp b/cpp/include/cudf/detail/null_mask.hpp index 5d4f62e0feb..a0e04d7b215 100644 --- a/cpp/include/cudf/detail/null_mask.hpp +++ b/cpp/include/cudf/detail/null_mask.hpp @@ -34,7 +34,7 @@ namespace detail { 
rmm::device_buffer create_null_mask( size_type size, mask_state state, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -46,7 +46,7 @@ void set_null_mask(bitmask_type* bitmask, size_type begin_bit, size_type end_bit, bool valid, - rmm::cuda_stream_view stream = cudf::default_stream_value); + rmm::cuda_stream_view stream); /** * @brief Given a bitmask, counts the number of set (1) bits in the range diff --git a/cpp/include/cudf/detail/nvtx/nvtx3.hpp b/cpp/include/cudf/detail/nvtx/nvtx3.hpp index fb90ea668f5..c77714181ef 100644 --- a/cpp/include/cudf/detail/nvtx/nvtx3.hpp +++ b/cpp/include/cudf/detail/nvtx/nvtx3.hpp @@ -1907,7 +1907,7 @@ inline void mark(event_attributes const& attr) noexcept #define NVTX3_FUNC_RANGE_IN(D) \ static ::nvtx3::registered_message const nvtx3_func_name__{__func__}; \ static ::nvtx3::event_attributes const nvtx3_func_attr__{nvtx3_func_name__}; \ - ::nvtx3::domain_thread_range const nvtx3_range__{nvtx3_func_attr__}; + [[maybe_unused]] ::nvtx3::domain_thread_range const nvtx3_range__{nvtx3_func_attr__}; /** * @brief Convenience macro for generating a range in the global domain from the diff --git a/cpp/include/cudf/detail/quantiles.hpp b/cpp/include/cudf/detail/quantiles.hpp index 82b8ff35bfc..3764b03641e 100644 --- a/cpp/include/cudf/detail/quantiles.hpp +++ b/cpp/include/cudf/detail/quantiles.hpp @@ -16,7 +16,7 @@ #pragma once #include -#include +#include #include #include @@ -35,7 +35,7 @@ std::unique_ptr quantile( interpolation interp = interpolation::LINEAR, column_view const& ordered_indices = {}, bool exact = true, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -50,7 +50,7 @@ std::unique_ptr
quantiles( cudf::sorted is_input_sorted = sorted::NO, std::vector const& column_order = {}, std::vector const& null_precedence = {}, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -62,7 +62,7 @@ std::unique_ptr
quantiles( std::unique_ptr percentile_approx( tdigest::tdigest_column_view const& input, column_view const& percentiles, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/include/cudf/detail/reduction_functions.hpp b/cpp/include/cudf/detail/reduction_functions.hpp index 7877fe13951..a2de286f283 100644 --- a/cpp/include/cudf/detail/reduction_functions.hpp +++ b/cpp/include/cudf/detail/reduction_functions.hpp @@ -46,7 +46,7 @@ std::unique_ptr sum( column_view const& col, data_type const output_dtype, std::optional> init, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -67,7 +67,7 @@ std::unique_ptr min( column_view const& col, data_type const output_dtype, std::optional> init, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -88,7 +88,7 @@ std::unique_ptr max( column_view const& col, data_type const output_dtype, std::optional> init, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -110,7 +110,7 @@ std::unique_ptr any( column_view const& col, data_type const output_dtype, std::optional> init, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -132,7 +132,7 @@ std::unique_ptr all( column_view const& col, data_type const output_dtype, std::optional> init, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = 
rmm::mr::get_current_device_resource()); /** @@ -154,7 +154,7 @@ std::unique_ptr product( column_view const& col, data_type const output_dtype, std::optional> init, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -174,7 +174,7 @@ std::unique_ptr product( std::unique_ptr sum_of_squares( column_view const& col, data_type const output_dtype, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -194,7 +194,7 @@ std::unique_ptr sum_of_squares( std::unique_ptr mean( column_view const& col, data_type const output_dtype, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -217,7 +217,7 @@ std::unique_ptr variance( column_view const& col, data_type const output_dtype, cudf::size_type ddof, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -240,7 +240,7 @@ std::unique_ptr standard_deviation( column_view const& col, data_type const output_dtype, cudf::size_type ddof, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -271,7 +271,7 @@ std::unique_ptr nth_element( column_view const& col, size_type n, null_policy null_handling, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -286,7 +286,7 @@ std::unique_ptr nth_element( std::unique_ptr collect_list( column_view const& col, null_policy null_handling, - 
rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -299,7 +299,7 @@ std::unique_ptr collect_list( */ std::unique_ptr merge_lists( lists_column_view const& col, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -318,7 +318,7 @@ std::unique_ptr collect_set( null_policy null_handling, null_equality nulls_equal, nan_equality nans_equal, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -335,7 +335,7 @@ std::unique_ptr merge_sets( lists_column_view const& col, null_equality nulls_equal, nan_equality nans_equal, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -363,7 +363,7 @@ std::unique_ptr segmented_sum( data_type const output_dtype, null_policy null_handling, std::optional> init, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -391,7 +391,7 @@ std::unique_ptr segmented_product( data_type const output_dtype, null_policy null_handling, std::optional> init, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -418,7 +418,7 @@ std::unique_ptr segmented_min( data_type const output_dtype, null_policy null_handling, std::optional> init, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ 
-445,7 +445,7 @@ std::unique_ptr segmented_max( data_type const output_dtype, null_policy null_handling, std::optional> init, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -473,7 +473,7 @@ std::unique_ptr segmented_any( data_type const output_dtype, null_policy null_handling, std::optional> init, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -501,7 +501,7 @@ std::unique_ptr segmented_all( data_type const output_dtype, null_policy null_handling, std::optional> init, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace reduction diff --git a/cpp/include/cudf/detail/repeat.hpp b/cpp/include/cudf/detail/repeat.hpp index 9bd03878579..69d9705556f 100644 --- a/cpp/include/cudf/detail/repeat.hpp +++ b/cpp/include/cudf/detail/repeat.hpp @@ -36,7 +36,7 @@ std::unique_ptr
repeat( table_view const& input_table, column_view const& count, bool check_count, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -48,7 +48,7 @@ std::unique_ptr
repeat( std::unique_ptr
repeat( table_view const& input_table, size_type count, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/include/cudf/detail/replace.hpp b/cpp/include/cudf/detail/replace.hpp index 4c2c6e3b171..9721c6e9849 100644 --- a/cpp/include/cudf/detail/replace.hpp +++ b/cpp/include/cudf/detail/replace.hpp @@ -34,7 +34,7 @@ namespace detail { std::unique_ptr replace_nulls( column_view const& input, cudf::column_view const& replacement, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -46,7 +46,7 @@ std::unique_ptr replace_nulls( std::unique_ptr replace_nulls( column_view const& input, scalar const& replacement, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -58,7 +58,7 @@ std::unique_ptr replace_nulls( std::unique_ptr replace_nulls( column_view const& input, replace_policy const& replace_policy, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -70,7 +70,7 @@ std::unique_ptr replace_nulls( std::unique_ptr replace_nans( column_view const& input, column_view const& replacement, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -82,7 +82,7 @@ std::unique_ptr replace_nans( std::unique_ptr replace_nans( column_view const& input, scalar const& replacement, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = 
rmm::mr::get_current_device_resource()); /** @@ -94,7 +94,7 @@ std::unique_ptr find_and_replace_all( column_view const& input_col, column_view const& values_to_replace, column_view const& replacement_values, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -104,7 +104,7 @@ std::unique_ptr find_and_replace_all( */ std::unique_ptr normalize_nans_and_zeros( column_view const& input, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/include/cudf/detail/reshape.hpp b/cpp/include/cudf/detail/reshape.hpp index be10b2c582d..ccffcbc61df 100644 --- a/cpp/include/cudf/detail/reshape.hpp +++ b/cpp/include/cudf/detail/reshape.hpp @@ -33,7 +33,7 @@ namespace detail { std::unique_ptr
tile( table_view const& input, size_type count, - rmm::cuda_stream_view = cudf::default_stream_value, + rmm::cuda_stream_view, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -43,7 +43,7 @@ std::unique_ptr
tile( */ std::unique_ptr interleave_columns( table_view const& input, - rmm::cuda_stream_view = cudf::default_stream_value, + rmm::cuda_stream_view, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/include/cudf/detail/rolling.hpp b/cpp/include/cudf/detail/rolling.hpp index e0bdde98c0a..dcaece2bafc 100644 --- a/cpp/include/cudf/detail/rolling.hpp +++ b/cpp/include/cudf/detail/rolling.hpp @@ -45,7 +45,7 @@ std::unique_ptr rolling_window( column_view const& following_window, size_type min_periods, rolling_aggregation const& agg, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/include/cudf/detail/round.hpp b/cpp/include/cudf/detail/round.hpp index 49e6c528eb3..1e5612919f4 100644 --- a/cpp/include/cudf/detail/round.hpp +++ b/cpp/include/cudf/detail/round.hpp @@ -35,7 +35,7 @@ std::unique_ptr round( column_view const& input, int32_t decimal_places, rounding_method method, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/include/cudf/detail/scan.hpp b/cpp/include/cudf/detail/scan.hpp index 13dddd3b0c8..f4b2d51d0cb 100644 --- a/cpp/include/cudf/detail/scan.hpp +++ b/cpp/include/cudf/detail/scan.hpp @@ -38,7 +38,7 @@ namespace detail { * `agg` is not Min or Max. * * @param input The input column view for the scan. - * @param agg unique_ptr to aggregation operator applied by the scan. + * @param agg Aggregation operator applied by the scan * @param null_handling Exclude null values when computing the result if null_policy::EXCLUDE. * Include nulls if null_policy::INCLUDE. Any operation with a null results in * a null. 
@@ -47,7 +47,7 @@ namespace detail { * @returns Column with scan results. */ std::unique_ptr scan_exclusive(column_view const& input, - std::unique_ptr const& agg, + scan_aggregation const& agg, null_policy null_handling, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); @@ -64,7 +64,7 @@ std::unique_ptr scan_exclusive(column_view const& input, * but the `agg` is not Min or Max. * * @param input The input column view for the scan. - * @param agg unique_ptr to aggregation operator applied by the scan. + * @param agg Aggregation operator applied by the scan * @param null_handling Exclude null values when computing the result if null_policy::EXCLUDE. * Include nulls if null_policy::INCLUDE. Any operation with a null results in * a null. @@ -73,7 +73,7 @@ std::unique_ptr scan_exclusive(column_view const& input, * @returns Column with scan results. */ std::unique_ptr scan_inclusive(column_view const& input, - std::unique_ptr const& agg, + scan_aggregation const& agg, null_policy null_handling, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); diff --git a/cpp/include/cudf/detail/scatter.cuh b/cpp/include/cudf/detail/scatter.cuh index 09b16b11a73..c8b17e22df2 100644 --- a/cpp/include/cudf/detail/scatter.cuh +++ b/cpp/include/cudf/detail/scatter.cuh @@ -79,14 +79,14 @@ auto scatter_to_gather(MapIterator scatter_map_begin, // We'll use the `numeric_limits::lowest()` value for this since it should always be outside the // valid range. 
auto gather_map = rmm::device_uvector(gather_rows, stream); - thrust::uninitialized_fill(rmm::exec_policy(stream), + thrust::uninitialized_fill(rmm::exec_policy_nosync(stream), gather_map.begin(), gather_map.end(), std::numeric_limits::lowest()); // Convert scatter map to a gather map thrust::scatter( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(std::distance(scatter_map_begin, scatter_map_end)), scatter_map_begin, @@ -114,13 +114,13 @@ auto scatter_to_gather_complement(MapIterator scatter_map_begin, rmm::cuda_stream_view stream) { auto gather_map = rmm::device_uvector(gather_rows, stream); - thrust::sequence(rmm::exec_policy(stream), gather_map.begin(), gather_map.end(), 0); + thrust::sequence(rmm::exec_policy_nosync(stream), gather_map.begin(), gather_map.end(), 0); auto const out_of_bounds_begin = thrust::make_constant_iterator(std::numeric_limits::lowest()); auto const out_of_bounds_end = out_of_bounds_begin + thrust::distance(scatter_map_begin, scatter_map_end); - thrust::scatter(rmm::exec_policy(stream), + thrust::scatter(rmm::exec_policy_nosync(stream), out_of_bounds_begin, out_of_bounds_end, scatter_map_begin, @@ -152,7 +152,7 @@ struct column_scatterer_impl(), source.begin() + cudf::distance(scatter_map_begin, scatter_map_end), scatter_map_begin, @@ -218,14 +218,15 @@ struct column_scatterer_impl { // first combine keys so both dictionaries have the same set auto target_matched = dictionary::detail::add_keys(target, source.keys(), stream, mr); auto const target_view = dictionary_column_view(target_matched->view()); - auto source_matched = dictionary::detail::set_keys(source, target_view.keys(), stream); + auto source_matched = dictionary::detail::set_keys( + source, target_view.keys(), stream, rmm::mr::get_current_device_resource()); auto const source_view = dictionary_column_view(source_matched->view()); // now build the new indices by doing a scatter on just the matched 
indices auto source_itr = indexalator_factory::make_input_iterator(source_view.indices()); auto new_indices = std::make_unique(target_view.get_indices_annotated(), stream, mr); auto target_itr = indexalator_factory::make_output_iterator(new_indices->mutable_view()); - thrust::scatter(rmm::exec_policy(stream), + thrust::scatter(rmm::exec_policy_nosync(stream), source_itr, source_itr + std::distance(scatter_map_begin, scatter_map_end), scatter_map_begin, @@ -390,24 +391,13 @@ std::unique_ptr
scatter( MapIterator scatter_map_begin, MapIterator scatter_map_end, table_view const& target, - bool check_bounds = false, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_FUNC_RANGE(); using MapType = typename thrust::iterator_traits::value_type; - if (check_bounds) { - auto const begin = -target.num_rows(); - auto const end = target.num_rows(); - auto bounds = bounds_checker{begin, end}; - CUDF_EXPECTS( - std::distance(scatter_map_begin, scatter_map_end) == - thrust::count_if(rmm::exec_policy(stream), scatter_map_begin, scatter_map_end, bounds), - "Scatter map index out of bounds"); - } - CUDF_EXPECTS(std::distance(scatter_map_begin, scatter_map_end) <= source.num_rows(), "scatter map size should be <= to number of rows in source"); diff --git a/cpp/include/cudf/detail/scatter.hpp b/cpp/include/cudf/detail/scatter.hpp index 8c993368ff2..7c4b04537ea 100644 --- a/cpp/include/cudf/detail/scatter.hpp +++ b/cpp/include/cudf/detail/scatter.hpp @@ -45,10 +45,8 @@ namespace detail { * * If the same index appears more than once in the scatter map, the result is * undefined. - * - * @throws cudf::logic_error if `check_bounds == true` and an index exists in - * `scatter_map` outside the range `[-n, n)`, where `n` is the number of rows in - * the target table. If `check_bounds == false`, the behavior is undefined. + * If any values in `scatter_map` are outside of the interval [-n, n) where `n` + * is the number of rows in the `target` table, behavior is undefined. * * @param source The input columns containing values to be scattered into the * target columns @@ -57,8 +55,6 @@ namespace detail { * to or less than the number of elements in the source columns. 
* @param target The set of columns into which values from the source_table * are to be scattered - * @param check_bounds Optionally perform bounds checking on the values of - * `scatter_map` and throw an error if any of its values are out of bounds. * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned table's device memory * @return Result of scattering values from source to target @@ -67,8 +63,7 @@ std::unique_ptr
scatter( table_view const& source, column_view const& scatter_map, table_view const& target, - bool check_bounds = false, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -81,8 +76,7 @@ std::unique_ptr
scatter( table_view const& source, device_span const scatter_map, table_view const& target, - bool check_bounds = false, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -101,9 +95,8 @@ std::unique_ptr
scatter( * If the same index appears more than once in the scatter map, the result is * undefined. * - * @throws cudf::logic_error if `check_bounds == true` and an index exists in - * `scatter_map` outside the range `[-n, n)`, where `n` is the number of rows in - * the target table. If `check_bounds == false`, the behavior is undefined. + * If any values in `indices` are outside of the interval [-n, n) where `n` + * is the number of rows in the `target` table, behavior is undefined. * * @param source The input scalars containing values to be scattered into the * target columns @@ -111,8 +104,6 @@ std::unique_ptr
scatter( * the rows in the target table to be replaced by source. * @param target The set of columns into which values from the source_table * are to be scattered - * @param check_bounds Optionally perform bounds checking on the values of - * `scatter_map` and throw an error if any of its values are out of bounds. * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned table's device memory * @return Result of scattering values from source to target @@ -121,8 +112,7 @@ std::unique_ptr
scatter( std::vector> const& source, column_view const& indices, table_view const& target, - bool check_bounds = false, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -137,7 +127,7 @@ std::unique_ptr
boolean_mask_scatter( table_view const& source, table_view const& target, column_view const& boolean_mask, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -153,7 +143,7 @@ std::unique_ptr
boolean_mask_scatter( std::vector> const& source, table_view const& target, column_view const& boolean_mask, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/include/cudf/detail/sequence.hpp b/cpp/include/cudf/detail/sequence.hpp index 8b3ef46d0ad..4a9bf5c74e1 100644 --- a/cpp/include/cudf/detail/sequence.hpp +++ b/cpp/include/cudf/detail/sequence.hpp @@ -36,7 +36,7 @@ std::unique_ptr sequence( size_type size, scalar const& init, scalar const& step, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -49,7 +49,7 @@ std::unique_ptr sequence( std::unique_ptr sequence( size_type size, scalar const& init, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -64,7 +64,7 @@ std::unique_ptr calendrical_month_sequence( size_type size, scalar const& init, size_type months, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/include/cudf/detail/sorting.hpp b/cpp/include/cudf/detail/sorting.hpp index a68407d9194..66b3f5071c6 100644 --- a/cpp/include/cudf/detail/sorting.hpp +++ b/cpp/include/cudf/detail/sorting.hpp @@ -36,7 +36,7 @@ std::unique_ptr sorted_order( table_view const& input, std::vector const& column_order = {}, std::vector const& null_precedence = {}, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -48,7 +48,7 @@ std::unique_ptr 
stable_sorted_order( table_view const& input, std::vector const& column_order = {}, std::vector const& null_precedence = {}, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -61,7 +61,7 @@ std::unique_ptr
sort_by_key( table_view const& keys, std::vector const& column_order = {}, std::vector const& null_precedence = {}, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -74,7 +74,7 @@ std::unique_ptr
stable_sort_by_key( table_view const& keys, std::vector const& column_order = {}, std::vector const& null_precedence = {}, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -87,7 +87,7 @@ std::unique_ptr segmented_sorted_order( column_view const& segment_offsets, std::vector const& column_order = {}, std::vector const& null_precedence = {}, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -100,7 +100,7 @@ std::unique_ptr stable_segmented_sorted_order( column_view const& segment_offsets, std::vector const& column_order = {}, std::vector const& null_precedence = {}, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -114,7 +114,7 @@ std::unique_ptr
segmented_sort_by_key( column_view const& segment_offsets, std::vector const& column_order = {}, std::vector const& null_precedence = {}, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -128,7 +128,7 @@ std::unique_ptr
stable_segmented_sort_by_key( column_view const& segment_offsets, std::vector const& column_order = {}, std::vector const& null_precedence = {}, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -140,7 +140,7 @@ std::unique_ptr
sort( table_view const& values, std::vector const& column_order = {}, std::vector const& null_precedence = {}, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/include/cudf/detail/stream_compaction.hpp b/cpp/include/cudf/detail/stream_compaction.hpp index 0db929c523c..e725718ed22 100644 --- a/cpp/include/cudf/detail/stream_compaction.hpp +++ b/cpp/include/cudf/detail/stream_compaction.hpp @@ -36,7 +36,7 @@ std::unique_ptr
drop_nulls( table_view const& input, std::vector const& keys, cudf::size_type keep_threshold, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -49,7 +49,7 @@ std::unique_ptr
drop_nans( table_view const& input, std::vector const& keys, cudf::size_type keep_threshold, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -60,7 +60,7 @@ std::unique_ptr
drop_nans( std::unique_ptr
apply_boolean_mask( table_view const& input, column_view const& boolean_mask, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -73,7 +73,7 @@ std::unique_ptr
unique( std::vector const& keys, duplicate_keep_option keep, null_equality nulls_equal = null_equality::EQUAL, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -87,7 +87,7 @@ std::unique_ptr
distinct( duplicate_keep_option keep = duplicate_keep_option::KEEP_ANY, null_equality nulls_equal = null_equality::EQUAL, nan_equality nans_equal = nan_equality::ALL_EQUAL, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -116,7 +116,7 @@ std::unique_ptr
stable_distinct( duplicate_keep_option keep = duplicate_keep_option::KEEP_ANY, null_equality nulls_equal = null_equality::EQUAL, nan_equality nans_equal = nan_equality::ALL_EQUAL, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -138,7 +138,7 @@ rmm::device_uvector get_distinct_indices( duplicate_keep_option keep = duplicate_keep_option::KEEP_ANY, null_equality nulls_equal = null_equality::EQUAL, nan_equality nans_equal = nan_equality::ALL_EQUAL, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -149,7 +149,7 @@ rmm::device_uvector get_distinct_indices( cudf::size_type unique_count(column_view const& input, null_policy null_handling, nan_policy nan_handling, - rmm::cuda_stream_view stream = cudf::default_stream_value); + rmm::cuda_stream_view stream); /** * @copydoc cudf::unique_count(table_view const&, null_equality) @@ -158,7 +158,7 @@ cudf::size_type unique_count(column_view const& input, */ cudf::size_type unique_count(table_view const& input, null_equality nulls_equal = null_equality::EQUAL, - rmm::cuda_stream_view stream = cudf::default_stream_value); + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @copydoc cudf::distinct_count(column_view const&, null_policy, nan_policy) @@ -168,7 +168,7 @@ cudf::size_type unique_count(table_view const& input, cudf::size_type distinct_count(column_view const& input, null_policy null_handling, nan_policy nan_handling, - rmm::cuda_stream_view stream = cudf::default_stream_value); + rmm::cuda_stream_view stream); /** * @copydoc cudf::distinct_count(table_view const&, null_equality) @@ -177,7 +177,7 @@ cudf::size_type distinct_count(column_view const& input, */ cudf::size_type 
distinct_count(table_view const& input, null_equality nulls_equal = null_equality::EQUAL, - rmm::cuda_stream_view stream = cudf::default_stream_value); + rmm::cuda_stream_view stream = cudf::get_default_stream()); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/structs/utilities.hpp b/cpp/include/cudf/detail/structs/utilities.hpp index 1a4b8f02dd3..115c8ccd90e 100644 --- a/cpp/include/cudf/detail/structs/utilities.hpp +++ b/cpp/include/cudf/detail/structs/utilities.hpp @@ -189,7 +189,7 @@ void superimpose_parent_nulls(bitmask_type const* parent_null_mask, */ std::tuple> superimpose_parent_nulls( column_view const& parent, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -215,7 +215,7 @@ std::tuple> superimpose_paren */ std::tuple> superimpose_parent_nulls( table_view const& table, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** diff --git a/cpp/include/cudf/detail/tdigest/tdigest.hpp b/cpp/include/cudf/detail/tdigest/tdigest.hpp index 41e734ffe83..9df3f9daf3f 100644 --- a/cpp/include/cudf/detail/tdigest/tdigest.hpp +++ b/cpp/include/cudf/detail/tdigest/tdigest.hpp @@ -23,9 +23,8 @@ #include namespace cudf { -namespace detail { - namespace tdigest { +namespace detail { /** * @brief Generate a tdigest column from a grouped set of numeric input values. @@ -139,7 +138,7 @@ std::unique_ptr make_tdigest_column( std::unique_ptr&& tdigest_offsets, std::unique_ptr&& min_values, std::unique_ptr&& max_values, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -153,7 +152,7 @@ std::unique_ptr make_tdigest_column( * @returns An empty tdigest column. 
*/ std::unique_ptr make_empty_tdigest_column( - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -167,7 +166,7 @@ std::unique_ptr make_empty_tdigest_column( * @returns An empty tdigest scalar. */ std::unique_ptr make_empty_tdigest_scalar( - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -328,6 +327,6 @@ std::unique_ptr reduce_merge_tdigest(column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); -} // namespace tdigest } // namespace detail +} // namespace tdigest } // namespace cudf diff --git a/cpp/include/cudf/detail/transform.hpp b/cpp/include/cudf/detail/transform.hpp index 929c4700873..8e19ebb8da7 100644 --- a/cpp/include/cudf/detail/transform.hpp +++ b/cpp/include/cudf/detail/transform.hpp @@ -34,7 +34,7 @@ std::unique_ptr transform( std::string const& unary_udf, data_type output_type, bool is_ptx, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -45,7 +45,7 @@ std::unique_ptr transform( std::unique_ptr compute_column( table_view const table, ast::operation const& expr, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -55,7 +55,7 @@ std::unique_ptr compute_column( */ std::pair, size_type> nans_to_nulls( column_view const& input, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -65,7 +65,7 @@ std::pair, size_type> nans_to_nulls( */ std::pair, cudf::size_type> bools_to_mask( column_view 
const& input, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -75,7 +75,7 @@ std::pair, cudf::size_type> bools_to_mask( */ std::pair, std::unique_ptr> encode( cudf::table_view const& input, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -86,7 +86,7 @@ std::pair, std::unique_ptr> encode( std::pair, table_view> one_hot_encode( column_view const& input, column_view const& categories, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -98,7 +98,7 @@ std::unique_ptr mask_to_bools( bitmask_type const* null_mask, size_type begin_bit, size_type end_bit, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -108,7 +108,7 @@ std::unique_ptr mask_to_bools( */ std::unique_ptr row_bit_count( table_view const& t, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/include/cudf/detail/transpose.hpp b/cpp/include/cudf/detail/transpose.hpp index 367421a5ee1..0470d625edc 100644 --- a/cpp/include/cudf/detail/transpose.hpp +++ b/cpp/include/cudf/detail/transpose.hpp @@ -30,7 +30,7 @@ namespace detail { */ std::pair, table_view> transpose( table_view const& input, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/include/cudf/detail/unary.hpp 
b/cpp/include/cudf/detail/unary.hpp index 5d1c29aba78..0e1c047d9b0 100644 --- a/cpp/include/cudf/detail/unary.hpp +++ b/cpp/include/cudf/detail/unary.hpp @@ -50,7 +50,7 @@ std::unique_ptr true_if( InputIterator end, size_type size, Predicate p, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto output = @@ -71,7 +71,7 @@ std::unique_ptr true_if( std::unique_ptr unary_operation( cudf::column_view const& input, cudf::unary_operator op, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -82,7 +82,7 @@ std::unique_ptr unary_operation( std::unique_ptr cast( column_view const& input, data_type type, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -92,7 +92,7 @@ std::unique_ptr cast( */ std::unique_ptr is_nan( cudf::column_view const& input, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -102,7 +102,7 @@ std::unique_ptr is_nan( */ std::unique_ptr is_not_nan( cudf::column_view const& input, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/include/cudf/detail/utilities/algorithm.cuh b/cpp/include/cudf/detail/utilities/algorithm.cuh index f05a09a8df1..4e83e219072 100644 --- a/cpp/include/cudf/detail/utilities/algorithm.cuh +++ b/cpp/include/cudf/detail/utilities/algorithm.cuh @@ -25,4 +25,4 @@ __device__ __forceinline__ T accumulate(Iterator first, Iterator last, T init, B } return init; } -} // namespace 
cudf::detail \ No newline at end of file +} // namespace cudf::detail diff --git a/cpp/include/cudf/detail/utilities/cuda.cuh b/cpp/include/cudf/detail/utilities/cuda.cuh index d57078f892f..cdbc26701d1 100644 --- a/cpp/include/cudf/detail/utilities/cuda.cuh +++ b/cpp/include/cudf/detail/utilities/cuda.cuh @@ -170,8 +170,7 @@ __global__ void single_thread_kernel(F f) * @param stream CUDA stream used for the kernel launch */ template -void device_single_thread(Functor functor, - rmm::cuda_stream_view stream = cudf::default_stream_value) +void device_single_thread(Functor functor, rmm::cuda_stream_view stream) { single_thread_kernel<<<1, 1, 0, stream.value()>>>(functor); } diff --git a/cpp/tests/strings/utilities.h b/cpp/include/cudf/detail/utilities/default_stream.hpp similarity index 64% rename from cpp/tests/strings/utilities.h rename to cpp/include/cudf/detail/utilities/default_stream.hpp index d6f0e9c4f1f..fa438f142b7 100644 --- a/cpp/tests/strings/utilities.h +++ b/cpp/include/cudf/detail/utilities/default_stream.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,18 +13,24 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + #pragma once -#include +#include +#include namespace cudf { -namespace test { + +namespace detail { + /** - * @brief Utility will verify the given strings column is empty. + * @brief Default stream for cudf * - * @param strings_column Column of strings to check + * Use this value to ensure the correct stream is used when compiled with per + * thread default stream. 
*/ -void expect_strings_empty(cudf::column_view strings_column); +extern rmm::cuda_stream_view const default_stream_value; + +} // namespace detail -} // namespace test } // namespace cudf diff --git a/cpp/include/cudf/detail/utilities/linked_column.hpp b/cpp/include/cudf/detail/utilities/linked_column.hpp index 05b46cc8e13..059e32730e5 100644 --- a/cpp/include/cudf/detail/utilities/linked_column.hpp +++ b/cpp/include/cudf/detail/utilities/linked_column.hpp @@ -77,4 +77,4 @@ inline LinkedColVector table_to_linked_columns(table_view const& table) return LinkedColVector(linked_it, linked_it + table.num_columns()); } -} // namespace cudf::detail \ No newline at end of file +} // namespace cudf::detail diff --git a/cpp/include/cudf/detail/utilities/pinned_allocator.hpp b/cpp/include/cudf/detail/utilities/pinned_allocator.hpp new file mode 100644 index 00000000000..84abf7c014f --- /dev/null +++ b/cpp/include/cudf/detail/utilities/pinned_allocator.hpp @@ -0,0 +1,202 @@ +/* + * Copyright 2008-2022 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include // for bad_alloc + +#include + +namespace cudf::detail { + +/*! \p pinned_allocator is a CUDA-specific host memory allocator + * that employs \c cudaMallocHost for allocation. + * + * This implementation is ported from the experimental/pinned_allocator + * that Thrust used to provide. 
+ * + * \see https://en.cppreference.com/w/cpp/memory/allocator + */ +template +class pinned_allocator; + +/*! \p pinned_allocator is a CUDA-specific host memory allocator + * that employs \c cudaMallocHost for allocation. + * + * This implementation is ported from the experimental/pinned_allocator + * that Thrust used to provide. + * + * \see https://en.cppreference.com/w/cpp/memory/allocator + */ +template <> +class pinned_allocator { + public: + using value_type = void; ///< The type of the elements in the allocator + using pointer = void*; ///< The type returned by address() / allocate() + using const_pointer = const void*; ///< The type returned by address() + using size_type = std::size_t; ///< The type used for the size of the allocation + using difference_type = std::ptrdiff_t; ///< The type of the distance between two pointers + + /** + * @brief converts a `pinned_allocator` to `pinned_allocator` + */ + template + struct rebind { + using other = pinned_allocator; ///< The rebound type + }; +}; + +/*! \p pinned_allocator is a CUDA-specific host memory allocator + * that employs \c cudaMallocHost for allocation. + * + * This implementation is ported from the experimental/pinned_allocator + * that Thrust used to provide. 
+ * + * \see https://en.cppreference.com/w/cpp/memory/allocator + */ +template +class pinned_allocator { + public: + using value_type = T; ///< The type of the elements in the allocator + using pointer = T*; ///< The type returned by address() / allocate() + using const_pointer = const T*; ///< The type returned by address() + using reference = T&; ///< The parameter type for address() + using const_reference = const T&; ///< The parameter type for address() + using size_type = std::size_t; ///< The type used for the size of the allocation + using difference_type = std::ptrdiff_t; ///< The type of the distance between two pointers + + /** + * @brief converts a `pinned_allocator` to `pinned_allocator` + */ + template + struct rebind { + using other = pinned_allocator; ///< The rebound type + }; + + /** + * @brief pinned_allocator's null constructor does nothing. + */ + __host__ __device__ inline pinned_allocator() {} + + /** + * @brief pinned_allocator's null destructor does nothing. + */ + __host__ __device__ inline ~pinned_allocator() {} + + /** + * @brief pinned_allocator's copy constructor does nothing. + */ + __host__ __device__ inline pinned_allocator(pinned_allocator const&) {} + + /** + * @brief pinned_allocator's copy constructor does nothing. + * + * This version of pinned_allocator's copy constructor + * is templated on the \c value_type of the pinned_allocator + * to copy from. It is provided merely for convenience; it + * does nothing. + */ + template + __host__ __device__ inline pinned_allocator(pinned_allocator const&) + { + } + + /** + * @brief This method returns the address of a \c reference of + * interest. + * + * @param r The \c reference of interest. + * @return \c r's address. + */ + __host__ __device__ inline pointer address(reference r) { return &r; } + + /** + * @brief This method returns the address of a \c const_reference + * of interest. + * + * @param r The \c const_reference of interest. + * @return \c r's address. 
+ */ + __host__ __device__ inline const_pointer address(const_reference r) { return &r; } + + /** + * @brief This method allocates storage for objects in pinned host + * memory. + * + * @param cnt The number of objects to allocate. + * @return a \c pointer to the newly allocated objects. + * @note The second parameter to this function is meant as a + * hint pointer to a nearby memory location, but is + * not used by this allocator. + * @note This method does not invoke \p value_type's constructor. + * It is the responsibility of the caller to initialize the + * objects at the returned \c pointer. + */ + __host__ inline pointer allocate(size_type cnt, const_pointer /*hint*/ = 0) + { + if (cnt > this->max_size()) { throw std::bad_alloc(); } // end if + + pointer result(0); + CUDF_CUDA_TRY(cudaMallocHost(reinterpret_cast(&result), cnt * sizeof(value_type))); + return result; + } + + /** + * @brief This method deallocates pinned host memory previously allocated + * with this \c pinned_allocator. + * + * @param p A \c pointer to the previously allocated memory. + * @note The second parameter is the number of objects previously allocated + * but is ignored by this allocator. + * @note This method does not invoke \p value_type's destructor. + * It is the responsibility of the caller to destroy + * the objects stored at \p p. + */ + __host__ inline void deallocate(pointer p, size_type /*cnt*/) { CUDF_CUDA_TRY(cudaFreeHost(p)); } + + /** + * @brief This method returns the maximum size of the \c cnt parameter + * accepted by the \p allocate() method. + * + * @return The maximum number of objects that may be allocated + * by a single call to \p allocate(). + */ + inline size_type max_size() const { return (std::numeric_limits::max)() / sizeof(T); } + + /** + * @brief This method tests this \p pinned_allocator for equality to + * another. + * + * @param x The other \p pinned_allocator of interest. + * @return This method always returns \c true. 
+ */ + __host__ __device__ inline bool operator==(pinned_allocator const& x) const { return true; } + + /** + * @brief This method tests this \p pinned_allocator for inequality + * to another. + * + * @param x The other \p pinned_allocator of interest. + * @return This method always returns \c false. + */ + __host__ __device__ inline bool operator!=(pinned_allocator const& x) const + { + return !operator==(x); + } +}; +} // namespace cudf::detail diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp index d7fdb153c19..75e5222ab97 100644 --- a/cpp/include/cudf/detail/utilities/vector_factories.hpp +++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp @@ -72,7 +72,7 @@ rmm::device_uvector make_zeroed_device_uvector_async( template rmm::device_uvector make_zeroed_device_uvector_sync( std::size_t size, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { rmm::device_uvector ret(size, stream, mr); @@ -148,7 +148,7 @@ rmm::device_uvector make_device_uvector_async( template rmm::device_uvector make_device_uvector_async( device_span source_data, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { rmm::device_uvector ret(source_data.size(), stream, mr); @@ -201,7 +201,7 @@ rmm::device_uvector make_device_uvector_async( template rmm::device_uvector make_device_uvector_sync( host_span source_data, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto ret = make_device_uvector_async(source_data, stream, mr); @@ -228,7 +228,7 @@ template < std::is_convertible_v>>* = nullptr> rmm::device_uvector make_device_uvector_sync( Container const& c, - 
rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { return make_device_uvector_sync(host_span{c}, stream, mr); @@ -249,7 +249,7 @@ rmm::device_uvector make_device_uvector_sync( template rmm::device_uvector make_device_uvector_sync( device_span source_data, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto ret = make_device_uvector_async(source_data, stream, mr); @@ -276,7 +276,7 @@ template < std::is_convertible_v>>* = nullptr> rmm::device_uvector make_device_uvector_sync( Container const& c, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { return make_device_uvector_sync(device_span{c}, stream, mr); @@ -366,8 +366,8 @@ template < typename Container, std::enable_if_t< std::is_convertible_v>>* = nullptr> -std::vector make_std_vector_sync( - Container const& c, rmm::cuda_stream_view stream = cudf::default_stream_value) +std::vector make_std_vector_sync(Container const& c, + rmm::cuda_stream_view stream) { return make_std_vector_sync(device_span{c}, stream); } @@ -423,8 +423,7 @@ thrust::host_vector make_host_vector_async( * @return The data copied to the host */ template -thrust::host_vector make_host_vector_sync( - device_span v, rmm::cuda_stream_view stream = cudf::default_stream_value) +thrust::host_vector make_host_vector_sync(device_span v, rmm::cuda_stream_view stream) { auto result = make_host_vector_async(v, stream); stream.synchronize(); @@ -448,7 +447,7 @@ template < std::enable_if_t< std::is_convertible_v>>* = nullptr> thrust::host_vector make_host_vector_sync( - Container const& c, rmm::cuda_stream_view stream = cudf::default_stream_value) + Container const& c, rmm::cuda_stream_view stream) 
{ return make_host_vector_sync(device_span{c}, stream); } diff --git a/cpp/include/cudf/detail/valid_if.cuh b/cpp/include/cudf/detail/valid_if.cuh index 0fe7edad21d..04c78bed17d 100644 --- a/cpp/include/cudf/detail/valid_if.cuh +++ b/cpp/include/cudf/detail/valid_if.cuh @@ -90,7 +90,7 @@ std::pair valid_if( InputIterator begin, InputIterator end, Predicate p, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(begin <= end, "Invalid range."); diff --git a/cpp/include/cudf/dictionary/detail/concatenate.hpp b/cpp/include/cudf/dictionary/detail/concatenate.hpp index 9f154a054f8..d74429484ce 100644 --- a/cpp/include/cudf/dictionary/detail/concatenate.hpp +++ b/cpp/include/cudf/dictionary/detail/concatenate.hpp @@ -37,10 +37,9 @@ namespace detail { * @param mr Device memory resource used to allocate the returned column's device memory. * @return New column with concatenated results. */ -std::unique_ptr concatenate( - host_span columns, - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr concatenate(host_span columns, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace dictionary diff --git a/cpp/include/cudf/dictionary/detail/encode.hpp b/cpp/include/cudf/dictionary/detail/encode.hpp index 17173564a9a..2aad7dd80ed 100644 --- a/cpp/include/cudf/dictionary/detail/encode.hpp +++ b/cpp/include/cudf/dictionary/detail/encode.hpp @@ -51,11 +51,10 @@ namespace detail { * @param mr Device memory resource used to allocate the returned column's device memory. * @return Returns a dictionary column. 
*/ -std::unique_ptr encode( - column_view const& column, - data_type indices_type = data_type{type_id::UINT32}, - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr encode(column_view const& column, + data_type indices_type, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Create a column by gathering the keys from the provided @@ -72,10 +71,9 @@ std::unique_ptr encode( * @param mr Device memory resource used to allocate the returned column's device memory. * @return New column with type matching the dictionary_column's keys. */ -std::unique_ptr decode( - dictionary_column_view const& dictionary_column, - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr decode(dictionary_column_view const& dictionary_column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Return minimal integer type for the given number of elements. diff --git a/cpp/include/cudf/dictionary/detail/replace.hpp b/cpp/include/cudf/dictionary/detail/replace.hpp index 2b38a6c40ec..0778baa84d6 100644 --- a/cpp/include/cudf/dictionary/detail/replace.hpp +++ b/cpp/include/cudf/dictionary/detail/replace.hpp @@ -39,11 +39,10 @@ namespace detail { * @param mr Device memory resource used to allocate the returned column's device memory. * @return New dictionary column with null rows replaced. 
*/ -std::unique_ptr replace_nulls( - dictionary_column_view const& input, - dictionary_column_view const& replacement, - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr replace_nulls(dictionary_column_view const& input, + dictionary_column_view const& replacement, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Create a new dictionary column by replacing nulls with a @@ -57,11 +56,10 @@ std::unique_ptr replace_nulls( * @param mr Device memory resource used to allocate the returned column's device memory. * @return New dictionary column with null rows replaced. */ -std::unique_ptr replace_nulls( - dictionary_column_view const& input, - scalar const& replacement, - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr replace_nulls(dictionary_column_view const& input, + scalar const& replacement, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace dictionary diff --git a/cpp/include/cudf/dictionary/detail/search.hpp b/cpp/include/cudf/dictionary/detail/search.hpp index 4f7939b32a7..62059306b9a 100644 --- a/cpp/include/cudf/dictionary/detail/search.hpp +++ b/cpp/include/cudf/dictionary/detail/search.hpp @@ -31,11 +31,10 @@ namespace detail { * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ -std::unique_ptr get_index( - dictionary_column_view const& dictionary, - scalar const& key, - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr get_index(dictionary_column_view const& dictionary, + scalar const& key, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Get the index for a key if it were added to the given dictionary. @@ -56,11 +55,10 @@ std::unique_ptr get_index( * @param mr Device memory resource used to allocate the returned column's device memory. * @return Numeric scalar index value of the key within the dictionary */ -std::unique_ptr get_insert_index( - dictionary_column_view const& dictionary, - scalar const& key, - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr get_insert_index(dictionary_column_view const& dictionary, + scalar const& key, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace dictionary diff --git a/cpp/include/cudf/dictionary/detail/update_keys.hpp b/cpp/include/cudf/dictionary/detail/update_keys.hpp index 53fd71e0375..6fd743ad526 100644 --- a/cpp/include/cudf/dictionary/detail/update_keys.hpp +++ b/cpp/include/cudf/dictionary/detail/update_keys.hpp @@ -32,11 +32,10 @@ namespace detail { * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ -std::unique_ptr add_keys( - dictionary_column_view const& dictionary_column, - column_view const& new_keys, - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr add_keys(dictionary_column_view const& dictionary_column, + column_view const& new_keys, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::dictionary::remove_keys(dictionary_column_view const&,column_view @@ -44,11 +43,10 @@ std::unique_ptr add_keys( * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr remove_keys( - dictionary_column_view const& dictionary_column, - column_view const& keys_to_remove, - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr remove_keys(dictionary_column_view const& dictionary_column, + column_view const& keys_to_remove, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::dictionary::remove_unused_keys(dictionary_column_view @@ -56,10 +54,9 @@ std::unique_ptr remove_keys( * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr remove_unused_keys( - dictionary_column_view const& dictionary_column, - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr remove_unused_keys(dictionary_column_view const& dictionary_column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::dictionary::set_keys(dictionary_column_view @@ -67,11 +64,10 @@ std::unique_ptr remove_unused_keys( * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ -std::unique_ptr set_keys( - dictionary_column_view const& dictionary_column, - column_view const& keys, - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr set_keys(dictionary_column_view const& dictionary_column, + column_view const& keys, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc @@ -81,8 +77,8 @@ std::unique_ptr set_keys( */ std::vector> match_dictionaries( cudf::host_span input, - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Create new dictionaries that have keys merged from dictionary columns @@ -105,8 +101,8 @@ std::vector> match_dictionaries( */ std::pair>, std::vector> match_dictionaries( std::vector tables, - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace dictionary diff --git a/cpp/include/cudf/dictionary/dictionary_factories.hpp b/cpp/include/cudf/dictionary/dictionary_factories.hpp index b27fa25a27a..821981ad148 100644 --- a/cpp/include/cudf/dictionary/dictionary_factories.hpp +++ b/cpp/include/cudf/dictionary/dictionary_factories.hpp @@ -65,7 +65,7 @@ namespace cudf { std::unique_ptr make_dictionary_column( column_view const& keys_column, column_view const& indices_column, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -117,7 +117,7 @@ std::unique_ptr make_dictionary_column(std::unique_ptr keys_colu std::unique_ptr make_dictionary_column( std::unique_ptr keys_column, 
std::unique_ptr indices_column, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/dictionary/encode.hpp b/cpp/include/cudf/dictionary/encode.hpp index 50b81187091..fb13eabe11a 100644 --- a/cpp/include/cudf/dictionary/encode.hpp +++ b/cpp/include/cudf/dictionary/encode.hpp @@ -31,7 +31,7 @@ namespace dictionary { */ /** - * @brief Construct a dictionary column by dictionary encoding an existing column. + * @brief Construct a dictionary column by dictionary encoding an existing column * * The output column is a DICTIONARY type with a keys column of non-null, unique values * that are in a strict, total order. Meaning, `keys[i]` is _ordered before @@ -40,21 +40,21 @@ namespace dictionary { * The output column has a child indices column that is of integer type and with * the same size as the input column. * - * The null_mask and null count are copied from the input column to the output column. + * The null mask and null count are copied from the input column to the output column. * - * @throw cudf::logic_error if indices type is not an unsigned integer type. - * @throw cudf::logic_error if the column to encode is already a DICTIONARY type. + * @throw cudf::logic_error if indices type is not an unsigned integer type + * @throw cudf::logic_error if the column to encode is already a DICTIONARY type * * @code{.pseudo} - * c = [429,111,213,111,213,429,213] - * d = make_dictionary_column(c) - * d now has keys [111,213,429] and indices [2,0,1,0,1,2,1] + * c = [429, 111, 213, 111, 213, 429, 213] + * d = encode(c) + * d now has keys [111, 213, 429] and indices [2, 0, 1, 0, 1, 2, 1] * @endcode * - * @param column The column to dictionary encode. - * @param indices_type The integer type to use for the indices. 
- * @param mr Device memory resource used to allocate the returned column's device memory. - * @return Returns a dictionary column. + * @param column The column to dictionary encode + * @param indices_type The integer type to use for the indices + * @param mr Device memory resource used to allocate the returned column's device memory + * @return Returns a dictionary column */ std::unique_ptr encode( column_view const& column, @@ -66,14 +66,14 @@ std::unique_ptr encode( * dictionary_column into a new column using the indices from that column. * * @code{.pseudo} - * d1 = {["a","c","d"],[2,0,1,0]} + * d1 = {["a", "c", "d"], [2, 0, 1, 0]} * s = decode(d1) - * s is now ["d","a","c","a"] + * s is now ["d", "a", "c", "a"] * @endcode * - * @param dictionary_column Existing dictionary column. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column with type matching the dictionary_column's keys. + * @param dictionary_column Existing dictionary column + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column with type matching the dictionary_column's keys */ std::unique_ptr decode( dictionary_column_view const& dictionary_column, diff --git a/cpp/include/cudf/filling.hpp b/cpp/include/cudf/filling.hpp index 5f9d13f9a2c..8688e97ab7e 100644 --- a/cpp/include/cudf/filling.hpp +++ b/cpp/include/cudf/filling.hpp @@ -103,26 +103,22 @@ std::unique_ptr fill( * ``` * @p count should not have null values; should not contain negative values; * and the sum of count elements should not overflow the size_type's limit. - * It is undefined behavior if @p count has negative values or the sum overflows - * and @p check_count is set to false. + * The behavior of this function is undefined if @p count has negative values + * or the sum overflows. * * @throws cudf::logic_error if the data type of @p count is not size_type. 
* @throws cudf::logic_error if @p input_table and @p count have different * number of rows. * @throws cudf::logic_error if @p count has null values. - * @throws cudf::logic_error if @p check_count is set to true and @p count - * has negative values or the sum of @p count elements overflows. * * @param input_table Input table * @param count Non-nullable column of an integral type - * @param check_count Whether to check count (negative values and overflow) * @param mr Device memory resource used to allocate the returned table's device memory * @return The result table containing the repetitions */ std::unique_ptr
repeat( table_view const& input_table, column_view const& count, - bool check_count = false, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp index 016e23688c7..c1c58f136d6 100644 --- a/cpp/include/cudf/interop.hpp +++ b/cpp/include/cudf/interop.hpp @@ -16,7 +16,12 @@ #pragma once +// We disable warning 611 because the `arrow::TableBatchReader` only partially +// override the `ReadNext` method of `arrow::RecordBatchReader::ReadNext` +// triggering warning 611-D from nvcc. +#pragma nv_diag_suppress 611 #include +#pragma nv_diag_default 611 #include #include diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp index f753028a148..1fc4114b94c 100644 --- a/cpp/include/cudf/io/csv.hpp +++ b/cpp/include/cudf/io/csv.hpp @@ -1338,8 +1338,8 @@ class csv_writer_options { std::string _true_value = std::string{"true"}; // string to use for values == 0 in INT8 types (default 'false') std::string _false_value = std::string{"false"}; - // Optional associated metadata - table_metadata const* _metadata = nullptr; + // Names of all columns; if empty, writer will generate column names + std::vector _names; /** * @brief Constructor from sink and table. @@ -1387,11 +1387,11 @@ class csv_writer_options { [[nodiscard]] table_view const& get_table() const { return _table; } /** - * @brief Returns optional associated metadata. + * @brief Returns names of the columns. * - * @return Optional associated metadata + * @return Names of the columns in the output file */ - [[nodiscard]] table_metadata const* get_metadata() const { return _metadata; } + [[nodiscard]] std::vector const& get_names() const { return _names; } /** * @brief Returns string to used for null entries. @@ -1444,11 +1444,11 @@ class csv_writer_options { // Setter /** - * @brief Sets optional associated metadata. + * @brief Sets optional associated column names. 
* - @param metadata Associated metadata + @param names Associated column names */ - void set_metadata(table_metadata* metadata) { _metadata = metadata; } + void set_names(std::vector names) { _names = std::move(names); } /** * @brief Sets string to used for null entries. @@ -1526,14 +1526,14 @@ class csv_writer_options_builder { } /** - * @brief Sets optional associated metadata. + * @brief Sets optional column names. * - * @param metadata Associated metadata + * @param names Column names * @return this for chaining */ - csv_writer_options_builder& metadata(table_metadata* metadata) + csv_writer_options_builder& names(std::vector names) { - options._metadata = metadata; + options._names = names; return *this; } diff --git a/cpp/include/cudf/io/datasource.hpp b/cpp/include/cudf/io/datasource.hpp index 907830de2bb..251a93ac21f 100644 --- a/cpp/include/cudf/io/datasource.hpp +++ b/cpp/include/cudf/io/datasource.hpp @@ -22,8 +22,15 @@ #include #include + +// We disable warning 611 because some Arrow subclasses of +// `arrow::fs::FileSystem` only partially override the `Equals` method, +// triggering warning 611-D from nvcc. 
+#pragma nv_diag_suppress 611 #include #include +#pragma nv_diag_default 611 + #include #include #include diff --git a/cpp/include/cudf/io/detail/avro.hpp b/cpp/include/cudf/io/detail/avro.hpp index 9551b1f05df..c141e25f939 100644 --- a/cpp/include/cudf/io/detail/avro.hpp +++ b/cpp/include/cudf/io/detail/avro.hpp @@ -39,7 +39,7 @@ namespace avro { table_with_metadata read_avro( std::unique_ptr&& source, avro_reader_options const& options, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace avro diff --git a/cpp/include/cudf/io/detail/csv.hpp b/cpp/include/cudf/io/detail/csv.hpp index 0d79ecd0d77..90d730338fc 100644 --- a/cpp/include/cudf/io/detail/csv.hpp +++ b/cpp/include/cudf/io/detail/csv.hpp @@ -46,16 +46,16 @@ table_with_metadata read_csv(std::unique_ptr&& source, * * @param sink Output sink * @param table The set of columns - * @param metadata The metadata associated with the table + * @param column_names Column names for the output CSV * @param options Settings for controlling behavior * @param stream CUDA stream used for device memory operations and kernel launches. 
* @param mr Device memory resource to use for device memory allocation */ void write_csv(data_sink* sink, table_view const& table, - const table_metadata* metadata, + host_span column_names, csv_writer_options const& options, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace csv diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh index 628c00ad603..aba9ec07bc6 100644 --- a/cpp/include/cudf/io/detail/data_casting.cuh +++ b/cpp/include/cudf/io/detail/data_casting.cuh @@ -391,10 +391,13 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, return; } + // If this is a string value, remove quotes + auto [in_begin, in_end] = trim_quotes(in.first, in.first + in.second, options.quotechar); + auto const is_parsed = cudf::type_dispatcher(col_type, ConvertFunctor{}, - in.first, - in.first + in.second, + in_begin, + in_end, col.data(), row, col_type, diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index 3e69ef8a3b8..42717fe36df 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -39,7 +39,7 @@ namespace json { table_with_metadata read_json( std::vector>& sources, json_reader_options const& options, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace json diff --git a/cpp/include/cudf/io/detail/orc.hpp b/cpp/include/cudf/io/detail/orc.hpp index 79fcf4bd916..4c78502a21b 100644 --- a/cpp/include/cudf/io/detail/orc.hpp +++ b/cpp/include/cudf/io/detail/orc.hpp @@ -74,8 +74,7 @@ class reader { * * @return The set of columns along with table metadata */ - table_with_metadata read(orc_reader_options const& options, - rmm::cuda_stream_view stream = cudf::default_stream_value); + 
table_with_metadata read(orc_reader_options const& options, rmm::cuda_stream_view stream); }; /** diff --git a/cpp/include/cudf/io/detail/parquet.hpp b/cpp/include/cudf/io/detail/parquet.hpp index 7675dc70cb2..7f107017864 100644 --- a/cpp/include/cudf/io/detail/parquet.hpp +++ b/cpp/include/cudf/io/detail/parquet.hpp @@ -30,25 +30,28 @@ #include #include -namespace cudf { -namespace io { +namespace cudf::io { // Forward declaration class parquet_reader_options; class parquet_writer_options; class chunked_parquet_writer_options; -namespace detail { -namespace parquet { +namespace detail::parquet { /** * @brief Class to read Parquet dataset data into columns. */ class reader { - private: + protected: class impl; std::unique_ptr _impl; + /** + * @brief Default constructor, needed for subclassing. + */ + reader(); + public: /** * @brief Constructor from an array of datasources @@ -66,7 +69,7 @@ class reader { /** * @brief Destructor explicitly-declared to avoid inlined in header */ - ~reader(); + virtual ~reader(); /** * @brief Reads the dataset as per given options. @@ -78,6 +81,62 @@ class reader { table_with_metadata read(parquet_reader_options const& options); }; +/** + * @brief The reader class that supports iterative reading of a given file. + * + * This class intentionally subclasses the `reader` class with private inheritance to hide the + * `reader::read()` API. As such, only chunked reading APIs are supported. + */ +class chunked_reader : private reader { + public: + /** + * @brief Constructor from a read size limit and an array of data sources with reader options. + * + * The typical usage should be similar to this: + * ``` + * do { + * auto const chunk = reader.read_chunk(); + * // Process chunk + * } while (reader.has_next()); + * + * ``` + * + * If `chunk_read_limit == 0` (i.e., no reading limit), a call to `read_chunk()` will read the + * whole file and return a table containing all rows. 
+ * + * @param chunk_read_limit Limit on total number of bytes to be returned per read, + * or `0` if there is no limit + * @param sources Input `datasource` objects to read the dataset from + * @param options Settings for controlling reading behavior + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource to use for device memory allocation + */ + explicit chunked_reader(std::size_t chunk_read_limit, + std::vector>&& sources, + parquet_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + + /** + * @brief Destructor explicitly-declared to avoid inlined in header. + * + * Since the declaration of the internal `_impl` object does not exist in this header, this + * destructor needs to be defined in a separate source file which can access to that object's + * declaration. + */ + ~chunked_reader(); + + /** + * @copydoc cudf::io::chunked_parquet_reader::has_next + */ + [[nodiscard]] bool has_next() const; + + /** + * @copydoc cudf::io::chunked_parquet_reader::read_chunk + */ + [[nodiscard]] table_with_metadata read_chunk() const; +}; + /** * @brief Class to write parquet dataset data into columns. */ @@ -154,7 +213,5 @@ class writer { const std::vector>>& metadata_list); }; -}; // namespace parquet -}; // namespace detail -}; // namespace io -}; // namespace cudf +} // namespace detail::parquet +} // namespace cudf::io diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index 7f3cb95e4b2..b1e2197a868 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -378,9 +378,6 @@ class orc_reader_options_builder { * auto result = cudf::io::read_orc(options); * @endcode * - * Note: Support for reading files with struct columns is currently experimental, the output may not - * be as reliable as reading for other datatypes. 
- * * @param options Settings for controlling reading behavior * @param mr Device memory resource used to allocate device memory of the table in the returned * table_with_metadata. @@ -783,9 +780,6 @@ class orc_writer_options_builder { * cudf::io::write_orc(options); * @endcode * - * Note: Support for writing tables with struct columns is currently experimental, the output may - * not be as reliable as writing for other datatypes. - * * @param options Settings for controlling reading behavior * @param mr Device memory resource to use for device memory allocation */ diff --git a/cpp/include/cudf/io/orc_metadata.hpp b/cpp/include/cudf/io/orc_metadata.hpp index d974eaa103a..6ef7ea49c59 100644 --- a/cpp/include/cudf/io/orc_metadata.hpp +++ b/cpp/include/cudf/io/orc_metadata.hpp @@ -21,6 +21,7 @@ #pragma once +#include #include #include @@ -180,7 +181,7 @@ struct column_statistics { * * @param detail_statistics The statistics to initialize the object with */ - column_statistics(cudf::io::orc::column_statistics&& detail_statistics); + column_statistics(orc::column_statistics&& detail_statistics); }; /** @@ -207,5 +208,166 @@ struct parsed_orc_statistics { */ parsed_orc_statistics read_parsed_orc_statistics(source_info const& src_info); +/** + * @brief Schema of an ORC column, including the nested columns. + */ +struct orc_column_schema { + public: + /** + * @brief constructor + * + * @param name column name + * @param type ORC type + * @param children child columns (empty for non-nested types) + */ + orc_column_schema(std::string_view name, + orc::TypeKind type, + std::vector children) + : _name{name}, _type_kind{type}, _children{std::move(children)} + { + } + + /** + * @brief Returns ORC column name; can be empty. + * + * @return Column name + */ + [[nodiscard]] auto name() const { return _name; } + + /** + * @brief Returns ORC type of the column. 
+ * + * @return Column ORC type + */ + [[nodiscard]] auto type_kind() const { return _type_kind; } + + /** + * @brief Returns schemas of all child columns. + * + * @return Children schemas + */ + [[nodiscard]] auto const& children() const& { return _children; } + + /** @copydoc children + * Children array is moved out of the object (rvalues only). + * + */ + [[nodiscard]] auto children() && { return std::move(_children); } + + /** + * @brief Returns schema of the child with the given index. + * + * @param idx child index + * + * @return Child schema + */ + [[nodiscard]] auto const& child(int idx) const& { return children().at(idx); } + + /** @copydoc child + * Child is moved out of the object (rvalues only). + * + */ + [[nodiscard]] auto child(int idx) && { return std::move(children().at(idx)); } + + /** + * @brief Returns the number of child columns. + * + * @return Children count + */ + [[nodiscard]] auto num_children() const { return children().size(); } + + private: + std::string _name; + orc::TypeKind _type_kind; + std::vector _children; +}; + +/** + * @brief Schema of an ORC file. + */ +struct orc_schema { + public: + /** + * @brief constructor + * + * @param root_column_schema root column + */ + orc_schema(orc_column_schema root_column_schema) : _root{std::move(root_column_schema)} {} + + /** + * @brief Returns the schema of the struct column that contains all columns as fields. + * + * @return Root column schema + */ + [[nodiscard]] auto const& root() const& { return _root; } + + /** @copydoc root + * Root column schema is moved out of the object (rvalues only). + * + */ + [[nodiscard]] auto root() && { return std::move(_root); } + + private: + orc_column_schema _root; +}; + +/** + * @brief Information about content of an ORC file. 
+ */ +class orc_metadata { + public: + /** + * @brief constructor + * + * @param schema ORC schema + * @param num_rows number of rows + * @param num_stripes number of stripes + */ + orc_metadata(orc_schema schema, size_type num_rows, size_type num_stripes) + : _schema{std::move(schema)}, _num_rows{num_rows}, _num_stripes{num_stripes} + { + } + + /** + * @brief Returns the ORC schema. + * + * @return ORC schema + */ + [[nodiscard]] auto const& schema() const { return _schema; } + + ///< Number of rows in the root column; can vary for nested columns + /** + * @brief Returns the number of rows of the root column. + * + * If a file contains list columns, nested columns can have a different number of rows. + * + * @return Number of rows + */ + [[nodiscard]] auto num_rows() const { return _num_rows; } + + /** + * @brief Returns the number of stripes in the file. + * + * @return Number of stripes + */ + [[nodiscard]] auto num_stripes() const { return _num_stripes; } + + private: + orc_schema _schema; + size_type _num_rows; + size_type _num_stripes; +}; + +/** + * @brief Reads metadata of ORC dataset. 
+ * + * @ingroup io_readers + * + * @param src_info Dataset source + * + * @return orc_metadata with ORC schema, number of rows, and number of stripes + */ +orc_metadata read_orc_metadata(source_info const& src_info); + } // namespace io } // namespace cudf diff --git a/cpp/src/io/orc/orc_common.hpp b/cpp/include/cudf/io/orc_types.hpp similarity index 94% rename from cpp/src/io/orc/orc_common.hpp rename to cpp/include/cudf/io/orc_types.hpp index c2898b362a6..09cae2ef06c 100644 --- a/cpp/src/io/orc/orc_common.hpp +++ b/cpp/include/cudf/io/orc_types.hpp @@ -18,11 +18,7 @@ #include -namespace cudf { -namespace io { -namespace orc { - -static constexpr uint32_t block_header_size = 3; +namespace cudf::io::orc { enum CompressionKind : uint8_t { NONE = 0, @@ -87,6 +83,4 @@ enum ProtofType : uint8_t { INVALID_7 = 7, }; -} // namespace orc -} // namespace io -} // namespace cudf +} // namespace cudf::io::orc diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index ff5b9f5c457..f3facae098d 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -30,8 +30,7 @@ #include #include -namespace cudf { -namespace io { +namespace cudf::io { /** * @addtogroup io_readers * @{ @@ -400,6 +399,74 @@ table_with_metadata read_parquet( parquet_reader_options const& options, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief The chunked parquet reader class to read Parquet file iteratively in to a series of + * tables, chunk by chunk. + * + * This class is designed to address the reading issue when reading very large Parquet files such + * that the sizes of their column exceed the limit that can be stored in cudf column. By reading the + * file content by chunks using this class, each chunk is guaranteed to have its sizes stay within + * the given limit. + */ +class chunked_parquet_reader { + public: + /** + * @brief Default constructor, this should never be used. 
+ * + * This is added just to satisfy cython. 
+ */ + chunked_parquet_reader() = default; + + /** + * @brief Constructor for chunked reader. + * + * This constructor requires the same `parquet_reader_options` parameter as in + * `cudf::read_parquet()`, and an additional parameter to specify the size byte limit of the + * output table for each reading. + * + * @param chunk_read_limit Limit on total number of bytes to be returned per read, + * or `0` if there is no limit + * @param options The options used to read Parquet file + * @param mr Device memory resource to use for device memory allocation + */ + chunked_parquet_reader( + std::size_t chunk_read_limit, + parquet_reader_options const& options, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + + /** + * @brief Destructor, destroying the internal reader instance. + * + * Since the declaration of the internal `reader` object does not exist in this header, this + * destructor needs to be defined in a separate source file which can access that object's + * declaration. + */ + ~chunked_parquet_reader(); + + /** + * @brief Check if there is any data in the given file that has not yet been read. + * + * @return A boolean value indicating if there is any data left to read + */ + [[nodiscard]] bool has_next() const; + + /** + * @brief Read a chunk of rows in the given Parquet file. + * + * The sequence of returned tables, if concatenated by their order, guarantees to form a complete + * dataset as reading the entire given file at once. + * + * An empty table will be returned if the given file is empty, or all the data in the file has + * been read and returned by the previous calls. 
+ * + * @return An output `cudf::table` along with its metadata + */ + [[nodiscard]] table_with_metadata read_chunk() const; + + private: + std::unique_ptr reader; +}; + /** @} */ // end of group /** * @addtogroup io_writers @@ -1452,5 +1519,5 @@ class parquet_chunked_writer { }; /** @} */ // end of group -} // namespace io -} // namespace cudf + +} // namespace cudf::io diff --git a/cpp/include/cudf/io/text/data_chunk_source_factories.hpp b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp index 6f94fb170a8..f5230863f17 100644 --- a/cpp/include/cudf/io/text/data_chunk_source_factories.hpp +++ b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp @@ -16,6 +16,7 @@ #pragma once +#include #include #include #include @@ -25,6 +26,14 @@ namespace cudf::io::text { +/** + * @brief Creates a data source capable of producing device-buffered views of a datasource. + * @param data the datasource to be exposed as a data chunk source + * @return the data chunk source for the provided datasource. It must not outlive the datasource + * used to construct it. + */ +std::unique_ptr make_source(datasource& data); + /** * @brief Creates a data source capable of producing device-buffered views of the given string. * @param data the host data to be exposed as a data chunk source. Its lifetime must be at least as diff --git a/cpp/include/cudf/io/text/detail/bgzip_utils.hpp b/cpp/include/cudf/io/text/detail/bgzip_utils.hpp new file mode 100644 index 00000000000..627df5f358a --- /dev/null +++ b/cpp/include/cudf/io/text/detail/bgzip_utils.hpp @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include + +#include +#include +#include +#include + +namespace cudf::io::text::detail::bgzip { + +struct header { + int block_size; + int extra_length; + [[nodiscard]] int data_size() const { return block_size - extra_length - 20; } +}; + +struct footer { + uint32_t crc; + uint32_t decompressed_size; +}; + +/** + * @brief Reads the full BGZIP header from the given input stream. Afterwards, the stream position + * is at the first data byte. + * + * @param input_stream The input stream + * @return The header storing the compressed size and extra subfield length + */ +header read_header(std::istream& input_stream); + +/** + * @brief Reads the full BGZIP footer from the given input stream. Afterwards, the stream position + * is after the last footer byte. + * + * @param input_stream The input stream + * @return The footer storing uncompressed size and CRC32 + */ +footer read_footer(std::istream& input_stream); + +/** + * @brief Writes a header for data of the given compressed size to the given stream. 
+ * + * @param output_stream The output stream + * @param compressed_size The size of the compressed data + * @param pre_size_subfields Any GZIP extra subfields (need to be valid) to be placed before the + * BGZIP block size subfield + * @param post_size_subfields Any subfields to be placed after the BGZIP block size subfield + */ +void write_header(std::ostream& output_stream, + uint16_t compressed_size, + host_span pre_size_subfields, + host_span post_size_subfields); + +/** + * @brief Writes a footer for the given uncompressed data to the given stream. + * + * @param output_stream The output stream + * @param data The data for which uncompressed size and CRC32 will be computed and written + */ +void write_footer(std::ostream& output_stream, host_span data); + +/** + * @brief Writes the given data to the given stream as an uncompressed deflate block with BGZIP + * header and footer. + * + * @param output_stream The output stream + * @param data The uncompressed data + * @param pre_size_subfields Any GZIP extra subfields (need to be valid) to be placed before the + * BGZIP block size subfield + * @param post_size_subfields Any subfields to be placed after the BGZIP block size subfield + */ +void write_uncompressed_block(std::ostream& output_stream, + host_span data, + host_span pre_size_subfields = {}, + host_span post_size_subfields = {}); + +/** + * @brief Writes the given data to the given stream as a compressed deflate block with BGZIP + * header and footer. 
+ * + * @param output_stream The output stream + * @param data The uncompressed data + * @param pre_size_subfields Any GZIP extra subfields (need to be valid) to be placed before the + * BGZIP block size subfield + * @param post_size_subfields Any subfields to be placed after the BGZIP block size subfield + */ +void write_compressed_block(std::ostream& output_stream, + host_span data, + host_span pre_size_subfields = {}, + host_span post_size_subfields = {}); + +} // namespace cudf::io::text::detail::bgzip diff --git a/cpp/include/cudf/io/text/multibyte_split.hpp b/cpp/include/cudf/io/text/multibyte_split.hpp index abb966a55bf..a7edc9be0e4 100644 --- a/cpp/include/cudf/io/text/multibyte_split.hpp +++ b/cpp/include/cudf/io/text/multibyte_split.hpp @@ -30,11 +30,25 @@ namespace cudf { namespace io { namespace text { +/** + * @brief Parsing options for multibyte_split. + */ +struct parse_options { + /** + * @brief Only rows starting inside this byte range will be part of the output column. + */ + byte_range_info byte_range = create_byte_range_info_max(); + /** + * @brief Whether delimiters at the end of rows should be stripped from the output column + */ + bool strip_delimiters = false; +}; + /** * @brief Splits the source text into a strings column using a multiple byte delimiter. * - * Providing a byte range allows multibyte_split to read a whole file, but only return the offsets - * of delimiters which begin within the range. If thinking in terms of "records", where each + * Providing a byte range allows multibyte_split to read a file partially, only returning the + * offsets of delimiters which begin within the range. If thinking in terms of "records", where each * delimiter dictates the end of a record, all records which begin within the byte range provided * will be returned, including any record which may begin in the range but end outside of the * range. 
Records which begin outside of the range will ignored, even if those records end inside @@ -63,7 +77,7 @@ namespace text { * * @param source The source string * @param delimiter UTF-8 encoded string for which to find offsets in the source - * @param byte_range range in which to consider offsets relevant + * @param options the parsing options to use (including byte range) * @param mr Memory resource to use for the device memory allocation * @return The strings found by splitting the source by the delimiter within the relevant byte * range. @@ -71,8 +85,14 @@ namespace text { std::unique_ptr multibyte_split( data_chunk_source const& source, std::string const& delimiter, - std::optional byte_range = std::nullopt, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + parse_options options = {}, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +std::unique_ptr multibyte_split( + data_chunk_source const& source, + std::string const& delimiter, + std::optional byte_range, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); std::unique_ptr multibyte_split(data_chunk_source const& source, std::string const& delimiter, diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp index 838151fbaf9..9c47ed9ea69 100644 --- a/cpp/include/cudf/io/types.hpp +++ b/cpp/include/cudf/io/types.hpp @@ -419,7 +419,7 @@ class column_in_metadata { * @param name Name of the column * @return this for chaining */ - column_in_metadata& set_name(std::string const& name) + column_in_metadata& set_name(std::string const& name) noexcept { _name = name; return *this; @@ -433,7 +433,7 @@ class column_in_metadata { * @param nullable Whether this column is nullable * @return this for chaining */ - column_in_metadata& set_nullability(bool nullable) + column_in_metadata& set_nullability(bool nullable) noexcept { _nullable = nullable; return *this; @@ -446,7 +446,7 @@ class column_in_metadata { * * 
@return this for chaining */ - column_in_metadata& set_list_column_as_map() + column_in_metadata& set_list_column_as_map() noexcept { _list_column_is_map = true; return *this; @@ -460,7 +460,7 @@ class column_in_metadata { * @param req True = use int96 physical type. False = use int64 physical type * @return this for chaining */ - column_in_metadata& set_int96_timestamps(bool req) + column_in_metadata& set_int96_timestamps(bool req) noexcept { _use_int96_timestamp = req; return *this; @@ -473,7 +473,7 @@ class column_in_metadata { * @param precision The integer precision to set for this decimal column * @return this for chaining */ - column_in_metadata& set_decimal_precision(uint8_t precision) + column_in_metadata& set_decimal_precision(uint8_t precision) noexcept { _decimal_precision = precision; return *this; @@ -485,7 +485,7 @@ class column_in_metadata { * @param field_id The parquet field id to set * @return this for chaining */ - column_in_metadata& set_parquet_field_id(int32_t field_id) + column_in_metadata& set_parquet_field_id(int32_t field_id) noexcept { _parquet_field_id = field_id; return *this; @@ -499,7 +499,7 @@ class column_in_metadata { * @param binary True = use binary data type. 
False = use string data type * @return this for chaining */ - column_in_metadata& set_output_as_binary(bool binary) + column_in_metadata& set_output_as_binary(bool binary) noexcept { _output_as_binary = binary; return *this; @@ -511,7 +511,7 @@ class column_in_metadata { * @param i Index of the child to get * @return this for chaining */ - column_in_metadata& child(size_type i) { return children[i]; } + column_in_metadata& child(size_type i) noexcept { return children[i]; } /** * @brief Get const reference to a child of this column @@ -519,21 +519,21 @@ class column_in_metadata { * @param i Index of the child to get * @return this for chaining */ - [[nodiscard]] column_in_metadata const& child(size_type i) const { return children[i]; } + [[nodiscard]] column_in_metadata const& child(size_type i) const noexcept { return children[i]; } /** * @brief Get the name of this column * * @return The name of this column */ - [[nodiscard]] std::string get_name() const { return _name; } + [[nodiscard]] std::string get_name() const noexcept { return _name; } /** * @brief Get whether nullability has been explicitly set for this column. * * @return Boolean indicating whether nullability has been explicitly set for this column */ - [[nodiscard]] bool is_nullability_defined() const { return _nullable.has_value(); } + [[nodiscard]] bool is_nullability_defined() const noexcept { return _nullable.has_value(); } /** * @brief Gets the explicitly set nullability for this column. 
@@ -549,7 +549,7 @@ class column_in_metadata { * * @return Boolean indicating whether this column is to be encoded as a map */ - [[nodiscard]] bool is_map() const { return _list_column_is_map; } + [[nodiscard]] bool is_map() const noexcept { return _list_column_is_map; } /** * @brief Get whether to encode this timestamp column using deprecated int96 physical type @@ -557,14 +557,17 @@ class column_in_metadata { * @return Boolean indicating whether to encode this timestamp column using deprecated int96 * physical type */ - [[nodiscard]] bool is_enabled_int96_timestamps() const { return _use_int96_timestamp; } + [[nodiscard]] bool is_enabled_int96_timestamps() const noexcept { return _use_int96_timestamp; } /** * @brief Get whether precision has been set for this decimal column * * @return Boolean indicating whether precision has been set for this decimal column */ - [[nodiscard]] bool is_decimal_precision_set() const { return _decimal_precision.has_value(); } + [[nodiscard]] bool is_decimal_precision_set() const noexcept + { + return _decimal_precision.has_value(); + } /** * @brief Get the decimal precision that was set for this column. @@ -580,7 +583,10 @@ class column_in_metadata { * * @return Boolean indicating whether parquet field id has been set for this column */ - [[nodiscard]] bool is_parquet_field_id_set() const { return _parquet_field_id.has_value(); } + [[nodiscard]] bool is_parquet_field_id_set() const noexcept + { + return _parquet_field_id.has_value(); + } /** * @brief Get the parquet field id that was set for this column. 
@@ -596,14 +602,14 @@ class column_in_metadata { * * @return The number of children of this column */ - [[nodiscard]] size_type num_children() const { return children.size(); } + [[nodiscard]] size_type num_children() const noexcept { return children.size(); } /** * @brief Get whether to encode this column as binary or string data * * @return Boolean indicating whether to encode this column as binary data */ - [[nodiscard]] bool is_enabled_output_as_binary() const { return _output_as_binary; } + [[nodiscard]] bool is_enabled_output_as_binary() const noexcept { return _output_as_binary; } }; /** diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index bc3bfef3a7d..b613a661d95 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -287,7 +287,7 @@ class hash_join { */ hash_join(cudf::table_view const& build, null_equality compare_nulls, - rmm::cuda_stream_view stream = cudf::default_stream_value); + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * Returns the row indices that can be used to construct the result of performing @@ -308,7 +308,7 @@ class hash_join { std::unique_ptr>> inner_join(cudf::table_view const& probe, std::optional output_size = {}, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; /** @@ -330,7 +330,7 @@ class hash_join { std::unique_ptr>> left_join(cudf::table_view const& probe, std::optional output_size = {}, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; /** @@ -352,7 +352,7 @@ class hash_join { std::unique_ptr>> full_join(cudf::table_view const& probe, std::optional output_size = {}, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = 
cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; /** @@ -366,7 +366,7 @@ class hash_join { * `build` and `probe` as the the join keys . */ [[nodiscard]] std::size_t inner_join_size( - cudf::table_view const& probe, rmm::cuda_stream_view stream = cudf::default_stream_value) const; + cudf::table_view const& probe, rmm::cuda_stream_view stream = cudf::get_default_stream()) const; /** * Returns the exact number of matches (rows) when performing a left join with the specified probe @@ -379,7 +379,7 @@ class hash_join { * and `probe` as the the join keys . */ [[nodiscard]] std::size_t left_join_size( - cudf::table_view const& probe, rmm::cuda_stream_view stream = cudf::default_stream_value) const; + cudf::table_view const& probe, rmm::cuda_stream_view stream = cudf::get_default_stream()) const; /** * Returns the exact number of matches (rows) when performing a full join with the specified probe @@ -395,7 +395,7 @@ class hash_join { */ std::size_t full_join_size( cudf::table_view const& probe, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; private: diff --git a/cpp/include/cudf/lists/detail/concatenate.hpp b/cpp/include/cudf/lists/detail/concatenate.hpp index e2e17579c85..5a8b4bc3bf3 100644 --- a/cpp/include/cudf/lists/detail/concatenate.hpp +++ b/cpp/include/cudf/lists/detail/concatenate.hpp @@ -45,7 +45,7 @@ namespace detail { */ std::unique_ptr concatenate( host_span columns, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/include/cudf/lists/detail/gather.cuh b/cpp/include/cudf/lists/detail/gather.cuh index 9cbe9582456..48c0ed8f6e9 100644 --- a/cpp/include/cudf/lists/detail/gather.cuh +++ 
b/cpp/include/cudf/lists/detail/gather.cuh @@ -89,7 +89,7 @@ gather_data make_gather_data(cudf::lists_column_view const& source_column, // generate the compacted outgoing offsets. auto count_iter = thrust::make_counting_iterator(0); thrust::transform_exclusive_scan( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), count_iter, count_iter + offset_count, dst_offsets_v.begin(), @@ -125,7 +125,7 @@ gather_data make_gather_data(cudf::lists_column_view const& source_column, // generate the base offsets rmm::device_uvector base_offsets = rmm::device_uvector(output_count, stream); thrust::transform( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), gather_map, gather_map + output_count, base_offsets.data(), @@ -320,8 +320,9 @@ std::unique_ptr gather_list_leaf( std::unique_ptr segmented_gather( lists_column_view const& source_column, lists_column_view const& gather_map_list, - out_of_bounds_policy bounds_policy = out_of_bounds_policy::DONT_CHECK, - rmm::cuda_stream_view stream = cudf::default_stream_value, + out_of_bounds_policy bounds_policy = out_of_bounds_policy::DONT_CHECK, + // Move before bounds_policy? 
+ rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/include/cudf/lists/detail/scatter.cuh b/cpp/include/cudf/lists/detail/scatter.cuh index 2e60df4a5ae..f4106fb5cdf 100644 --- a/cpp/include/cudf/lists/detail/scatter.cuh +++ b/cpp/include/cudf/lists/detail/scatter.cuh @@ -58,7 +58,7 @@ rmm::device_uvector list_vector_from_column( auto vector = rmm::device_uvector(n_rows, stream, mr); - thrust::transform(rmm::exec_policy(stream), + thrust::transform(rmm::exec_policy_nosync(stream), index_begin, index_end, vector.begin(), @@ -96,7 +96,7 @@ std::unique_ptr scatter_impl( MapIterator scatter_map_end, column_view const& source, column_view const& target, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(column_types_equal(source, target), "Mismatched column types."); @@ -104,7 +104,7 @@ std::unique_ptr scatter_impl( auto const child_column_type = lists_column_view(target).child().type(); // Scatter. 
- thrust::scatter(rmm::exec_policy(stream), + thrust::scatter(rmm::exec_policy_nosync(stream), source_vector.begin(), source_vector.end(), scatter_map_begin, @@ -169,7 +169,7 @@ std::unique_ptr scatter( MapIterator scatter_map_begin, MapIterator scatter_map_end, column_view const& target, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto const num_rows = target.size(); @@ -226,7 +226,7 @@ std::unique_ptr scatter( MapIterator scatter_map_begin, MapIterator scatter_map_end, column_view const& target, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto const num_rows = target.size(); @@ -239,7 +239,7 @@ std::unique_ptr scatter( : cudf::detail::create_null_mask(1, mask_state::ALL_NULL, stream, mr); auto offset_column = make_numeric_column( data_type{type_to_id()}, 2, mask_state::UNALLOCATED, stream, mr); - thrust::sequence(rmm::exec_policy(stream), + thrust::sequence(rmm::exec_policy_nosync(stream), offset_column->mutable_view().begin(), offset_column->mutable_view().end(), 0, diff --git a/cpp/include/cudf/lists/lists_column_factories.hpp b/cpp/include/cudf/lists/lists_column_factories.hpp index 2b40a875cc9..a6eacb97e91 100644 --- a/cpp/include/cudf/lists/lists_column_factories.hpp +++ b/cpp/include/cudf/lists/lists_column_factories.hpp @@ -38,7 +38,7 @@ namespace detail { std::unique_ptr make_lists_column_from_scalar( list_scalar const& value, size_type size, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/include/cudf/partitioning.hpp b/cpp/include/cudf/partitioning.hpp index 6e9f571cc9d..2c91bdf64f5 100644 --- a/cpp/include/cudf/partitioning.hpp +++ 
b/cpp/include/cudf/partitioning.hpp @@ -98,7 +98,7 @@ std::pair, std::vector> hash_partition( int num_partitions, hash_id hash_function = hash_id::HASH_MURMUR3, uint32_t seed = DEFAULT_HASH_SEED, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** diff --git a/cpp/include/cudf/quantiles.hpp b/cpp/include/cudf/quantiles.hpp index 531c7e3477d..1f3c26fa077 100644 --- a/cpp/include/cudf/quantiles.hpp +++ b/cpp/include/cudf/quantiles.hpp @@ -18,7 +18,7 @@ #include #include -#include +#include #include #include diff --git a/cpp/include/cudf/reduction.hpp b/cpp/include/cudf/reduction.hpp index 083892aa856..7aa7ada6896 100644 --- a/cpp/include/cudf/reduction.hpp +++ b/cpp/include/cudf/reduction.hpp @@ -72,7 +72,7 @@ enum class scan_type : bool { INCLUSIVE, EXCLUSIVE }; */ std::unique_ptr reduce( column_view const& col, - std::unique_ptr const& agg, + reduce_aggregation const& agg, data_type output_dtype, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); @@ -89,7 +89,7 @@ std::unique_ptr reduce( */ std::unique_ptr reduce( column_view const& col, - std::unique_ptr const& agg, + reduce_aggregation const& agg, data_type output_dtype, std::optional> init, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); @@ -187,7 +187,7 @@ std::unique_ptr segmented_reduce( */ std::unique_ptr scan( const column_view& input, - std::unique_ptr const& agg, + scan_aggregation const& agg, scan_type inclusive, null_policy null_handling = null_policy::EXCLUDE, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/include/cudf/scalar/scalar.hpp b/cpp/include/cudf/scalar/scalar.hpp index 9b9c73071af..6161639a6fb 100644 --- a/cpp/include/cudf/scalar/scalar.hpp +++ b/cpp/include/cudf/scalar/scalar.hpp @@ -64,7 +64,7 @@ class scalar { * @param is_valid true: 
set the value to valid. false: set it to null. * @param stream CUDA stream used for device memory operations. */ - void set_valid_async(bool is_valid, rmm::cuda_stream_view stream = cudf::default_stream_value); + void set_valid_async(bool is_valid, rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Indicates whether the scalar contains a valid value. @@ -76,7 +76,7 @@ class scalar { * @return true Value is valid * @return false Value is invalid/null */ - [[nodiscard]] bool is_valid(rmm::cuda_stream_view stream = cudf::default_stream_value) const; + [[nodiscard]] bool is_valid(rmm::cuda_stream_view stream = cudf::get_default_stream()) const; /** * @brief Returns a raw pointer to the validity bool in device memory. @@ -112,7 +112,7 @@ class scalar { * @param mr Device memory resource to use for device memory allocation. */ scalar(scalar const& other, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -128,7 +128,7 @@ class scalar { */ scalar(data_type type, bool is_valid = false, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); }; @@ -164,7 +164,7 @@ class fixed_width_scalar : public scalar { * @param mr Device memory resource to use for device memory allocation. */ fixed_width_scalar(fixed_width_scalar const& other, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -173,7 +173,7 @@ class fixed_width_scalar : public scalar { * @param value New value of scalar. * @param stream CUDA stream used for device memory operations. 
*/ - void set_value(T value, rmm::cuda_stream_view stream = cudf::default_stream_value); + void set_value(T value, rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Explicit conversion operator to get the value of the scalar on the host. @@ -186,7 +186,7 @@ class fixed_width_scalar : public scalar { * @param stream CUDA stream used for device memory operations. * @return Value of the scalar */ - T value(rmm::cuda_stream_view stream = cudf::default_stream_value) const; + T value(rmm::cuda_stream_view stream = cudf::get_default_stream()) const; /** * @brief Returns a raw pointer to the value in device memory. @@ -215,7 +215,7 @@ class fixed_width_scalar : public scalar { */ fixed_width_scalar(T value, bool is_valid = true, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -228,7 +228,7 @@ class fixed_width_scalar : public scalar { */ fixed_width_scalar(rmm::device_scalar&& data, bool is_valid = true, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); }; @@ -264,7 +264,7 @@ class numeric_scalar : public detail::fixed_width_scalar { * @param mr Device memory resource to use for device memory allocation. 
*/ numeric_scalar(numeric_scalar const& other, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -277,7 +277,7 @@ class numeric_scalar : public detail::fixed_width_scalar { */ numeric_scalar(T value, bool is_valid = true, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -290,7 +290,7 @@ class numeric_scalar : public detail::fixed_width_scalar { */ numeric_scalar(rmm::device_scalar&& data, bool is_valid = true, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); }; @@ -327,7 +327,7 @@ class fixed_point_scalar : public scalar { * @param mr Device memory resource to use for device memory allocation. 
*/ fixed_point_scalar(fixed_point_scalar const& other, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -342,7 +342,7 @@ class fixed_point_scalar : public scalar { fixed_point_scalar(rep_type value, numeric::scale_type scale, bool is_valid = true, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -355,7 +355,7 @@ class fixed_point_scalar : public scalar { */ fixed_point_scalar(rep_type value, bool is_valid = true, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -368,7 +368,7 @@ class fixed_point_scalar : public scalar { */ fixed_point_scalar(T value, bool is_valid = true, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -383,7 +383,7 @@ class fixed_point_scalar : public scalar { fixed_point_scalar(rmm::device_scalar&& data, numeric::scale_type scale, bool is_valid = true, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -392,7 +392,7 @@ class fixed_point_scalar : public scalar { * @param stream CUDA stream used for device memory operations. 
* @return The value of the scalar */ - rep_type value(rmm::cuda_stream_view stream = cudf::default_stream_value) const; + rep_type value(rmm::cuda_stream_view stream = cudf::get_default_stream()) const; /** * @brief Get the decimal32, decimal64 or decimal128. @@ -400,7 +400,7 @@ class fixed_point_scalar : public scalar { * @param stream CUDA stream used for device memory operations. * @return The decimal32, decimal64 or decimal128 value */ - T fixed_point_value(rmm::cuda_stream_view stream = cudf::default_stream_value) const; + T fixed_point_value(rmm::cuda_stream_view stream = cudf::get_default_stream()) const; /** * @brief Explicit conversion operator to get the value of the scalar on the host. @@ -451,7 +451,7 @@ class string_scalar : public scalar { * @param mr Device memory resource to use for device memory allocation. */ string_scalar(string_scalar const& other, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -464,7 +464,7 @@ class string_scalar : public scalar { */ string_scalar(std::string const& string, bool is_valid = true, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -479,7 +479,7 @@ class string_scalar : public scalar { */ string_scalar(value_type const& source, bool is_valid = true, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -494,7 +494,7 @@ class string_scalar : public scalar { */ string_scalar(rmm::device_scalar& data, bool is_valid = true, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), 
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -510,7 +510,7 @@ class string_scalar : public scalar { */ string_scalar(rmm::device_buffer&& data, bool is_valid = true, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -525,7 +525,7 @@ class string_scalar : public scalar { * @return The value of the scalar in a host std::string */ [[nodiscard]] std::string to_string( - rmm::cuda_stream_view stream = cudf::default_stream_value) const; + rmm::cuda_stream_view stream = cudf::get_default_stream()) const; /** * @brief Get the value of the scalar as a string_view. @@ -533,7 +533,7 @@ class string_scalar : public scalar { * @param stream CUDA stream used for device memory operations. * @return The value of the scalar as a string_view */ - [[nodiscard]] value_type value(rmm::cuda_stream_view stream = cudf::default_stream_value) const; + [[nodiscard]] value_type value(rmm::cuda_stream_view stream = cudf::get_default_stream()) const; /** * @brief Returns the size of the string in bytes. @@ -582,7 +582,7 @@ class chrono_scalar : public detail::fixed_width_scalar { * @param mr Device memory resource to use for device memory allocation. 
*/ chrono_scalar(chrono_scalar const& other, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -595,7 +595,7 @@ class chrono_scalar : public detail::fixed_width_scalar { */ chrono_scalar(T value, bool is_valid = true, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -608,7 +608,7 @@ class chrono_scalar : public detail::fixed_width_scalar { */ chrono_scalar(rmm::device_scalar&& data, bool is_valid = true, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); }; @@ -641,7 +641,7 @@ class timestamp_scalar : public chrono_scalar { * @param mr Device memory resource to use for device memory allocation. */ timestamp_scalar(timestamp_scalar const& other, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -657,7 +657,7 @@ class timestamp_scalar : public chrono_scalar { template timestamp_scalar(Duration2 const& value, bool is_valid, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -696,7 +696,7 @@ class duration_scalar : public chrono_scalar { * @param mr Device memory resource to use for device memory allocation. 
*/ duration_scalar(duration_scalar const& other, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -709,7 +709,7 @@ class duration_scalar : public chrono_scalar { */ duration_scalar(rep_type value, bool is_valid, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -744,7 +744,7 @@ class list_scalar : public scalar { * @param mr Device memory resource to use for device memory allocation. */ list_scalar(list_scalar const& other, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -759,7 +759,7 @@ class list_scalar : public scalar { */ list_scalar(cudf::column_view const& data, bool is_valid = true, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -772,7 +772,7 @@ class list_scalar : public scalar { */ list_scalar(cudf::column&& data, bool is_valid = true, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -809,7 +809,7 @@ class struct_scalar : public scalar { * @param mr Device memory resource to use for device memory allocation. 
*/ struct_scalar(struct_scalar const& other, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -824,7 +824,7 @@ class struct_scalar : public scalar { */ struct_scalar(table_view const& data, bool is_valid = true, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -839,7 +839,7 @@ class struct_scalar : public scalar { */ struct_scalar(host_span data, bool is_valid = true, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -855,7 +855,7 @@ class struct_scalar : public scalar { */ struct_scalar(table&& data, bool is_valid = true, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** diff --git a/cpp/include/cudf/scalar/scalar_factories.hpp b/cpp/include/cudf/scalar/scalar_factories.hpp index b2b52ddc488..78b6c4fd0e9 100644 --- a/cpp/include/cudf/scalar/scalar_factories.hpp +++ b/cpp/include/cudf/scalar/scalar_factories.hpp @@ -43,7 +43,7 @@ namespace cudf { */ std::unique_ptr make_numeric_scalar( data_type type, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -60,7 +60,7 @@ std::unique_ptr make_numeric_scalar( */ std::unique_ptr make_timestamp_scalar( data_type type, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), 
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -77,7 +77,7 @@ std::unique_ptr make_timestamp_scalar( */ std::unique_ptr make_duration_scalar( data_type type, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -94,7 +94,7 @@ std::unique_ptr make_duration_scalar( */ std::unique_ptr make_fixed_width_scalar( data_type type, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -111,7 +111,7 @@ std::unique_ptr make_fixed_width_scalar( */ std::unique_ptr make_string_scalar( std::string const& string, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -126,7 +126,7 @@ std::unique_ptr make_string_scalar( */ std::unique_ptr make_default_constructed_scalar( data_type type, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -141,7 +141,7 @@ std::unique_ptr make_default_constructed_scalar( */ std::unique_ptr make_empty_scalar_like( column_view const& input, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -156,7 +156,7 @@ std::unique_ptr make_empty_scalar_like( template std::unique_ptr make_fixed_width_scalar( T value, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), 
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { return std::make_unique>(value, true, stream, mr); @@ -176,7 +176,7 @@ template std::unique_ptr make_fixed_point_scalar( typename T::rep value, numeric::scale_type scale, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { return std::make_unique>(value, scale, true, stream, mr); @@ -192,7 +192,7 @@ std::unique_ptr make_fixed_point_scalar( */ std::unique_ptr make_list_scalar( column_view elements, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -207,7 +207,7 @@ std::unique_ptr make_list_scalar( */ std::unique_ptr make_struct_scalar( table_view const& data, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -222,7 +222,7 @@ std::unique_ptr make_struct_scalar( */ std::unique_ptr make_struct_scalar( host_span data, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/sorting.hpp b/cpp/include/cudf/sorting.hpp index cf21da1b030..f43089210fd 100644 --- a/cpp/include/cudf/sorting.hpp +++ b/cpp/include/cudf/sorting.hpp @@ -207,9 +207,31 @@ std::unique_ptr rank( /** * @brief Returns sorted order after sorting each segment in the table. * - * If segment_offsets contains values larger than number of rows, behavior is undefined. + * If segment_offsets contains values larger than the number of rows, the behavior is undefined. 
* @throws cudf::logic_error if `segment_offsets` is not `size_type` column. * + * @code{.pseudo} + * Example: + * keys = { {9, 8, 7, 6, 5, 4, 3, 2, 1, 0} } + * offsets = {0, 3, 7, 10} + * result = cudf::segmented_sorted_order(keys, offsets); + * result is { 2,1,0, 6,5,4,3, 9,8,7 } + * @endcode + * + * If segment_offsets is empty or contains a single index, no values are sorted + * and the result is a sequence of integers from 0 to keys.size()-1. + * + * The segment_offsets are not required to include all indices. Any indices + * outside the specified segments will not be sorted. + * + * @code{.pseudo} + * Example: (offsets do not cover all indices) + * keys = { {9, 8, 7, 6, 5, 4, 3, 2, 1, 0} } + * offsets = {3, 7} + * result = cudf::segmented_sorted_order(keys, offsets); + * result is { 0,1,2, 6,5,4,3, 7,8,9 } + * @endcode + * * @param keys The table that determines the ordering of elements in each segment * @param segment_offsets The column of `size_type` type containing start offset index for each * contiguous segment. @@ -246,10 +268,34 @@ std::unique_ptr stable_segmented_sorted_order( /** * @brief Performs a lexicographic segmented sort of a table * - * If segment_offsets contains values larger than number of rows, behavior is undefined. + * If segment_offsets contains values larger than the number of rows, the behavior is undefined. * @throws cudf::logic_error if `values.num_rows() != keys.num_rows()`. * @throws cudf::logic_error if `segment_offsets` is not `size_type` column. * + * @code{.pseudo} + * Example: + * keys = { {9, 8, 7, 6, 5, 4, 3, 2, 1, 0} } + * values = { {'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'} } + * offsets = {0, 3, 7, 10} + * result = cudf::segmented_sort_by_key(keys, values, offsets); + * result is { 'c','b','a', 'g','f','e','d', 'j','i','h' } + * @endcode + * + * If segment_offsets is empty or contains a single index, no values are sorted + * and the result is a copy of the values. 
+ * + * The segment_offsets are not required to include all indices. Any indices + * outside the specified segments will not be sorted. + * + * @code{.pseudo} + * Example: (offsets do not cover all indices) + * keys = { {9, 8, 7, 6, 5, 4, 3, 2, 1, 0} } + * values = { {'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'} } + * offsets = {3, 7} + * result = cudf::segmented_sort_by_key(keys, values, offsets); + * result is { 'a','b','c', 'g','f','e','d', 'h','i','j' } + * @endcode + * * @param values The table to reorder * @param keys The table that determines the ordering of elements in each segment * @param segment_offsets The column of `size_type` type containing start offset index for each diff --git a/cpp/include/cudf/strings/contains.hpp b/cpp/include/cudf/strings/contains.hpp index d95dc2c418c..1718d205871 100644 --- a/cpp/include/cudf/strings/contains.hpp +++ b/cpp/include/cudf/strings/contains.hpp @@ -24,6 +24,9 @@ namespace cudf { namespace strings { + +struct regex_program; + /** * @addtogroup strings_contains * @{ @@ -58,6 +61,32 @@ std::unique_ptr contains_re( regex_flags const flags = regex_flags::DEFAULT, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns a boolean column identifying rows which + * match the given regex_program object + * + * @code{.pseudo} + * Example: + * s = ["abc", "123", "def456"] + * p = regex_program::create("\\d+") + * r = contains_re(s, p) + * r is now [false, true, true] + * @endcode + * + * Any null string entries return corresponding null output column entries. + * + * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. 
+ * + * @param strings Strings instance for this operation + * @param prog Regex program instance + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column of boolean results for each string + */ +std::unique_ptr contains_re( + strings_column_view const& strings, + regex_program const& prog, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Returns a boolean column identifying rows which * matching the given regex pattern but only at the beginning the string. @@ -85,6 +114,32 @@ std::unique_ptr matches_re( regex_flags const flags = regex_flags::DEFAULT, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns a boolean column identifying rows which + * match the given regex_program object but only at the beginning of the string. + * + * @code{.pseudo} + * Example: + * s = ["abc", "123", "def456"] + * p = regex_program::create("\\d+") + * r = matches_re(s, p) + * r is now [false, true, false] + * @endcode + * + * Any null string entries return corresponding null output column entries. + * + * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. + * + * @param strings Strings instance for this operation + * @param prog Regex program instance + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column of boolean results for each string + */ +std::unique_ptr matches_re( + strings_column_view const& strings, + regex_program const& prog, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Returns the number of times the given regex pattern * matches in each string. 
@@ -112,6 +167,32 @@ std::unique_ptr count_re( regex_flags const flags = regex_flags::DEFAULT, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns the number of times the given regex_program's pattern + * matches in each string + * + * @code{.pseudo} + * Example: + * s = ["abc", "123", "def45"] + * p = regex_program::create("\\d") + * r = count_re(s, p) + * r is now [0, 3, 2] + * @endcode + * + * Any null string entries return corresponding null output column entries. + * + * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. + * + * @param strings Strings instance for this operation + * @param prog Regex program instance + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New INT32 column with counts for each string + */ +std::unique_ptr count_re( + strings_column_view const& strings, + regex_program const& prog, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Returns a boolean column identifying rows which * match the given like pattern. diff --git a/cpp/include/cudf/strings/detail/char_tables.hpp b/cpp/include/cudf/strings/detail/char_tables.hpp index 4ea7e3ee952..275b7223a3b 100644 --- a/cpp/include/cudf/strings/detail/char_tables.hpp +++ b/cpp/include/cudf/strings/detail/char_tables.hpp @@ -46,6 +46,7 @@ constexpr uint8_t IS_LOWER(uint8_t x) { return ((x) & (1 << 6)); } constexpr uint8_t IS_SPECIAL(uint8_t x) { return ((x) & (1 << 7)); } constexpr uint8_t IS_ALPHANUM(uint8_t x) { return ((x) & (0x0F)); } constexpr uint8_t IS_UPPER_OR_LOWER(uint8_t x) { return ((x) & ((1 << 5) | (1 << 6))); } +constexpr uint8_t ALL_FLAGS = 0xFF; // Type for the character cases table. 
using character_cases_table_type = uint16_t; diff --git a/cpp/include/cudf/strings/detail/combine.hpp b/cpp/include/cudf/strings/detail/combine.hpp index 7df3a4ce324..3b8ed0f4e0d 100644 --- a/cpp/include/cudf/strings/detail/combine.hpp +++ b/cpp/include/cudf/strings/detail/combine.hpp @@ -34,13 +34,12 @@ namespace detail { * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr concatenate( - table_view const& strings_columns, - string_scalar const& separator, - string_scalar const& narep, - separator_on_nulls separate_nulls = separator_on_nulls::YES, - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr concatenate(table_view const& strings_columns, + string_scalar const& separator, + string_scalar const& narep, + separator_on_nulls separate_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc join_strings(table_view const&,string_scalar const&,string_scalar @@ -48,12 +47,11 @@ std::unique_ptr concatenate( * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ -std::unique_ptr join_strings( - strings_column_view const& strings, - string_scalar const& separator, - string_scalar const& narep, - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr join_strings(strings_column_view const& strings, + string_scalar const& separator, + string_scalar const& narep, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc join_list_elements(table_view const&,string_scalar const&,string_scalar diff --git a/cpp/include/cudf/strings/detail/concatenate.hpp b/cpp/include/cudf/strings/detail/concatenate.hpp index 0df86db60b6..511e240886a 100644 --- a/cpp/include/cudf/strings/detail/concatenate.hpp +++ b/cpp/include/cudf/strings/detail/concatenate.hpp @@ -42,10 +42,9 @@ namespace detail { * @param mr Device memory resource used to allocate the returned column's device memory. * @return New column with concatenated results. */ -std::unique_ptr concatenate( - host_span columns, - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr concatenate(host_span columns, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace strings diff --git a/cpp/include/cudf/strings/detail/copy_if_else.cuh b/cpp/include/cudf/strings/detail/copy_if_else.cuh index 79cec779e02..374c3b2cf68 100644 --- a/cpp/include/cudf/strings/detail/copy_if_else.cuh +++ b/cpp/include/cudf/strings/detail/copy_if_else.cuh @@ -56,13 +56,12 @@ namespace detail { * @return New strings column. 
*/ template -std::unique_ptr copy_if_else( - StringIterLeft lhs_begin, - StringIterLeft lhs_end, - StringIterRight rhs_begin, - Filter filter_fn, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr copy_if_else(StringIterLeft lhs_begin, + StringIterLeft lhs_end, + StringIterRight rhs_begin, + Filter filter_fn, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto strings_count = std::distance(lhs_begin, lhs_end); if (strings_count == 0) return make_empty_column(type_id::STRING); diff --git a/cpp/include/cudf/strings/detail/copy_range.cuh b/cpp/include/cudf/strings/detail/copy_range.cuh index e83f6dc0005..ee09ce9a7a9 100644 --- a/cpp/include/cudf/strings/detail/copy_range.cuh +++ b/cpp/include/cudf/strings/detail/copy_range.cuh @@ -99,14 +99,13 @@ namespace detail { * @return std::unique_ptr The result target column */ template -std::unique_ptr copy_range( - SourceValueIterator source_value_begin, - SourceValidityIterator source_validity_begin, - strings_column_view const& target, - size_type target_begin, - size_type target_end, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr copy_range(SourceValueIterator source_value_begin, + SourceValidityIterator source_validity_begin, + strings_column_view const& target, + size_type target_begin, + size_type target_end, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS( (target_begin >= 0) && (target_begin < target.size()) && (target_end <= target.size()), diff --git a/cpp/include/cudf/strings/detail/copying.hpp b/cpp/include/cudf/strings/detail/copying.hpp index 56e9c35c889..7e82ad4c679 100644 --- a/cpp/include/cudf/strings/detail/copying.hpp +++ b/cpp/include/cudf/strings/detail/copying.hpp @@ -49,12 +49,11 @@ namespace detail { * @param mr Device memory resource used to allocate the returned column's device 
memory. * @return New strings column of size (end-start)/step. */ -std::unique_ptr copy_slice( - strings_column_view const& strings, - size_type start, - size_type end = -1, - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr copy_slice(strings_column_view const& strings, + size_type start, + size_type end, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Returns a new strings column created by shifting the rows by a specified offset. diff --git a/cpp/include/cudf/strings/detail/fill.hpp b/cpp/include/cudf/strings/detail/fill.hpp index e8f9c9ca438..43e3f6198f3 100644 --- a/cpp/include/cudf/strings/detail/fill.hpp +++ b/cpp/include/cudf/strings/detail/fill.hpp @@ -42,13 +42,12 @@ namespace detail { * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column. */ -std::unique_ptr fill( - strings_column_view const& strings, - size_type begin, - size_type end, - string_scalar const& value, - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr fill(strings_column_view const& strings, + size_type begin, + size_type end, + string_scalar const& value, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace strings diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index dfc8f0dacc5..28b98eac3b5 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -288,12 +288,11 @@ std::unique_ptr gather_chars(StringIterator strings_begin, * @return New strings column containing the gathered strings. 
*/ template -std::unique_ptr gather( - strings_column_view const& strings, - MapIterator begin, - MapIterator end, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr gather(strings_column_view const& strings, + MapIterator begin, + MapIterator end, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto const output_count = std::distance(begin, end); auto const strings_count = strings.size(); @@ -306,7 +305,7 @@ std::unique_ptr gather( auto const d_in_offsets = (strings_count > 0) ? strings.offsets_begin() : nullptr; auto const d_strings = column_device_view::create(strings.parent(), stream); thrust::transform( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), begin, end, d_out_offsets, @@ -318,7 +317,7 @@ std::unique_ptr gather( // check total size is not too large size_t const total_bytes = thrust::transform_reduce( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), d_out_offsets, d_out_offsets + output_count, [] __device__(auto size) { return static_cast(size); }, @@ -328,8 +327,10 @@ std::unique_ptr gather( "total size of output strings is too large for a cudf column"); // In-place convert output sizes into offsets - thrust::exclusive_scan( - rmm::exec_policy(stream), d_out_offsets, d_out_offsets + output_count + 1, d_out_offsets); + thrust::exclusive_scan(rmm::exec_policy_nosync(stream), + d_out_offsets, + d_out_offsets + output_count + 1, + d_out_offsets); // build chars column cudf::device_span const d_out_offsets_span(d_out_offsets, output_count + 1); @@ -372,13 +373,12 @@ std::unique_ptr gather( * @return New strings column containing the gathered strings. 
*/ template -std::unique_ptr gather( - strings_column_view const& strings, - MapIterator begin, - MapIterator end, - bool nullify_out_of_bounds, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr gather(strings_column_view const& strings, + MapIterator begin, + MapIterator end, + bool nullify_out_of_bounds, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (nullify_out_of_bounds) return gather(strings, begin, end, stream, mr); return gather(strings, begin, end, stream, mr); diff --git a/cpp/include/cudf/strings/detail/json.hpp b/cpp/include/cudf/strings/detail/json.hpp index 90188910c7d..0fb06d36570 100644 --- a/cpp/include/cudf/strings/detail/json.hpp +++ b/cpp/include/cudf/strings/detail/json.hpp @@ -16,6 +16,8 @@ #pragma once +#include +#include #include #include @@ -30,12 +32,11 @@ namespace detail { * * @param stream CUDA stream used for device memory operations and kernel launches */ -std::unique_ptr get_json_object( - cudf::strings_column_view const& col, - cudf::string_scalar const& json_path, - get_json_object_options options, - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr get_json_object(cudf::strings_column_view const& col, + cudf::string_scalar const& json_path, + cudf::strings::get_json_object_options options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace strings diff --git a/cpp/include/cudf/strings/detail/pad_impl.cuh b/cpp/include/cudf/strings/detail/pad_impl.cuh new file mode 100644 index 00000000000..648c240bfbc --- /dev/null +++ b/cpp/include/cudf/strings/detail/pad_impl.cuh @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2020-2022, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +namespace cudf { +namespace strings { +namespace detail { + +/** + * @brief Return the size in bytes of padding d_str to width characters using a fill character + * with byte length of fill_char_size + * + * Pad does not perform truncation. That is, if `d_str.length() > width` then `d_str.size_bytes()` + * is returned. + * + * @param d_str String to pad + * @param width Number of characters for the padded string result + * @param fill_char_size Size of the fill character in bytes + * @return The number of bytes required for the padded string + */ +__device__ size_type compute_padded_size(string_view d_str, + size_type width, + size_type fill_char_size) +{ + auto const length = d_str.length(); + auto bytes = d_str.size_bytes(); + if (width > length) // no truncating; + bytes += fill_char_size * (width - length); // add padding + return bytes; +} + +/** + * @brief Pad d_str with fill_char into output up to width characters + * + * Pad does not perform truncation. That is, if `d_str.length() > width` then + * d_str is copied into output. 
+ * + * @tparam side Specifies where fill_char is added to d_str + * @param d_str String to pad + * @param width Number of characters for the padded string result + * @param fill_char Character used to pad the string + * @param output Device memory to copy the padded string into + */ +template +__device__ void pad_impl(cudf::string_view d_str, + cudf::size_type width, + cudf::char_utf8 fill_char, + char* output) +{ + auto length = d_str.length(); + if constexpr (side == side_type::LEFT) { + while (length++ < width) { + output += from_char_utf8(fill_char, output); + } + copy_string(output, d_str); + } + if constexpr (side == side_type::RIGHT) { + output = copy_string(output, d_str); + while (length++ < width) { + output += from_char_utf8(fill_char, output); + } + } + if constexpr (side == side_type::BOTH) { + auto const pad_size = width - length; + // an odd width will right-justify + auto right_pad = (width % 2) ? pad_size / 2 : (pad_size - pad_size / 2); + auto left_pad = pad_size - right_pad; // e.g. width=7: "++foxx+"; width=6: "+fox++" + while (left_pad-- > 0) { + output += from_char_utf8(fill_char, output); + } + output = copy_string(output, d_str); + while (right_pad-- > 0) { + output += from_char_utf8(fill_char, output); + } + } +} + +/** + * @brief Prepend d_str with '0' into output up to width characters + * + * Pad does not perform truncation. That is, if `d_str.length() > width` then + * d_str is copied into output. + * + * If d_str starts with a sign character ('-' or '+') then '0' padding + * starts after the sign. 
+ * + * @param d_str String to pad + * @param width Number of characters for the padded string result + * @param output Device memory to copy the padded string into + */ +__device__ void zfill_impl(cudf::string_view d_str, cudf::size_type width, char* output) +{ + auto length = d_str.length(); + auto in_ptr = d_str.data(); + // if the string starts with a sign, output the sign first + if (!d_str.empty() && (*in_ptr == '-' || *in_ptr == '+')) { + *output++ = *in_ptr++; + d_str = cudf::string_view{in_ptr, d_str.size_bytes() - 1}; + } + while (length++ < width) + *output++ = '0'; // prepend zero char + copy_string(output, d_str); +} + +} // namespace detail +} // namespace strings +} // namespace cudf diff --git a/cpp/include/cudf/strings/detail/replace.hpp b/cpp/include/cudf/strings/detail/replace.hpp index ce1d5e8a925..aa6fb2feb3d 100644 --- a/cpp/include/cudf/strings/detail/replace.hpp +++ b/cpp/include/cudf/strings/detail/replace.hpp @@ -43,13 +43,12 @@ enum class replace_algorithm { * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ template -std::unique_ptr replace( - strings_column_view const& strings, - string_scalar const& target, - string_scalar const& repl, - int32_t maxrepl = -1, - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr replace(strings_column_view const& strings, + string_scalar const& target, + string_scalar const& repl, + int32_t maxrepl, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::strings::replace_slice(strings_column_view const&, string_scalar const&, @@ -57,13 +56,12 @@ std::unique_ptr replace( * * @param[in] stream CUDA stream used for device memory operations and kernel launches. 
*/ -std::unique_ptr replace_slice( - strings_column_view const& strings, - string_scalar const& repl = string_scalar(""), - size_type start = 0, - size_type stop = -1, - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr replace_slice(strings_column_view const& strings, + string_scalar const& repl, + size_type start, + size_type stop, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::strings::replace(strings_column_view const&, strings_column_view const&, @@ -71,12 +69,11 @@ std::unique_ptr replace_slice( * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr replace( - strings_column_view const& strings, - strings_column_view const& targets, - strings_column_view const& repls, - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr replace(strings_column_view const& strings, + strings_column_view const& targets, + strings_column_view const& repls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Replaces any null string entries with the given string. @@ -96,11 +93,10 @@ std::unique_ptr replace( * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column. 
*/ -std::unique_ptr replace_nulls( - strings_column_view const& strings, - string_scalar const& repl = string_scalar(""), - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr replace_nulls(strings_column_view const& strings, + string_scalar const& repl, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace strings diff --git a/cpp/include/cudf/strings/detail/scatter.cuh b/cpp/include/cudf/strings/detail/scatter.cuh index d430f390f10..55dd5bda260 100644 --- a/cpp/include/cudf/strings/detail/scatter.cuh +++ b/cpp/include/cudf/strings/detail/scatter.cuh @@ -57,18 +57,18 @@ namespace detail { * @return New strings column. */ template -std::unique_ptr scatter( - SourceIterator begin, - SourceIterator end, - MapIterator scatter_map, - strings_column_view const& target, - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr scatter(SourceIterator begin, + SourceIterator end, + MapIterator scatter_map, + strings_column_view const& target, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (target.is_empty()) return make_empty_column(type_id::STRING); // create vector of string_view's to scatter into - rmm::device_uvector target_vector = create_string_vector_from_column(target, stream); + rmm::device_uvector target_vector = + create_string_vector_from_column(target, stream, rmm::mr::get_current_device_resource()); // this ensures empty strings are not mapped to nulls in the make_strings_column function auto const size = thrust::distance(begin, end); @@ -76,7 +76,8 @@ std::unique_ptr scatter( begin, [] __device__(string_view const sv) { return sv.empty() ? 
string_view{} : sv; }); // do the scatter - thrust::scatter(rmm::exec_policy(stream), itr, itr + size, scatter_map, target_vector.begin()); + thrust::scatter( + rmm::exec_policy_nosync(stream), itr, itr + size, scatter_map, target_vector.begin()); // build the output column auto sv_span = cudf::device_span(target_vector); diff --git a/cpp/include/cudf/strings/detail/utilities.cuh b/cpp/include/cudf/strings/detail/utilities.cuh index 592f2128d0e..76e5f931981 100644 --- a/cpp/include/cudf/strings/detail/utilities.cuh +++ b/cpp/include/cudf/strings/detail/utilities.cuh @@ -50,11 +50,10 @@ namespace detail { * @return offsets child column for strings column */ template -std::unique_ptr make_offsets_child_column( - InputIterator begin, - InputIterator end, - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr make_offsets_child_column(InputIterator begin, + InputIterator end, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(begin < end, "Invalid iterator range"); auto count = thrust::distance(begin, end); @@ -117,12 +116,11 @@ __device__ inline char* copy_string(char* buffer, const string_view& d_string) * @return offsets child column and chars child column for a strings column */ template -auto make_strings_children( - SizeAndExecuteFunction size_and_exec_fn, - size_type exec_size, - size_type strings_count, - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn, + size_type exec_size, + size_type strings_count, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto offsets_column = make_numeric_column( data_type{type_id::INT32}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); @@ -175,11 +173,10 @@ auto make_strings_children( * @return offsets 
child column and chars child column for a strings column */ template -auto make_strings_children( - SizeAndExecuteFunction size_and_exec_fn, - size_type strings_count, - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn, + size_type strings_count, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return make_strings_children(size_and_exec_fn, strings_count, strings_count, stream, mr); } diff --git a/cpp/include/cudf/strings/detail/utilities.hpp b/cpp/include/cudf/strings/detail/utilities.hpp index ceae93dfe84..41a2654dce3 100644 --- a/cpp/include/cudf/strings/detail/utilities.hpp +++ b/cpp/include/cudf/strings/detail/utilities.hpp @@ -36,10 +36,9 @@ namespace detail { * @param mr Device memory resource used to allocate the returned column's device memory. * @return The chars child column for a strings column. */ -std::unique_ptr create_chars_child_column( - size_type bytes, - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr create_chars_child_column(size_type bytes, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Creates a string_view vector from a strings column. 
@@ -51,8 +50,8 @@ std::unique_ptr create_chars_child_column( */ rmm::device_uvector create_string_vector_from_column( cudf::strings_column_view const strings, - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace strings diff --git a/cpp/include/cudf/strings/extract.hpp b/cpp/include/cudf/strings/extract.hpp index a30098bedb9..a80d971438d 100644 --- a/cpp/include/cudf/strings/extract.hpp +++ b/cpp/include/cudf/strings/extract.hpp @@ -23,6 +23,9 @@ namespace cudf { namespace strings { + +struct regex_program; + /** * @addtogroup strings_substring * @{ @@ -61,6 +64,37 @@ std::unique_ptr
extract( regex_flags const flags = regex_flags::DEFAULT, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns a table of strings columns where each column corresponds to the matching + * group specified in the given regex_program object + * + * All the strings for the first group will go in the first output column; the second group + * go in the second column and so on. Null entries are added to the columns in row `i` if + * the string at row `i` does not match. + * + * Any null string entries return corresponding null output column entries. + * + * @code{.pseudo} + * Example: + * s = ["a1", "b2", "c3"] + * p = regex_program::create("([ab])(\\d)") + * r = extract(s, p) + * r is now [ ["a", "b", null], + * ["1", "2", null] ] + * @endcode + * + * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. + * + * @param strings Strings instance for this operation + * @param prog Regex program instance + * @param mr Device memory resource used to allocate the returned table's device memory + * @return Columns of strings extracted from the input column + */ +std::unique_ptr
extract( + strings_column_view const& strings, + regex_program const& prog, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Returns a lists column of strings where each string column row corresponds to the * matching group specified in the given regular expression pattern. @@ -96,6 +130,40 @@ std::unique_ptr extract_all_record( regex_flags const flags = regex_flags::DEFAULT, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns a lists column of strings where each string column row corresponds to the + * matching group specified in the given regex_program object + * + * All the matching groups for the first row will go in the first row output column; the second + * row results will go into the second row output column and so on. + * + * A null output row will result if the corresponding input string row does not match or + * that input row is null. + * + * @code{.pseudo} + * Example: + * s = ["a1 b4", "b2", "c3 a5", "b", null] + * p = regex_program::create("([ab])(\\d)") + * r = extract_all_record(s, p) + * r is now [ ["a", "1", "b", "4"], + * ["b", "2"], + * ["a", "5"], + * null, + * null ] + * @endcode + * + * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. 
+ * + * @param strings Strings instance for this operation + * @param prog Regex program instance + * @param mr Device memory resource used to allocate any returned device memory + * @return Lists column containing strings extracted from the input column + */ +std::unique_ptr extract_all_record( + strings_column_view const& strings, + regex_program const& prog, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of doxygen group } // namespace strings } // namespace cudf diff --git a/cpp/include/cudf/strings/findall.hpp b/cpp/include/cudf/strings/findall.hpp index 6969ba35b1b..366e1eb0482 100644 --- a/cpp/include/cudf/strings/findall.hpp +++ b/cpp/include/cudf/strings/findall.hpp @@ -23,6 +23,9 @@ namespace cudf { namespace strings { + +struct regex_program; + /** * @addtogroup strings_contains * @{ @@ -63,6 +66,39 @@ std::unique_ptr findall( regex_flags const flags = regex_flags::DEFAULT, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns a lists column of strings for each matching occurrence using + * the regex_program pattern within each string + * + * Each output row includes all the substrings within the corresponding input row + * that match the given pattern. If no matches are found, the output row is empty. + * + * @code{.pseudo} + * Example: + * s = ["bunny", "rabbit", "hare", "dog"] + * p = regex_program::create("[ab]") + * r = findall(s, p) + * r is now a lists column like: + * [ ["b"] + * ["a","b","b"] + * ["a"] + * [] ] + * @endcode + * + * A null output row occurs if the corresponding input row is null. + * + * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. 
+ * + * @param input Strings instance for this operation + * @param prog Regex program instance + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New lists column of strings + */ +std::unique_ptr findall( + strings_column_view const& input, + regex_program const& prog, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of doxygen group } // namespace strings } // namespace cudf diff --git a/cpp/include/cudf/strings/regex/flags.hpp b/cpp/include/cudf/strings/regex/flags.hpp index 3a7051345fa..44ca68439e7 100644 --- a/cpp/include/cudf/strings/regex/flags.hpp +++ b/cpp/include/cudf/strings/regex/flags.hpp @@ -21,7 +21,7 @@ namespace cudf { namespace strings { /** - * @addtogroup strings_contains + * @addtogroup strings_regex * @{ */ diff --git a/cpp/include/cudf/strings/regex/regex_program.hpp b/cpp/include/cudf/strings/regex/regex_program.hpp new file mode 100644 index 00000000000..2b606393719 --- /dev/null +++ b/cpp/include/cudf/strings/regex/regex_program.hpp @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +#include +#include + +namespace cudf { +namespace strings { + +/** + * @addtogroup strings_regex + * @{ + */ + +/** + * @brief Regex program class + * + * Create an instance from a regex pattern and use it to call the appropriate + * strings APIs. 
An instance can be reused. + * + * See the @ref md_regex "Regex Features" page for details on patterns and APIs that support regex. + */ +struct regex_program { + struct regex_program_impl; + + /** + * @brief Create a program from a pattern + * + * @throw cudf::logic_error If pattern is invalid or contains unsupported features + * + * @param pattern Regex pattern + * @param flags Regex flags for interpreting special characters in the pattern + * @param capture Controls how capture groups in the pattern are used + * @return Instance of this object + */ + static std::unique_ptr create(std::string_view pattern, + regex_flags flags = regex_flags::DEFAULT, + capture_groups capture = capture_groups::EXTRACT); + + /** + * @brief Move constructor + * + * @param other Object to move from + */ + regex_program(regex_program&& other); + + /** + * @brief Move assignment operator + * + * @param other Object to move from + * @return this object + */ + regex_program& operator=(regex_program&& other); + + /** + * @brief Return the pattern used to create this instance + * + * @return regex pattern as a string + */ + std::string pattern() const; + + /** + * @brief Return the regex_flags used to create this instance + * + * @return regex flags setting + */ + regex_flags flags() const; + + /** + * @brief Return the capture_groups used to create this instance + * + * @return capture groups setting + */ + capture_groups capture() const; + + /** + * @brief Return the number of instructions in this instance + * + * @return Number of instructions + */ + int32_t instructions_count() const; + + /** + * @brief Return the number of capture groups in this instance + * + * @return Number of groups + */ + int32_t groups_count() const; + + /** + * @brief Return the size of the working memory needed to process the given number of strings + * + * @param num_strings Number of strings for computation + * @return Size of the working memory in bytes + */ + std::size_t compute_working_memory_size(int32_t num_strings) const; + +
~regex_program(); + + private: + regex_program() = delete; + + std::string _pattern; + regex_flags _flags; + capture_groups _capture; + + std::unique_ptr _impl; + + /** + * @brief Constructor + * + * Called by create() + */ + regex_program(std::string_view pattern, regex_flags flags, capture_groups capture); + + friend struct regex_device_builder; +}; + +/** @} */ // end of doxygen group +} // namespace strings +} // namespace cudf diff --git a/cpp/include/cudf/strings/replace_re.hpp b/cpp/include/cudf/strings/replace_re.hpp index d80b9a89b81..60c66956fb8 100644 --- a/cpp/include/cudf/strings/replace_re.hpp +++ b/cpp/include/cudf/strings/replace_re.hpp @@ -26,6 +26,9 @@ namespace cudf { namespace strings { + +struct regex_program; + /** * @addtogroup strings_replace * @{ @@ -58,6 +61,30 @@ std::unique_ptr replace_re( regex_flags const flags = regex_flags::DEFAULT, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief For each string, replaces any character sequence matching the given regex + * with the provided replacement string. + * + * Any null string entries return corresponding null output column entries. + * + * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. + * + * @param strings Strings instance for this operation + * @param prog Regex program instance + * @param replacement The string used to replace the matched sequence in each string. + * Default is an empty string. + * @param max_replace_count The maximum number of times to replace the matched pattern + * within each string. Default replaces every substring that is matched. 
+ * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column + */ +std::unique_ptr replace_re( + strings_column_view const& strings, + regex_program const& prog, + string_scalar const& replacement = string_scalar(""), + std::optional max_replace_count = std::nullopt, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief For each string, replaces any character sequence matching the given patterns * with the corresponding string in the `replacements` column. @@ -105,5 +132,28 @@ std::unique_ptr replace_with_backrefs( regex_flags const flags = regex_flags::DEFAULT, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief For each string, replaces any character sequence matching the given regex + * using the replacement template for back-references. + * + * Any null string entries return corresponding null output column entries. + * + * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. 
+ * + * @throw cudf::logic_error if capture index values in `replacement` are not in range 0-99, and also + * if the index exceeds the group count specified in the pattern + * + * @param strings Strings instance for this operation + * @param prog Regex program instance + * @param replacement The replacement template for creating the output string + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column + */ +std::unique_ptr replace_with_backrefs( + strings_column_view const& strings, + regex_program const& prog, + std::string_view replacement, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + } // namespace strings } // namespace cudf diff --git a/cpp/include/cudf/strings/split/split_re.hpp b/cpp/include/cudf/strings/split/split_re.hpp index 6fe07b0f5dc..c6bd1345ae6 100644 --- a/cpp/include/cudf/strings/split/split_re.hpp +++ b/cpp/include/cudf/strings/split/split_re.hpp @@ -23,6 +23,9 @@ namespace cudf { namespace strings { + +struct regex_program; + /** * @addtogroup strings_split * @{ @@ -77,6 +80,58 @@ std::unique_ptr
split_re( size_type maxsplit = -1, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Splits strings elements into a table of strings columns + * using a regex_program's pattern to delimit each string + * + * Each element generates a vector of strings that are stored in corresponding + * rows in the output table -- `table[col,row] = token[col] of strings[row]` + * where `token` is a substring between delimiters. + * + * The number of rows in the output table will be the same as the number of + * elements in the input column. The resulting number of columns will be the + * maximum number of tokens found in any input row. + * + * The `pattern` is used to identify the delimiters within a string + * and splitting stops when either `maxsplit` or the end of the string is reached. + * + * An empty input string will produce a corresponding empty string in the + * corresponding row of the first column. + * A null row will produce corresponding null rows in the output table. + * + * The regex_program's regex_flags are ignored. + * + * @code{.pseudo} + * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "] + * p1 = regex_program::create("[_ ]") + * s1 = split_re(s, p1) + * s1 is a table of strings columns: + * [ ["a", "a", "", "ab"], + * ["bc", "", "ab", "cd"], + * ["def", "bc", "cd", ""], + * ["g", null, null, null] ] + * p2 = regex_program::create("[ _]") + * s2 = split_re(s, p2, 1) + * s2 is a table of strings columns: + * [ ["a", "a", "", "ab"], + * ["bc def_g", "_bc", "ab cd", "cd "] ] + * @endcode + * + * @throw cudf::logic_error if `pattern` is empty. + * + * @param input A column of string elements to be split + * @param prog Regex program instance + * @param maxsplit Maximum number of splits to perform. + * Default of -1 indicates all possible splits on each string. + * @param mr Device memory resource used to allocate the returned result's device memory + * @return A table of columns of strings + */ +std::unique_ptr
split_re( + strings_column_view const& input, + regex_program const& prog, + size_type maxsplit = -1, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Splits strings elements into a table of strings columns * using a regex pattern to delimit each string starting from the end of the string. @@ -127,6 +182,60 @@ std::unique_ptr
rsplit_re( size_type maxsplit = -1, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Splits strings elements into a table of strings columns using a + * regex_program's pattern to delimit each string starting from the end of the string + * + * Each element generates a vector of strings that are stored in corresponding + * rows in the output table -- `table[col,row] = token[col] of string[row]` + * where `token` is the substring between each delimiter. + * + * The number of rows in the output table will be the same as the number of + * elements in the input column. The resulting number of columns will be the + * maximum number of tokens found in any input row. + * + * Splitting occurs by traversing starting from the end of the input string. + * The `pattern` is used to identify the delimiters within a string + * and splitting stops when either `maxsplit` or the beginning of the string + * is reached. + * + * An empty input string will produce a corresponding empty string in the + * corresponding row of the first column. + * A null row will produce corresponding null rows in the output table. + * + * The regex_program's regex_flags are ignored. + * + * @code{.pseudo} + * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "] + * p1 = regex_program::create("[_ ]") + * s1 = rsplit_re(s, p1) + * s1 is a table of strings columns: + * [ ["a", "a", "", "ab"], + * ["bc", "", "ab", "cd"], + * ["def", "bc", "cd", ""], + * ["g", null, null, null] ] + * p2 = regex_program::create("[ _]") + * s2 = rsplit_re(s, p2, 1) + * s2 is a table of strings columns: + * [ ["a_bc def", "a_", "_ab", "ab_cd"], + * ["g", "bc", "cd", ""] ] + * @endcode + * + * @throw cudf::logic_error if `pattern` is empty. + * + * @param input A column of string elements to be split. + * @param prog Regex program instance + * @param maxsplit Maximum number of splits to perform. + * Default of -1 indicates all possible splits on each string.
+ * @param mr Device memory resource used to allocate the returned result's device memory. + * @return A table of columns of strings. + */ +std::unique_ptr
rsplit_re( + strings_column_view const& input, + regex_program const& prog, + size_type maxsplit = -1, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Splits strings elements into a list column of strings * using the given regex pattern to delimit each string. @@ -179,6 +288,62 @@ std::unique_ptr split_record_re( size_type maxsplit = -1, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Splits strings elements into a list column of strings + * using the given regex_program to delimit each string + * + * Each element generates an array of strings that are stored in an output + * lists column -- `list[row] = [token1, token2, ...] found in input[row]` + * where `token` is a substring between delimiters. + * + * The number of elements in the output column will be the same as the number of + * elements in the input column. Each individual list item will contain the + * new strings for that row. The resulting number of strings in each row can vary + * from 0 to `maxsplit + 1`. + * + * The `pattern` is used to identify the delimiters within a string + * and splitting stops when either `maxsplit` or the end of the string is reached. + * + * An empty input string will produce a corresponding empty list item output row. + * A null row will produce a corresponding null output row. + * + * The regex_program's regex_flags are ignored. + * + * @code{.pseudo} + * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "] + * p1 = regex_program::create("[_ ]") + * s1 = split_record_re(s, p1) + * s1 is a lists column of strings: + * [ ["a", "bc", "def", "g"], + * ["a", "", "bc"], + * ["", "ab", "cd"], + * ["ab", "cd", ""] ] + * p2 = regex_program::create("[ _]") + * s2 = split_record_re(s, p2, 1) + * s2 is a lists column of strings: + * [ ["a", "bc def_g"], + * ["a", "_bc"], + * ["", "ab cd"], + * ["ab", "cd "] ] + * @endcode + * + * @throw cudf::logic_error if `pattern` is empty. 
+ * + * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. + * + * @param input A column of string elements to be split + * @param prog Regex program instance + * @param maxsplit Maximum number of splits to perform. + * Default of -1 indicates all possible splits on each string. + * @param mr Device memory resource used to allocate the returned result's device memory + * @return Lists column of strings. + */ +std::unique_ptr split_record_re( + strings_column_view const& input, + regex_program const& prog, + size_type maxsplit = -1, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Splits strings elements into a list column of strings * using the given regex pattern to delimit each string starting from the end of the string. @@ -233,6 +398,64 @@ std::unique_ptr rsplit_record_re( size_type maxsplit = -1, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Splits strings elements into a list column of strings using the given + * regex_program to delimit each string starting from the end of the string + * + * Each element generates a vector of strings that are stored in an output + * lists column -- `list[row] = [token1, token2, ...] found in input[row]` + * where `token` is a substring between delimiters. + * + * The number of elements in the output column will be the same as the number of + * elements in the input column. Each individual list item will contain the + * new strings for that row. The resulting number of strings in each row can vary + * from 0 to `maxsplit + 1`. + * + * Splitting occurs by traversing starting from the end of the input string. + * The `pattern` is used to identify the separation points within a string + * and splitting stops when either `maxsplit` or the beginning of the string + * is reached. + * + * An empty input string will produce a corresponding empty list item output row. 
+ * A null row will produce a corresponding null output row. + * + * The regex_program's regex_flags are ignored. + * + * @code{.pseudo} + * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "] + * p1 = regex_program::create("[_ ]") + * s1 = rsplit_record_re(s, p1) + * s1 is a lists column of strings: + * [ ["a", "bc", "def", "g"], + * ["a", "", "bc"], + * ["", "ab", "cd"], + * ["ab", "cd", ""] ] + * p2 = regex_program::create("[ _]") + * s2 = rsplit_record_re(s, p2, 1) + * s2 is a lists column of strings: + * [ ["a_bc def", "g"], + * ["a_", "bc"], + * ["_ab", "cd"], + * ["ab_cd", ""] ] + * @endcode + * + * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. + * + * @throw cudf::logic_error if `pattern` is empty. + * + * @param input A column of string elements to be split + * @param prog Regex program instance + * @param maxsplit Maximum number of splits to perform. + * Default of -1 indicates all possible splits on each string. + * @param mr Device memory resource used to allocate the returned result's device memory + * @return Lists column of strings + */ +std::unique_ptr rsplit_record_re( + strings_column_view const& input, + regex_program const& prog, + size_type maxsplit = -1, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of doxygen group } // namespace strings } // namespace cudf diff --git a/cpp/include/cudf/strings/string_view.hpp b/cpp/include/cudf/strings/string_view.hpp index 03bf538b1b2..265adc60392 100644 --- a/cpp/include/cudf/strings/string_view.hpp +++ b/cpp/include/cudf/strings/string_view.hpp @@ -17,6 +17,8 @@ #include +#include + #include /** diff --git a/cpp/include/cudf/table/experimental/row_operators.cuh b/cpp/include/cudf/table/experimental/row_operators.cuh index af7091fc00c..e7b0c6eb6b6 100644 --- a/cpp/include/cudf/table/experimental/row_operators.cuh +++ b/cpp/include/cudf/table/experimental/row_operators.cuh @@ -820,7 +820,7 @@ class self_comparator 
{ self_comparator(table_view const& t, host_span column_order = {}, host_span null_precedence = {}, - rmm::cuda_stream_view stream = cudf::default_stream_value) + rmm::cuda_stream_view stream = cudf::get_default_stream()) : d_t{preprocessed_table::create(t, column_order, null_precedence, stream)} { } @@ -962,7 +962,7 @@ class two_table_comparator { table_view const& right, host_span column_order = {}, host_span null_precedence = {}, - rmm::cuda_stream_view stream = cudf::default_stream_value); + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Construct an owning object for performing a lexicographic comparison between two rows of diff --git a/cpp/include/cudf/table/table.hpp b/cpp/include/cudf/table/table.hpp index 3b803c2b949..6d11ed0bfad 100644 --- a/cpp/include/cudf/table/table.hpp +++ b/cpp/include/cudf/table/table.hpp @@ -69,7 +69,7 @@ class table { * @param mr Device memory resource used for allocating the device memory for the new columns */ table(table_view view, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** diff --git a/cpp/include/cudf/table/table_device_view.cuh b/cpp/include/cudf/table/table_device_view.cuh index 9f6930b57f5..511013b585d 100644 --- a/cpp/include/cudf/table/table_device_view.cuh +++ b/cpp/include/cudf/table/table_device_view.cuh @@ -175,7 +175,7 @@ class table_device_view : public detail::table_device_view_basedestroy(); }; return std::unique_ptr{ @@ -212,7 +212,7 @@ class mutable_table_device_view * available in device memory */ static auto create(mutable_table_view source_view, - rmm::cuda_stream_view stream = cudf::default_stream_value) + rmm::cuda_stream_view stream = cudf::get_default_stream()) { auto deleter = [](mutable_table_device_view* t) { t->destroy(); }; return std::unique_ptr{ diff --git a/cpp/include/cudf/tdigest/tdigest_column_view.cuh 
b/cpp/include/cudf/tdigest/tdigest_column_view.hpp similarity index 83% rename from cpp/include/cudf/tdigest/tdigest_column_view.cuh rename to cpp/include/cudf/tdigest/tdigest_column_view.hpp index 64371fd5c45..c63e2b16326 100644 --- a/cpp/include/cudf/tdigest/tdigest_column_view.cuh +++ b/cpp/include/cudf/tdigest/tdigest_column_view.hpp @@ -16,30 +16,11 @@ #pragma once #include -#include #include namespace cudf { namespace tdigest { -/** - * @brief Functor to compute the size of each tdigest of a column. - * - */ -struct tdigest_size { - size_type const* offsets; ///< Offsets of the t-digest column - /** - * @brief Returns size of the each tdigest in the column - * - * @param tdigest_index Index of the tdigest in the column - * @return Size of the tdigest - */ - __device__ size_type operator()(size_type tdigest_index) - { - return offsets[tdigest_index + 1] - offsets[tdigest_index]; - } -}; - /** * @brief Given a column_view containing tdigest data, an instance of this class * provides a wrapper on the compound column for tdigest operations. @@ -127,18 +108,6 @@ class tdigest_column_view : private column_view { */ [[nodiscard]] column_view weights() const; - /** - * @brief Returns an iterator that returns the size of each tdigest - * in the column (each row is 1 digest) - * - * @return An iterator that returns the size of each tdigest in the column - */ - [[nodiscard]] auto size_begin() const - { - return cudf::detail::make_counting_transform_iterator( - 0, tdigest_size{centroids().offsets_begin()}); - } - /** * @brief Returns the first min value for the column. Each row corresponds * to the minimum value for the accompanying digest. 
diff --git a/cpp/include/cudf/utilities/default_stream.hpp b/cpp/include/cudf/utilities/default_stream.hpp index 94bc01787e3..1eec3b994d0 100644 --- a/cpp/include/cudf/utilities/default_stream.hpp +++ b/cpp/include/cudf/utilities/default_stream.hpp @@ -16,21 +16,19 @@ #pragma once +#include + +#include #include namespace cudf { /** - * @brief Default stream for cudf + * @brief Get the current default stream * - * Use this value to ensure the correct stream is used when compiled with per - * thread default stream. + * @return The current default stream. */ -#if defined(CUDF_USE_PER_THREAD_DEFAULT_STREAM) -static const rmm::cuda_stream_view default_stream_value{rmm::cuda_stream_per_thread}; -#else -static constexpr rmm::cuda_stream_view default_stream_value{}; -#endif +rmm::cuda_stream_view const get_default_stream(); /** * @brief Check if per-thread default stream is enabled. diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp index dcb9786bbd2..074e8d25bf7 100644 --- a/cpp/include/cudf/utilities/span.hpp +++ b/cpp/include/cudf/utilities/span.hpp @@ -226,7 +226,7 @@ struct host_span : public cudf::detail::span_base().data()))> (*)[], T (*)[]>>* = nullptr> - constexpr host_span(C& in) : base(in.data(), in.size()) + constexpr host_span(C& in) : base(thrust::raw_pointer_cast(in.data()), in.size()) { } @@ -239,7 +239,7 @@ struct host_span : public cudf::detail::span_base().data()))> (*)[], T (*)[]>>* = nullptr> - constexpr host_span(C const& in) : base(in.data(), in.size()) + constexpr host_span(C const& in) : base(thrust::raw_pointer_cast(in.data()), in.size()) { } diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp index d95ea42a039..43d43ba6bb3 100644 --- a/cpp/include/cudf/utilities/traits.hpp +++ b/cpp/include/cudf/utilities/traits.hpp @@ -261,6 +261,30 @@ constexpr inline bool is_signed_iterator() return std::is_signed_v::value_type>; } +/** + * @brief Indicates whether the type `T` is an 
integral type. + * + * @tparam T The type to verify + * @return true `T` is integral + * @return false `T` is not integral + */ +template +constexpr inline bool is_integral() +{ + return cuda::std::is_integral_v; +} + +/** + * @brief Indicates whether `type` is an integral `data_type`. + * + * "Integral" types are fundamental integer types such as `INT*` and `UINT*`. + * + * @param type The `data_type` to verify + * @return true `type` is integral + * @return false `type` is not integral + */ +bool is_integral(data_type type); + /** * @brief Indicates whether the type `T` is a floating point type. * diff --git a/cpp/include/cudf_test/base_fixture.hpp b/cpp/include/cudf_test/base_fixture.hpp index e529785a758..be4d5bccd7b 100644 --- a/cpp/include/cudf_test/base_fixture.hpp +++ b/cpp/include/cudf_test/base_fixture.hpp @@ -18,12 +18,15 @@ #include +#include #include #include #include #include #include +#include +#include #include #include #include @@ -303,11 +306,18 @@ inline auto parse_cudf_test_opts(int argc, char** argv) try { cxxopts::Options options(argv[0], " - cuDF tests command line options"); const char* env_rmm_mode = std::getenv("GTEST_CUDF_RMM_MODE"); // Overridden by CLI options + const char* env_stream_mode = + std::getenv("GTEST_CUDF_STREAM_MODE"); // Overridden by CLI options auto default_rmm_mode = env_rmm_mode ? env_rmm_mode : "pool"; + auto default_stream_mode = env_stream_mode ?
env_stream_mode : "default"; options.allow_unrecognised_options().add_options()( "rmm_mode", "RMM allocation mode", cxxopts::value()->default_value(default_rmm_mode)); + options.allow_unrecognised_options().add_options()( + "stream_mode", + "Whether to use a non-default stream", + cxxopts::value()->default_value(default_stream_mode)); return options.parse(argc, argv); } catch (const cxxopts::OptionException& e) { CUDF_FAIL("Error parsing command line options"); @@ -324,13 +334,23 @@ inline auto parse_cudf_test_opts(int argc, char** argv) * function parses the command line to customize test behavior, like the * allocation mode used for creating the default memory resource. */ -#define CUDF_TEST_PROGRAM_MAIN() \ - int main(int argc, char** argv) \ - { \ - ::testing::InitGoogleTest(&argc, argv); \ - auto const cmd_opts = parse_cudf_test_opts(argc, argv); \ - auto const rmm_mode = cmd_opts["rmm_mode"].as(); \ - auto resource = cudf::test::create_memory_resource(rmm_mode); \ - rmm::mr::set_current_device_resource(resource.get()); \ - return RUN_ALL_TESTS(); \ +#define CUDF_TEST_PROGRAM_MAIN() \ + int main(int argc, char** argv) \ + { \ + ::testing::InitGoogleTest(&argc, argv); \ + auto const cmd_opts = parse_cudf_test_opts(argc, argv); \ + auto const rmm_mode = cmd_opts["rmm_mode"].as(); \ + auto resource = cudf::test::create_memory_resource(rmm_mode); \ + rmm::mr::set_current_device_resource(resource.get()); \ + \ + auto const stream_mode = cmd_opts["stream_mode"].as(); \ + rmm::cuda_stream const new_default_stream{}; \ + if (stream_mode == "custom") { \ + auto adapter = make_stream_checking_resource_adaptor(resource.get()); \ + rmm::mr::set_current_device_resource(&adapter); \ + /* Run the tests here: `adapter` is the current device resource and must outlive them. */ \ + return RUN_ALL_TESTS(); \ + } \ + \ + return RUN_ALL_TESTS(); \ } diff --git a/cpp/include/cudf_test/column_utilities.hpp b/cpp/include/cudf_test/column_utilities.hpp index d41ea530402..2cc90743912 100644 --- a/cpp/include/cudf_test/column_utilities.hpp +++ b/cpp/include/cudf_test/column_utilities.hpp @@ -107,6 +107,13
@@ bool expect_columns_equivalent(cudf::column_view const& lhs, debug_output_level verbosity = debug_output_level::FIRST_ERROR, size_type fp_ulps = cudf::test::default_ulp); +/** + * @brief Verifies the given column is empty + * + * @param col The column to check + */ +void expect_column_empty(cudf::column_view const& col); + /** * @brief Verifies the bitwise equality of two device memory buffers. * @@ -234,11 +241,11 @@ inline std::pair, std::vector> to auto const scv = strings_column_view(c); auto const h_chars = cudf::detail::make_std_vector_sync( cudf::device_span(scv.chars().data(), scv.chars().size()), - cudf::default_stream_value); + cudf::get_default_stream()); auto const h_offsets = cudf::detail::make_std_vector_sync( cudf::device_span( scv.offsets().data() + scv.offset(), scv.size() + 1), - cudf::default_stream_value); + cudf::get_default_stream()); // build std::string vector from chars and offsets std::vector host_data; diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp index 8827372b3fd..91773b2c3f1 100644 --- a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -170,7 +170,7 @@ rmm::device_buffer make_elements(InputIterator begin, InputIterator end) auto transform_begin = thrust::make_transform_iterator(begin, transformer); auto const size = cudf::distance(begin, end); auto const elements = thrust::host_vector(transform_begin, transform_begin + size); - return rmm::device_buffer{elements.data(), size * sizeof(ElementTo), cudf::default_stream_value}; + return rmm::device_buffer{elements.data(), size * sizeof(ElementTo), cudf::get_default_stream()}; } /** @@ -196,7 +196,7 @@ rmm::device_buffer make_elements(InputIterator begin, InputIterator end) auto transform_begin = thrust::make_transform_iterator(begin, transformer); auto const size = cudf::distance(begin, end); auto const elements = thrust::host_vector(transform_begin, transform_begin + size); - return 
rmm::device_buffer{elements.data(), size * sizeof(RepType), cudf::default_stream_value}; + return rmm::device_buffer{elements.data(), size * sizeof(RepType), cudf::get_default_stream()}; } /** @@ -223,7 +223,7 @@ rmm::device_buffer make_elements(InputIterator begin, InputIterator end) auto transformer_begin = thrust::make_transform_iterator(begin, to_rep); auto const size = cudf::distance(begin, end); auto const elements = thrust::host_vector(transformer_begin, transformer_begin + size); - return rmm::device_buffer{elements.data(), size * sizeof(RepType), cudf::default_stream_value}; + return rmm::device_buffer{elements.data(), size * sizeof(RepType), cudf::get_default_stream()}; } /** @@ -271,7 +271,7 @@ rmm::device_buffer make_null_mask(ValidityIterator begin, ValidityIterator end) auto null_mask = make_null_mask_vector(begin, end); return rmm::device_buffer{null_mask.data(), null_mask.size() * sizeof(decltype(null_mask.front())), - cudf::default_stream_value}; + cudf::get_default_stream()}; } /** @@ -547,7 +547,7 @@ class fixed_point_column_wrapper : public detail::column_wrapper { wrapped.reset(new cudf::column{ data_type, size, - rmm::device_buffer{elements.data(), size * sizeof(Rep), cudf::default_stream_value}}); + rmm::device_buffer{elements.data(), size * sizeof(Rep), cudf::get_default_stream()}}); } /** @@ -611,7 +611,7 @@ class fixed_point_column_wrapper : public detail::column_wrapper { wrapped.reset(new cudf::column{ data_type, size, - rmm::device_buffer{elements.data(), size * sizeof(Rep), cudf::default_stream_value}, + rmm::device_buffer{elements.data(), size * sizeof(Rep), cudf::get_default_stream()}, detail::make_null_mask(v, v + size), cudf::UNKNOWN_NULL_COUNT}); } @@ -732,9 +732,9 @@ class strings_column_wrapper : public detail::column_wrapper { { auto all_valid = thrust::make_constant_iterator(true); auto [chars, offsets] = detail::make_chars_and_offsets(begin, end, all_valid); - auto d_chars = cudf::detail::make_device_uvector_sync(chars); - 
auto d_offsets = cudf::detail::make_device_uvector_sync(offsets); - wrapped = cudf::make_strings_column(d_chars, d_offsets); + auto d_chars = cudf::detail::make_device_uvector_sync(chars, cudf::get_default_stream()); + auto d_offsets = cudf::detail::make_device_uvector_sync(offsets, cudf::get_default_stream()); + wrapped = cudf::make_strings_column(d_chars, d_offsets); } /** @@ -772,10 +772,10 @@ class strings_column_wrapper : public detail::column_wrapper { size_type num_strings = std::distance(begin, end); auto [chars, offsets] = detail::make_chars_and_offsets(begin, end, v); auto null_mask = detail::make_null_mask_vector(v, v + num_strings); - auto d_chars = cudf::detail::make_device_uvector_sync(chars); - auto d_offsets = cudf::detail::make_device_uvector_sync(offsets); - auto d_bitmask = cudf::detail::make_device_uvector_sync(null_mask); - wrapped = cudf::make_strings_column(d_chars, d_offsets, d_bitmask); + auto d_chars = cudf::detail::make_device_uvector_sync(chars, cudf::get_default_stream()); + auto d_offsets = cudf::detail::make_device_uvector_sync(offsets, cudf::get_default_stream()); + auto d_bitmask = cudf::detail::make_device_uvector_sync(null_mask, cudf::get_default_stream()); + wrapped = cudf::make_strings_column(d_chars, d_offsets, d_bitmask); } /** diff --git a/cpp/include/cudf_test/cudf_gtest.hpp b/cpp/include/cudf_test/cudf_gtest.hpp index fb2680545d3..ab45d90f2d2 100644 --- a/cpp/include/cudf_test/cudf_gtest.hpp +++ b/cpp/include/cudf_test/cudf_gtest.hpp @@ -110,58 +110,6 @@ struct TypeList> { */ #define EXPECT_CUDA_SUCCEEDED(expr) EXPECT_EQ(cudaSuccess, expr) -/** - * @brief Utility for testing the expectation that an expression x throws the specified - * exception whose what() message ends with the msg - * - * @param x The expression to test - * @param exception The exception type to test for - * @param startswith The start of the expected message - * @param endswith The end of the expected message - */ -#define EXPECT_THROW_MESSAGE(x, 
exception, startswith, endswith) \ - do { \ - EXPECT_THROW( \ - { \ - try { \ - x; \ - } catch (const exception& e) { \ - ASSERT_NE(nullptr, e.what()); \ - EXPECT_THAT(e.what(), testing::StartsWith((startswith))); \ - EXPECT_THAT(e.what(), testing::EndsWith((endswith))); \ - throw; \ - } \ - }, \ - exception); \ - } while (0) - -/** - * @brief test macro to be expected to throw cudf::logic_error with a message - * - * @param x The statement to be tested - * @param msg The message associated with the exception - */ -#define CUDF_EXPECT_THROW_MESSAGE(x, msg) \ - EXPECT_THROW_MESSAGE(x, cudf::logic_error, "cuDF failure at:", msg) - -/** - * @brief test macro to be expected to throw cudf::cuda_error with a message - * - * @param x The statement to be tested - * @param msg The message associated with the exception - */ -#define CUDA_EXPECT_THROW_MESSAGE(x, msg) \ - EXPECT_THROW_MESSAGE(x, cudf::cuda_error, "CUDA error encountered at:", msg) - -/** - * @brief test macro to be expected to throw cudf::fatal_logic_error with a message - * - * @param x The statement to be tested - * @param msg The message associated with the exception - */ -#define FATAL_CUDA_EXPECT_THROW_MESSAGE(x, msg) \ - EXPECT_THROW_MESSAGE(x, cudf::fatal_cuda_error, "Fatal CUDA error encountered at:", msg) - /** * @brief test macro to be expected as no exception. * diff --git a/cpp/include/cudf_test/detail/column_utilities.hpp b/cpp/include/cudf_test/detail/column_utilities.hpp index ddf3b658a86..f8270f61f10 100644 --- a/cpp/include/cudf_test/detail/column_utilities.hpp +++ b/cpp/include/cudf_test/detail/column_utilities.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -82,4 +82,4 @@ std::vector to_strings(cudf::column_view const& col, std::string co } // namespace detail } // namespace test -} // namespace cudf \ No newline at end of file +} // namespace cudf diff --git a/cpp/include/cudf_test/stream_checking_resource_adapter.hpp b/cpp/include/cudf_test/stream_checking_resource_adapter.hpp new file mode 100644 index 00000000000..4a22ff148ae --- /dev/null +++ b/cpp/include/cudf_test/stream_checking_resource_adapter.hpp @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +/** + * @brief Resource that verifies that the default stream is not used in any allocation. + * + * @tparam Upstream Type of the upstream resource used for + * allocation/deallocation. + */ +template +class stream_checking_resource_adaptor final : public rmm::mr::device_memory_resource { + public: + /** + * @brief Construct a new adaptor. 
+ * + * @throws `cudf::logic_error` if `upstream == nullptr` + * + * @param upstream The resource used for allocating/deallocating device memory + */ + stream_checking_resource_adaptor(Upstream* upstream) : upstream_{upstream} + { + CUDF_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer."); + } + + stream_checking_resource_adaptor() = delete; + ~stream_checking_resource_adaptor() override = default; + stream_checking_resource_adaptor(stream_checking_resource_adaptor const&) = delete; + stream_checking_resource_adaptor& operator=(stream_checking_resource_adaptor const&) = delete; + stream_checking_resource_adaptor(stream_checking_resource_adaptor&&) noexcept = default; + stream_checking_resource_adaptor& operator=(stream_checking_resource_adaptor&&) noexcept = + default; + + /** + * @brief Return pointer to the upstream resource. + * + * @return Pointer to the upstream resource. + */ + Upstream* get_upstream() const noexcept { return upstream_; } + + /** + * @brief Checks whether the upstream resource supports streams. + * + * @return Whether or not the upstream resource supports streams + */ + bool supports_streams() const noexcept override { return upstream_->supports_streams(); } + + /** + * @brief Query whether the resource supports the get_mem_info API. + * + * @return Whether or not the upstream resource supports get_mem_info + */ + bool supports_get_mem_info() const noexcept override + { + return upstream_->supports_get_mem_info(); + } + + private: + /** + * @brief Allocates memory of size at least `bytes` using the upstream + * resource, provided `stream` is not a default stream. + * + * The returned pointer has at least 256B alignment. + * + * @throws `rmm::bad_alloc` if the requested allocation could not be fulfilled + * by the upstream resource.
+ * @throws `std::runtime_error` if attempted on a default stream + * + * @param bytes The size, in bytes, of the allocation + * @param stream Stream on which to perform the allocation + * @return Pointer to the newly allocated memory + */ + void* do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) override + { + verify_non_default_stream(stream); + return upstream_->allocate(bytes, stream); + } + + /** + * @brief Free allocation of size `bytes` pointed to by `ptr` + * + * @throws `std::runtime_error` if attempted on a default stream + * + * @param ptr Pointer to be deallocated + * @param bytes Size of the allocation + * @param stream Stream on which to perform the deallocation + */ + void do_deallocate(void* ptr, std::size_t bytes, rmm::cuda_stream_view stream) override + { + verify_non_default_stream(stream); + upstream_->deallocate(ptr, bytes, stream); + } + + /** + * @brief Compare the upstream resource to another. + * + * @param other The other resource to compare to + * @return Whether or not the two resources are equivalent + */ + bool do_is_equal(device_memory_resource const& other) const noexcept override + { + if (this == &other) { return true; } + auto cast = dynamic_cast const*>(&other); + return cast != nullptr ? upstream_->is_equal(*cast->get_upstream()) + : upstream_->is_equal(other); + } + + /** + * @brief Get free and available memory from upstream resource. + * + * @throws `rmm::cuda_error` if unable to retrieve memory info. + * @throws `std::runtime_error` if attempted on a default stream + * + * @param stream Stream on which to get the mem info. + * @return std::pair with available and free memory for resource + */ + std::pair do_get_mem_info(rmm::cuda_stream_view stream) const override + { + verify_non_default_stream(stream); + return upstream_->get_mem_info(stream); + } + + /** + * @brief Throw an error if given one of CUDA's default stream specifiers.
+ * + * @throws `std::runtime_error` if provided a default stream + */ + void verify_non_default_stream(rmm::cuda_stream_view const stream) const + { + auto cstream{stream.value()}; + if (cstream == cudaStreamDefault || (cstream == cudaStreamLegacy) || + (cstream == cudaStreamPerThread)) { + throw std::runtime_error("Attempted to perform an operation on a default stream!"); + } + } + + Upstream* upstream_; // the upstream resource used for satisfying allocation requests +}; + +/** + * @brief Convenience factory to return a `stream_checking_resource_adaptor` around the + * upstream resource `upstream`. + * + * @tparam Upstream Type of the upstream `device_memory_resource`. + * @param upstream Pointer to the upstream resource + */ +template +stream_checking_resource_adaptor make_stream_checking_resource_adaptor(Upstream* upstream) +{ + return stream_checking_resource_adaptor{upstream}; +} diff --git a/cpp/include/cudf_test/tdigest_utilities.cuh b/cpp/include/cudf_test/tdigest_utilities.cuh index 6f206a789fd..ce45ad91be1 100644 --- a/cpp/include/cudf_test/tdigest_utilities.cuh +++ b/cpp/include/cudf_test/tdigest_utilities.cuh @@ -16,16 +16,14 @@ #pragma once +#include + #include #include #include -#include +#include #include -#include - -#include - #include #include #include @@ -102,6 +100,58 @@ struct tdigest_gen { // @endcond }; +template +inline T frand() +{ + return static_cast(rand()) / static_cast(RAND_MAX); +} + +template +inline T rand_range(T min, T max) +{ + return min + static_cast(frand() * (max - min)); +} + +inline std::unique_ptr generate_typed_percentile_distribution( + std::vector const& buckets, + std::vector const& sizes, + data_type t, + bool sorted = false) +{ + srand(0); + + std::vector values; + size_t total_size = std::reduce(sizes.begin(), sizes.end(), 0); + values.reserve(total_size); + for (size_t idx = 0; idx < sizes.size(); idx++) { + double min = idx == 0 ? 
0.0f : buckets[idx - 1]; + double max = buckets[idx]; + + for (int v_idx = 0; v_idx < sizes[idx]; v_idx++) { + values.push_back(rand_range(min, max)); + } + } + + if (sorted) { std::sort(values.begin(), values.end()); } + + cudf::test::fixed_width_column_wrapper src(values.begin(), values.end()); + return cudf::cast(src, t); +} + +// "standardized" means the parameters sent into generate_typed_percentile_distribution. the intent +// is to provide a standardized set of inputs for use with tdigest generation tests and +// percentile_approx tests. std::vector +// buckets{10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0}; std::vector +// sizes{50000, 50000, 50000, 50000, 50000, 100000, 100000, 100000, 100000, 100000}; +inline std::unique_ptr generate_standardized_percentile_distribution( + data_type t = data_type{type_id::FLOAT64}, bool sorted = false) +{ + std::vector buckets{10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0, 90.0f, 100.0f}; + std::vector b_sizes{ + 50000, 50000, 50000, 50000, 50000, 100000, 100000, 100000, 100000, 100000}; + return generate_typed_percentile_distribution(buckets, b_sizes, t, sorted); +} + /** * @brief Compare a tdigest column against a sampling of expected values. 
*/ @@ -118,11 +168,11 @@ void tdigest_minmax_compare(cudf::tdigest::tdigest_column_view const& tdv, // verify min/max thrust::host_vector> h_spans; h_spans.push_back({input_values.begin(), static_cast(input_values.size())}); - thrust::device_vector> spans(h_spans); + auto spans = cudf::detail::make_device_uvector_async(h_spans, cudf::get_default_stream()); auto expected_min = cudf::make_fixed_width_column( data_type{type_id::FLOAT64}, spans.size(), mask_state::UNALLOCATED); - thrust::transform(rmm::exec_policy(cudf::default_stream_value), + thrust::transform(rmm::exec_policy(cudf::get_default_stream()), spans.begin(), spans.end(), expected_min->mutable_view().template begin(), @@ -132,7 +182,7 @@ void tdigest_minmax_compare(cudf::tdigest::tdigest_column_view const& tdv, auto expected_max = cudf::make_fixed_width_column( data_type{type_id::FLOAT64}, spans.size(), mask_state::UNALLOCATED); - thrust::transform(rmm::exec_policy(cudf::default_stream_value), + thrust::transform(rmm::exec_policy(cudf::get_default_stream()), spans.begin(), spans.end(), expected_max->mutable_view().template begin(), @@ -217,7 +267,7 @@ void tdigest_simple_all_nulls_aggregation(Func op) static_cast(values).type(), tdigest_gen{}, op, values, delta); // NOTE: an empty tdigest column still has 1 row. 
- auto expected = cudf::detail::tdigest::make_empty_tdigest_column(); + auto expected = cudf::tdigest::detail::make_empty_tdigest_column(cudf::get_default_stream()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, *expected); } @@ -508,9 +558,9 @@ template void tdigest_merge_empty(MergeFunc merge_op) { // 3 empty tdigests all in the same group - auto a = cudf::detail::tdigest::make_empty_tdigest_column(); - auto b = cudf::detail::tdigest::make_empty_tdigest_column(); - auto c = cudf::detail::tdigest::make_empty_tdigest_column(); + auto a = cudf::tdigest::detail::make_empty_tdigest_column(cudf::get_default_stream()); + auto b = cudf::tdigest::detail::make_empty_tdigest_column(cudf::get_default_stream()); + auto c = cudf::tdigest::detail::make_empty_tdigest_column(cudf::get_default_stream()); std::vector cols; cols.push_back(*a); cols.push_back(*b); @@ -520,7 +570,7 @@ void tdigest_merge_empty(MergeFunc merge_op) auto const delta = 1000; auto result = merge_op(*values, delta); - auto expected = cudf::detail::tdigest::make_empty_tdigest_column(); + auto expected = cudf::tdigest::detail::make_empty_tdigest_column(cudf::get_default_stream()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *result); } diff --git a/cpp/include/doxygen_groups.h b/cpp/include/doxygen_groups.h index c0ea06959b2..5c335b720d5 100644 --- a/cpp/include/doxygen_groups.h +++ b/cpp/include/doxygen_groups.h @@ -129,6 +129,7 @@ * @defgroup strings_replace Replacing * @defgroup strings_split Splitting * @defgroup strings_json JSON + * @defgroup strings_regex Regex * @} * @defgroup dictionary_apis Dictionary * @{ diff --git a/cpp/include/nvtext/bpe_tokenize.hpp b/cpp/include/nvtext/bpe_tokenize.hpp index 97e354cb39b..b93d93b07c6 100644 --- a/cpp/include/nvtext/bpe_tokenize.hpp +++ b/cpp/include/nvtext/bpe_tokenize.hpp @@ -46,7 +46,7 @@ struct bpe_merge_pairs { * @param mr Device memory resource used to allocate the device memory */ bpe_merge_pairs(std::unique_ptr&& input, - rmm::cuda_stream_view stream = 
cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -57,7 +57,7 @@ struct bpe_merge_pairs { * @param mr Device memory resource used to allocate the device memory */ bpe_merge_pairs(cudf::strings_column_view const& input, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); ~bpe_merge_pairs(); diff --git a/cpp/include/nvtext/detail/tokenize.hpp b/cpp/include/nvtext/detail/tokenize.hpp index 2b5d0bb855e..38b49e63590 100644 --- a/cpp/include/nvtext/detail/tokenize.hpp +++ b/cpp/include/nvtext/detail/tokenize.hpp @@ -35,11 +35,10 @@ namespace detail { * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings columns of tokens. */ -std::unique_ptr tokenize( - cudf::strings_column_view const& strings, - cudf::string_scalar const& delimiter = cudf::string_scalar{""}, - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr tokenize(cudf::strings_column_view const& strings, + cudf::string_scalar const& delimiter, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc nvtext::tokenize(strings_column_view const&,strings_column_view @@ -51,11 +50,10 @@ std::unique_ptr tokenize( * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings columns of tokens. 
*/ -std::unique_ptr tokenize( - cudf::strings_column_view const& strings, - cudf::strings_column_view const& delimiters, - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr tokenize(cudf::strings_column_view const& strings, + cudf::strings_column_view const& delimiters, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc nvtext::count_tokens(strings_column_view const&, string_scalar @@ -68,11 +66,10 @@ std::unique_ptr tokenize( * @param mr Device memory resource used to allocate the returned column's device memory. * @return New INT32 column of token counts. */ -std::unique_ptr count_tokens( - cudf::strings_column_view const& strings, - cudf::string_scalar const& delimiter = cudf::string_scalar{""}, - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr count_tokens(cudf::strings_column_view const& strings, + cudf::string_scalar const& delimiter, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc nvtext::count_tokens(strings_column_view const&,strings_column_view @@ -84,11 +81,10 @@ std::unique_ptr count_tokens( * @param mr Device memory resource used to allocate the returned column's device memory. * @return New INT32 column of token counts. 
*/ -std::unique_ptr count_tokens( - cudf::strings_column_view const& strings, - cudf::strings_column_view const& delimiters, - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr count_tokens(cudf::strings_column_view const& strings, + cudf::strings_column_view const& delimiters, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace nvtext diff --git a/cpp/libcudf_kafka/CMakeLists.txt b/cpp/libcudf_kafka/CMakeLists.txt index f355fa01c28..da3b6b8af62 100644 --- a/cpp/libcudf_kafka/CMakeLists.txt +++ b/cpp/libcudf_kafka/CMakeLists.txt @@ -22,7 +22,7 @@ include(rapids-find) project( CUDA_KAFKA - VERSION 22.10.00 + VERSION 22.12.00 LANGUAGES CXX ) diff --git a/cpp/scripts/run-cmake-format.sh b/cpp/scripts/run-cmake-format.sh index b9157c76492..f3e21779aa5 100755 --- a/cpp/scripts/run-cmake-format.sh +++ b/cpp/scripts/run-cmake-format.sh @@ -17,7 +17,7 @@ # and exits gracefully if the file is not found. If a user wishes to specify a # config file at a nonstandard location, they may do so by setting the # environment variable RAPIDS_CMAKE_FORMAT_FILE. -# +# # This script can be invoked directly anywhere within the project repository. # Alternatively, it may be invoked as a pre-commit hook via # `pre-commit run (cmake-format)|(cmake-lint)`. 
diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp index 4b79cc0581a..83ad8aa4cee 100644 --- a/cpp/src/binaryop/binaryop.cpp +++ b/cpp/src/binaryop/binaryop.cpp @@ -406,7 +406,7 @@ std::unique_ptr binary_operation(scalar const& lhs, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::binary_operation(lhs, rhs, op, output_type, cudf::default_stream_value, mr); + return detail::binary_operation(lhs, rhs, op, output_type, cudf::get_default_stream(), mr); } std::unique_ptr binary_operation(column_view const& lhs, scalar const& rhs, @@ -415,7 +415,7 @@ std::unique_ptr binary_operation(column_view const& lhs, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::binary_operation(lhs, rhs, op, output_type, cudf::default_stream_value, mr); + return detail::binary_operation(lhs, rhs, op, output_type, cudf::get_default_stream(), mr); } std::unique_ptr binary_operation(column_view const& lhs, column_view const& rhs, @@ -424,7 +424,7 @@ std::unique_ptr binary_operation(column_view const& lhs, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::binary_operation(lhs, rhs, op, output_type, cudf::default_stream_value, mr); + return detail::binary_operation(lhs, rhs, op, output_type, cudf::get_default_stream(), mr); } std::unique_ptr binary_operation(column_view const& lhs, @@ -434,7 +434,7 @@ std::unique_ptr binary_operation(column_view const& lhs, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::binary_operation(lhs, rhs, ptx, output_type, cudf::default_stream_value, mr); + return detail::binary_operation(lhs, rhs, ptx, output_type, cudf::get_default_stream(), mr); } } // namespace cudf diff --git a/cpp/src/binaryop/compiled/binary_ops.hpp b/cpp/src/binaryop/compiled/binary_ops.hpp index 1f711b7c899..c51993409ef 100644 --- a/cpp/src/binaryop/compiled/binary_ops.hpp +++ b/cpp/src/binaryop/compiled/binary_ops.hpp @@ -37,7 +37,7 @@ std::unique_ptr string_null_min_max( 
column_view const& rhs, binary_operator op, data_type output_type, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); std::unique_ptr string_null_min_max( @@ -45,7 +45,7 @@ std::unique_ptr string_null_min_max( scalar const& rhs, binary_operator op, data_type output_type, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); std::unique_ptr string_null_min_max( @@ -53,7 +53,7 @@ std::unique_ptr string_null_min_max( column_view const& rhs, binary_operator op, data_type output_type, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -80,7 +80,7 @@ std::unique_ptr binary_operation( column_view const& rhs, binary_operator op, data_type output_type, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -107,7 +107,7 @@ std::unique_ptr binary_operation( scalar const& rhs, binary_operator op, data_type output_type, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -133,7 +133,7 @@ std::unique_ptr binary_operation( column_view const& rhs, binary_operator op, data_type output_type, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); void binary_operation(mutable_column_view& 
out, diff --git a/cpp/src/binaryop/compiled/struct_binary_ops.cuh b/cpp/src/binaryop/compiled/struct_binary_ops.cuh index def9ebcef97..2fcf1ce4e32 100644 --- a/cpp/src/binaryop/compiled/struct_binary_ops.cuh +++ b/cpp/src/binaryop/compiled/struct_binary_ops.cuh @@ -71,7 +71,7 @@ void apply_struct_binary_op(mutable_column_view& out, bool is_lhs_scalar, bool is_rhs_scalar, PhysicalElementComparator comparator = {}, - rmm::cuda_stream_view stream = cudf::default_stream_value) + rmm::cuda_stream_view stream = cudf::get_default_stream()) { auto const compare_orders = std::vector( lhs.size(), @@ -115,7 +115,7 @@ void apply_struct_equality_op(mutable_column_view& out, bool is_rhs_scalar, binary_operator op, PhysicalEqualityComparator comparator = {}, - rmm::cuda_stream_view stream = cudf::default_stream_value) + rmm::cuda_stream_view stream = cudf::get_default_stream()) { CUDF_EXPECTS(op == binary_operator::EQUAL || op == binary_operator::NOT_EQUAL || op == binary_operator::NULL_EQUALS, diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu index 4c9151533c2..958bf21e6df 100644 --- a/cpp/src/bitmask/null_mask.cu +++ b/cpp/src/bitmask/null_mask.cu @@ -158,14 +158,14 @@ rmm::device_buffer create_null_mask(size_type size, mask_state state, rmm::mr::device_memory_resource* mr) { - return detail::create_null_mask(size, state, cudf::default_stream_value, mr); + return detail::create_null_mask(size, state, cudf::get_default_stream(), mr); } // Set pre-allocated null mask of given bit range [begin_bit, end_bit) to valid, if valid==true, // or null, otherwise; void set_null_mask(bitmask_type* bitmask, size_type begin_bit, size_type end_bit, bool valid) { - return detail::set_null_mask(bitmask, begin_bit, end_bit, valid); + return detail::set_null_mask(bitmask, begin_bit, end_bit, valid, cudf::get_default_stream()); } namespace detail { @@ -510,25 +510,25 @@ rmm::device_buffer copy_bitmask(bitmask_type const* mask, size_type end_bit, 
rmm::mr::device_memory_resource* mr) { - return detail::copy_bitmask(mask, begin_bit, end_bit, cudf::default_stream_value, mr); + return detail::copy_bitmask(mask, begin_bit, end_bit, cudf::get_default_stream(), mr); } // Create a bitmask from a column view rmm::device_buffer copy_bitmask(column_view const& view, rmm::mr::device_memory_resource* mr) { - return detail::copy_bitmask(view, cudf::default_stream_value, mr); + return detail::copy_bitmask(view, cudf::get_default_stream(), mr); } std::pair bitmask_and(table_view const& view, rmm::mr::device_memory_resource* mr) { - return detail::bitmask_and(view, cudf::default_stream_value, mr); + return detail::bitmask_and(view, cudf::get_default_stream(), mr); } std::pair bitmask_or(table_view const& view, rmm::mr::device_memory_resource* mr) { - return detail::bitmask_or(view, cudf::default_stream_value, mr); + return detail::bitmask_or(view, cudf::get_default_stream(), mr); } } // namespace cudf diff --git a/cpp/src/column/column.cu b/cpp/src/column/column.cu index 61dfea6c26a..7b862373a5b 100644 --- a/cpp/src/column/column.cu +++ b/cpp/src/column/column.cu @@ -144,7 +144,7 @@ size_type column::null_count() const CUDF_FUNC_RANGE(); if (_null_count <= cudf::UNKNOWN_NULL_COUNT) { _null_count = cudf::detail::null_count( - static_cast(_null_mask.data()), 0, size(), cudf::default_stream_value); + static_cast(_null_mask.data()), 0, size(), cudf::get_default_stream()); } return _null_count; } @@ -182,7 +182,7 @@ void column::set_null_count(size_type new_null_count) namespace { struct create_column_from_view { cudf::column_view view; - rmm::cuda_stream_view stream{cudf::default_stream_value}; + rmm::cuda_stream_view stream{cudf::get_default_stream()}; rmm::mr::device_memory_resource* mr; template make_numeric_column(data_type type, { CUDF_FUNC_RANGE(); CUDF_EXPECTS(is_numeric(type), "Invalid, non-numeric type."); + CUDF_EXPECTS(size >= 0, "Column size cannot be negative."); return std::make_unique(type, size, @@ -97,6 +98,7 
@@ std::unique_ptr make_fixed_point_column(data_type type, { CUDF_FUNC_RANGE(); CUDF_EXPECTS(is_fixed_point(type), "Invalid, non-fixed_point type."); + CUDF_EXPECTS(size >= 0, "Column size cannot be negative."); return std::make_unique(type, size, @@ -115,6 +117,7 @@ std::unique_ptr make_timestamp_column(data_type type, { CUDF_FUNC_RANGE(); CUDF_EXPECTS(is_timestamp(type), "Invalid, non-timestamp type."); + CUDF_EXPECTS(size >= 0, "Column size cannot be negative."); return std::make_unique(type, size, @@ -133,6 +136,7 @@ std::unique_ptr make_duration_column(data_type type, { CUDF_FUNC_RANGE(); CUDF_EXPECTS(is_duration(type), "Invalid, non-duration type."); + CUDF_EXPECTS(size >= 0, "Column size cannot be negative."); return std::make_unique(type, size, @@ -166,6 +170,7 @@ std::unique_ptr make_dictionary_from_scalar(scalar const& s, rmm::mr::device_memory_resource* mr) { if (size == 0) return make_empty_column(type_id::DICTIONARY32); + CUDF_EXPECTS(size >= 0, "Column size cannot be negative."); CUDF_EXPECTS(s.is_valid(stream), "cannot create a dictionary with a null key"); return make_dictionary_column( make_column_from_scalar(s, 1, stream, mr), diff --git a/cpp/src/column/column_factories.cu b/cpp/src/column/column_factories.cu index 90252fd6cf1..c401b765f0b 100644 --- a/cpp/src/column/column_factories.cu +++ b/cpp/src/column/column_factories.cu @@ -54,21 +54,15 @@ std::unique_ptr column_from_scalar_dispatch::operator()( - value.type(), size, rmm::device_buffer{}, std::move(null_mask), size); - - // Create a strings column_view with all nulls and no children. // Since we are setting every row to the scalar, the fill() never needs to access // any of the children in the strings column which would otherwise cause an exception. 
- column_view sc{ - data_type{type_id::STRING}, size, nullptr, static_cast(null_mask.data()), size}; + column_view sc{value.type(), size, nullptr}; auto& sv = static_cast const&>(value); + // fill the column with the scalar auto output = strings::detail::fill(strings_column_view(sc), 0, size, sv, stream, mr); - output->set_null_mask(rmm::device_buffer{}, 0); // should be no nulls + return output; } diff --git a/cpp/src/column/column_view.cpp b/cpp/src/column/column_view.cpp index 2ff088a3f20..3e18b9734f6 100644 --- a/cpp/src/column/column_view.cpp +++ b/cpp/src/column/column_view.cpp @@ -68,7 +68,7 @@ size_type column_view_base::null_count() const { if (_null_count <= cudf::UNKNOWN_NULL_COUNT) { _null_count = cudf::detail::null_count( - null_mask(), offset(), offset() + size(), cudf::default_stream_value); + null_mask(), offset(), offset() + size(), cudf::get_default_stream()); } return _null_count; } @@ -79,7 +79,7 @@ size_type column_view_base::null_count(size_type begin, size_type end) const return (null_count() == 0) ? 
0 : cudf::detail::null_count( - null_mask(), offset() + begin, offset() + end, cudf::default_stream_value); + null_mask(), offset() + begin, offset() + end, cudf::get_default_stream()); } // Struct to use custom hash combine and fold expression diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index b770eef1c3a..577d6427b19 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -180,10 +180,8 @@ __global__ void fused_concatenate_kernel(column_device_view const* input_views, if (Nullable) { active_mask = __ballot_sync(0xFFFF'FFFFu, output_index < output_size); } while (output_index < output_size) { // Lookup input index by searching for output index in offsets - // thrust::prev isn't in CUDA 10.0, so subtracting 1 here instead - auto const offset_it = - -1 + thrust::upper_bound( - thrust::seq, input_offsets, input_offsets + num_input_views, output_index); + auto const offset_it = thrust::prev(thrust::upper_bound( + thrust::seq, input_offsets, input_offsets + num_input_views, output_index)); size_type const partition_index = offset_it - input_offsets; // Copy input data to output @@ -557,7 +555,7 @@ rmm::device_buffer concatenate_masks(host_span views, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::concatenate_masks(views, cudf::default_stream_value, mr); + return detail::concatenate_masks(views, cudf::get_default_stream(), mr); } // Concatenates the elements from a vector of column_views @@ -565,14 +563,14 @@ std::unique_ptr concatenate(host_span columns_to_conc rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::concatenate(columns_to_concat, cudf::default_stream_value, mr); + return detail::concatenate(columns_to_concat, cudf::get_default_stream(), mr); } std::unique_ptr
concatenate(host_span tables_to_concat, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::concatenate(tables_to_concat, cudf::default_stream_value, mr); + return detail::concatenate(tables_to_concat, cudf::get_default_stream(), mr); } } // namespace cudf diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 0c90eb539fc..c52ca1f74df 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -1269,7 +1269,7 @@ std::vector contiguous_split(cudf::table_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::contiguous_split(input, splits, cudf::default_stream_value, mr); + return detail::contiguous_split(input, splits, cudf::get_default_stream(), mr); } }; // namespace cudf diff --git a/cpp/src/copying/copy.cpp b/cpp/src/copying/copy.cpp index d9a16315488..00147277231 100644 --- a/cpp/src/copying/copy.cpp +++ b/cpp/src/copying/copy.cpp @@ -183,7 +183,7 @@ std::unique_ptr allocate_like(column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::allocate_like(input, input.size(), mask_alloc, cudf::default_stream_value, mr); + return detail::allocate_like(input, input.size(), mask_alloc, cudf::get_default_stream(), mr); } std::unique_ptr allocate_like(column_view const& input, @@ -192,7 +192,7 @@ std::unique_ptr allocate_like(column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::allocate_like(input, size, mask_alloc, cudf::default_stream_value, mr); + return detail::allocate_like(input, size, mask_alloc, cudf::get_default_stream(), mr); } } // namespace cudf diff --git a/cpp/src/copying/copy.cu b/cpp/src/copying/copy.cu index 5585eac923c..0978cf441d8 100644 --- a/cpp/src/copying/copy.cu +++ b/cpp/src/copying/copy.cu @@ -180,7 +180,6 @@ std::unique_ptr scatter_gather_based_if_else(cudf::column_view const& lh 
table_view{std::vector{scatter_src_lhs->get_column(0).view()}}, gather_map, table_view{std::vector{rhs}}, - false, stream, mr); @@ -208,12 +207,8 @@ std::unique_ptr scatter_gather_based_if_else(cudf::scalar const& lhs, static_cast(scatter_map_size), scatter_map.begin()}; - auto result = cudf::detail::scatter(scatter_source, - scatter_map_column_view, - table_view{std::vector{rhs}}, - false, - stream, - mr); + auto result = cudf::detail::scatter( + scatter_source, scatter_map_column_view, table_view{std::vector{rhs}}, stream, mr); return std::move(result->release()[0]); } @@ -415,7 +410,7 @@ std::unique_ptr copy_if_else(column_view const& lhs, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::copy_if_else(lhs, rhs, boolean_mask, cudf::default_stream_value, mr); + return detail::copy_if_else(lhs, rhs, boolean_mask, cudf::get_default_stream(), mr); } std::unique_ptr copy_if_else(scalar const& lhs, @@ -424,7 +419,7 @@ std::unique_ptr copy_if_else(scalar const& lhs, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::copy_if_else(lhs, rhs, boolean_mask, cudf::default_stream_value, mr); + return detail::copy_if_else(lhs, rhs, boolean_mask, cudf::get_default_stream(), mr); } std::unique_ptr copy_if_else(column_view const& lhs, @@ -433,7 +428,7 @@ std::unique_ptr copy_if_else(column_view const& lhs, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::copy_if_else(lhs, rhs, boolean_mask, cudf::default_stream_value, mr); + return detail::copy_if_else(lhs, rhs, boolean_mask, cudf::get_default_stream(), mr); } std::unique_ptr copy_if_else(scalar const& lhs, @@ -442,7 +437,7 @@ std::unique_ptr copy_if_else(scalar const& lhs, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::copy_if_else(lhs, rhs, boolean_mask, cudf::default_stream_value, mr); + return detail::copy_if_else(lhs, rhs, boolean_mask, cudf::get_default_stream(), mr); } } // namespace cudf diff --git 
a/cpp/src/copying/copy_range.cu b/cpp/src/copying/copy_range.cu index 080a8f645bd..dbcae354384 100644 --- a/cpp/src/copying/copy_range.cu +++ b/cpp/src/copying/copy_range.cu @@ -172,7 +172,8 @@ std::unique_ptr out_of_place_copy_range_dispatch::operator()view()); - auto source_matched = cudf::dictionary::detail::set_keys(dict_source, target_view.keys(), stream); + auto source_matched = cudf::dictionary::detail::set_keys( + dict_source, target_view.keys(), stream, rmm::mr::get_current_device_resource()); auto const source_view = cudf::dictionary_column_view(source_matched->view()); // build the new indices by calling in_place_copy_range on just the indices @@ -274,7 +275,7 @@ void copy_range_in_place(column_view const& source, { CUDF_FUNC_RANGE(); return detail::copy_range_in_place( - source, target, source_begin, source_end, target_begin, cudf::default_stream_value); + source, target, source_begin, source_end, target_begin, cudf::get_default_stream()); } std::unique_ptr copy_range(column_view const& source, @@ -286,7 +287,7 @@ std::unique_ptr copy_range(column_view const& source, { CUDF_FUNC_RANGE(); return detail::copy_range( - source, target, source_begin, source_end, target_begin, cudf::default_stream_value, mr); + source, target, source_begin, source_end, target_begin, cudf::get_default_stream(), mr); } } // namespace cudf diff --git a/cpp/src/copying/gather.cu b/cpp/src/copying/gather.cu index d00d3a2a43e..93d05757722 100644 --- a/cpp/src/copying/gather.cu +++ b/cpp/src/copying/gather.cu @@ -85,7 +85,7 @@ std::unique_ptr
gather(table_view const& source_table, : detail::negative_index_policy::ALLOWED; return detail::gather( - source_table, gather_map, bounds_policy, index_policy, cudf::default_stream_value, mr); + source_table, gather_map, bounds_policy, index_policy, cudf::get_default_stream(), mr); } } // namespace cudf diff --git a/cpp/src/copying/get_element.cu b/cpp/src/copying/get_element.cu index f12b4639b25..5e76b4adbbe 100644 --- a/cpp/src/copying/get_element.cu +++ b/cpp/src/copying/get_element.cu @@ -210,7 +210,7 @@ std::unique_ptr get_element(column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::get_element(input, index, cudf::default_stream_value, mr); + return detail::get_element(input, index, cudf::get_default_stream(), mr); } } // namespace cudf diff --git a/cpp/src/copying/pack.cpp b/cpp/src/copying/pack.cpp index 5bc425ab7f5..427f2dfdade 100644 --- a/cpp/src/copying/pack.cpp +++ b/cpp/src/copying/pack.cpp @@ -219,7 +219,7 @@ table_view unpack(uint8_t const* metadata, uint8_t const* gpu_data) packed_columns pack(cudf::table_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::pack(input, cudf::default_stream_value, mr); + return detail::pack(input, cudf::get_default_stream(), mr); } /** diff --git a/cpp/src/copying/purge_nonempty_nulls.cu b/cpp/src/copying/purge_nonempty_nulls.cu index 30de538ec7a..5bdf10c8af6 100644 --- a/cpp/src/copying/purge_nonempty_nulls.cu +++ b/cpp/src/copying/purge_nonempty_nulls.cu @@ -14,7 +14,7 @@ * limitations under the License. */ #include -#include +#include #include #include @@ -80,6 +80,24 @@ bool has_nonempty_nulls(cudf::column_view const& input, rmm::cuda_stream_view st return false; } + +std::unique_ptr purge_nonempty_nulls(column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // If not compound types (LIST/STRING/STRUCT/DICTIONARY) then just copy the input into output. 
+ if (!cudf::is_compound(input.type())) { return std::make_unique(input, stream, mr); } + + // Implement via identity gather. + auto gathered_table = cudf::detail::gather(table_view{{input}}, + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(input.size()), + out_of_bounds_policy::DONT_CHECK, + stream, + mr); + return std::move(gathered_table->release().front()); +} + } // namespace detail /** @@ -104,33 +122,18 @@ bool may_have_nonempty_nulls(column_view const& input) /** * @copydoc cudf::has_nonempty_nulls */ -bool has_nonempty_nulls(column_view const& input) { return detail::has_nonempty_nulls(input); } - -/** - * @copydoc cudf::purge_nonempty_nulls(lists_column_view const&, rmm::mr::device_memory_resource*) - */ -std::unique_ptr purge_nonempty_nulls(lists_column_view const& input, - rmm::mr::device_memory_resource* mr) -{ - return detail::purge_nonempty_nulls(input, cudf::default_stream_value, mr); -} - -/** - * @copydoc cudf::purge_nonempty_nulls(structs_column_view const&, rmm::mr::device_memory_resource*) - */ -std::unique_ptr purge_nonempty_nulls(structs_column_view const& input, - rmm::mr::device_memory_resource* mr) +bool has_nonempty_nulls(column_view const& input) { - return detail::purge_nonempty_nulls(input, cudf::default_stream_value, mr); + return detail::has_nonempty_nulls(input, cudf::get_default_stream()); } /** - * @copydoc cudf::purge_nonempty_nulls(strings_column_view const&, rmm::mr::device_memory_resource*) + * @copydoc cudf::purge_nonempty_nulls(column_view const&, rmm::mr::device_memory_resource*) */ -std::unique_ptr purge_nonempty_nulls(strings_column_view const& input, +std::unique_ptr purge_nonempty_nulls(column_view const& input, rmm::mr::device_memory_resource* mr) { - return detail::purge_nonempty_nulls(input, cudf::default_stream_value, mr); + return detail::purge_nonempty_nulls(input, cudf::get_default_stream(), mr); } } // namespace cudf diff --git a/cpp/src/copying/reverse.cu b/cpp/src/copying/reverse.cu index 
a1ffa115ad1..cf8ca7d9a92 100644 --- a/cpp/src/copying/reverse.cu +++ b/cpp/src/copying/reverse.cu @@ -57,13 +57,13 @@ std::unique_ptr reverse(column_view const& source_column, std::unique_ptr
reverse(table_view const& source_table, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::reverse(source_table, cudf::default_stream_value, mr); + return detail::reverse(source_table, cudf::get_default_stream(), mr); } std::unique_ptr reverse(column_view const& source_column, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::reverse(source_column, cudf::default_stream_value, mr); + return detail::reverse(source_column, cudf::get_default_stream(), mr); } } // namespace cudf diff --git a/cpp/src/copying/sample.cu b/cpp/src/copying/sample.cu index 9a164bd053a..27a3f145caa 100644 --- a/cpp/src/copying/sample.cu +++ b/cpp/src/copying/sample.cu @@ -93,6 +93,6 @@ std::unique_ptr
sample(table_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::sample(input, n, replacement, seed, cudf::default_stream_value, mr); + return detail::sample(input, n, replacement, seed, cudf::get_default_stream(), mr); } } // namespace cudf diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu index 79c27816009..6083a698560 100644 --- a/cpp/src/copying/scatter.cu +++ b/cpp/src/copying/scatter.cu @@ -119,7 +119,7 @@ struct column_scalar_scatterer_impl { auto scalar_iter = thrust::make_permutation_iterator(scalar_impl->data(), thrust::make_constant_iterator(0)); - thrust::scatter(rmm::exec_policy(stream), + thrust::scatter(rmm::exec_policy_nosync(stream), scalar_iter, scalar_iter + scatter_rows, scatter_iter, @@ -184,14 +184,18 @@ struct column_scalar_scatterer_impl { stream, mr); auto dict_view = dictionary_column_view(dict_target->view()); - auto scalar_index = dictionary::detail::get_index(dict_view, source.get(), stream); - auto scalar_iter = thrust::make_permutation_iterator( + auto scalar_index = dictionary::detail::get_index( + dict_view, source.get(), stream, rmm::mr::get_current_device_resource()); + auto scalar_iter = thrust::make_permutation_iterator( indexalator_factory::make_input_iterator(*scalar_index), thrust::make_constant_iterator(0)); auto new_indices = std::make_unique(dict_view.get_indices_annotated(), stream, mr); auto target_iter = indexalator_factory::make_output_iterator(new_indices->mutable_view()); - thrust::scatter( - rmm::exec_policy(stream), scalar_iter, scalar_iter + scatter_rows, scatter_iter, target_iter); + thrust::scatter(rmm::exec_policy_nosync(stream), + scalar_iter, + scalar_iter + scatter_rows, + scatter_iter, + target_iter); // build the dictionary indices column from the result auto const indices_type = new_indices->type(); @@ -285,7 +289,6 @@ struct column_scalar_scatterer_impl { std::unique_ptr
scatter(table_view const& source, column_view const& scatter_map, table_view const& target, - bool check_bounds, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -307,13 +310,12 @@ std::unique_ptr
scatter(table_view const& source, // create index type normalizing iterator for the scatter_map auto map_begin = indexalator_factory::make_input_iterator(scatter_map); auto map_end = map_begin + scatter_map.size(); - return detail::scatter(source, map_begin, map_end, target, check_bounds, stream, mr); + return detail::scatter(source, map_begin, map_end, target, stream, mr); } std::unique_ptr
scatter(table_view const& source, device_span const scatter_map, table_view const& target, - bool check_bounds, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -322,13 +324,12 @@ std::unique_ptr
scatter(table_view const& source, auto map_col = column_view(data_type{type_to_id()}, static_cast(scatter_map.size()), scatter_map.data()); - return scatter(source, map_col, target, check_bounds, stream, mr); + return scatter(source, map_col, target, stream, mr); } std::unique_ptr
scatter(std::vector> const& source, column_view const& indices, table_view const& target, - bool check_bounds, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -340,20 +341,9 @@ std::unique_ptr
scatter(std::vector> // Create normalizing iterator for indices column auto map_begin = indexalator_factory::make_input_iterator(indices); - auto map_end = map_begin + indices.size(); // Optionally check map index values are within the number of target rows. auto const n_rows = target.num_rows(); - if (check_bounds) { - CUDF_EXPECTS( - indices.size() == thrust::count_if(rmm::exec_policy(stream), - map_begin, - map_end, - [n_rows] __device__(size_type index) { - return ((index >= -n_rows) && (index < n_rows)); - }), - "Scatter map index out of bounds"); - } // Transform negative indices to index + target size auto scatter_rows = indices.size(); @@ -396,7 +386,7 @@ std::unique_ptr boolean_mask_scatter(column_view const& input, data_type{type_id::INT32}, target.size(), mask_state::UNALLOCATED, stream); auto mutable_indices = indices->mutable_view(); - thrust::sequence(rmm::exec_policy(stream), + thrust::sequence(rmm::exec_policy_nosync(stream), mutable_indices.begin(), mutable_indices.end(), 0); @@ -404,12 +394,8 @@ std::unique_ptr boolean_mask_scatter(column_view const& input, // The scatter map is actually a table with only one column, which is scatter map. auto scatter_map = detail::apply_boolean_mask(table_view{{indices->view()}}, boolean_mask, stream); - auto output_table = detail::scatter(table_view{{input}}, - scatter_map->get_column(0).view(), - table_view{{target}}, - false, - stream, - mr); + auto output_table = detail::scatter( + table_view{{input}}, scatter_map->get_column(0).view(), table_view{{target}}, stream, mr); // There is only one column in output_table return std::make_unique(std::move(output_table->get_column(0))); @@ -505,21 +491,19 @@ std::unique_ptr
boolean_mask_scatter( std::unique_ptr
scatter(table_view const& source, column_view const& scatter_map, table_view const& target, - bool check_bounds, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::scatter(source, scatter_map, target, check_bounds, cudf::default_stream_value, mr); + return detail::scatter(source, scatter_map, target, cudf::get_default_stream(), mr); } std::unique_ptr
scatter(std::vector> const& source, column_view const& indices, table_view const& target, - bool check_bounds, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::scatter(source, indices, target, check_bounds, cudf::default_stream_value, mr); + return detail::scatter(source, indices, target, cudf::get_default_stream(), mr); } std::unique_ptr
boolean_mask_scatter(table_view const& input, @@ -528,7 +512,7 @@ std::unique_ptr
boolean_mask_scatter(table_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::boolean_mask_scatter(input, target, boolean_mask, cudf::default_stream_value, mr); + return detail::boolean_mask_scatter(input, target, boolean_mask, cudf::get_default_stream(), mr); } std::unique_ptr
boolean_mask_scatter( @@ -538,7 +522,7 @@ std::unique_ptr
boolean_mask_scatter( rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::boolean_mask_scatter(input, target, boolean_mask, cudf::default_stream_value, mr); + return detail::boolean_mask_scatter(input, target, boolean_mask, cudf::get_default_stream(), mr); } } // namespace cudf diff --git a/cpp/src/copying/shift.cu b/cpp/src/copying/shift.cu index 607388cff56..a6126374ed2 100644 --- a/cpp/src/copying/shift.cu +++ b/cpp/src/copying/shift.cu @@ -174,7 +174,7 @@ std::unique_ptr shift(column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::shift(input, offset, fill_value, cudf::default_stream_value, mr); + return detail::shift(input, offset, fill_value, cudf::get_default_stream(), mr); } } // namespace cudf diff --git a/cpp/src/copying/slice.cu b/cpp/src/copying/slice.cu index e329756b0df..52410ada128 100644 --- a/cpp/src/copying/slice.cu +++ b/cpp/src/copying/slice.cu @@ -114,25 +114,25 @@ std::vector slice(table_view const& input, std::vector slice(column_view const& input, host_span indices) { CUDF_FUNC_RANGE(); - return detail::slice(input, indices, cudf::default_stream_value); + return detail::slice(input, indices, cudf::get_default_stream()); } std::vector slice(table_view const& input, host_span indices) { CUDF_FUNC_RANGE(); - return detail::slice(input, indices, cudf::default_stream_value); + return detail::slice(input, indices, cudf::get_default_stream()); }; std::vector slice(column_view const& input, std::initializer_list indices) { CUDF_FUNC_RANGE(); - return detail::slice(input, indices, cudf::default_stream_value); + return detail::slice(input, indices, cudf::get_default_stream()); } std::vector slice(table_view const& input, std::initializer_list indices) { CUDF_FUNC_RANGE(); - return detail::slice(input, indices, cudf::default_stream_value); + return detail::slice(input, indices, cudf::get_default_stream()); }; } // namespace cudf diff --git a/cpp/src/copying/split.cpp 
b/cpp/src/copying/split.cpp index 19ecd959172..b577886febf 100644 --- a/cpp/src/copying/split.cpp +++ b/cpp/src/copying/split.cpp @@ -86,26 +86,26 @@ std::vector split(cudf::column_view const& input, host_span splits) { CUDF_FUNC_RANGE(); - return detail::split(input, splits, cudf::default_stream_value); + return detail::split(input, splits, cudf::get_default_stream()); } std::vector split(cudf::table_view const& input, host_span splits) { CUDF_FUNC_RANGE(); - return detail::split(input, splits, cudf::default_stream_value); + return detail::split(input, splits, cudf::get_default_stream()); } std::vector split(column_view const& input, std::initializer_list splits) { CUDF_FUNC_RANGE(); - return detail::split(input, splits, cudf::default_stream_value); + return detail::split(input, splits, cudf::get_default_stream()); } std::vector split(table_view const& input, std::initializer_list splits) { CUDF_FUNC_RANGE(); - return detail::split(input, splits, cudf::default_stream_value); + return detail::split(input, splits, cudf::get_default_stream()); } } // namespace cudf diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu index ee026d6c395..db1d04259b5 100644 --- a/cpp/src/datetime/datetime_ops.cu +++ b/cpp/src/datetime/datetime_ops.cu @@ -76,9 +76,22 @@ struct extract_component_operator { if (time_since_midnight.count() < 0) { time_since_midnight += days(1); } - auto hrs_ = duration_cast(time_since_midnight); - auto mins_ = duration_cast(time_since_midnight - hrs_); - auto secs_ = duration_cast(time_since_midnight - hrs_ - mins_); + auto const hrs_ = [&] { return duration_cast(time_since_midnight); }; + auto const mins_ = [&] { return duration_cast(time_since_midnight) - hrs_(); }; + auto const secs_ = [&] { + return duration_cast(time_since_midnight) - hrs_() - mins_(); + }; + auto const millisecs_ = [&] { + return duration_cast(time_since_midnight) - hrs_() - mins_() - secs_(); + }; + auto const microsecs_ = [&] { + return 
duration_cast(time_since_midnight) - hrs_() - mins_() - secs_() - + millisecs_(); + }; + auto const nanosecs_ = [&] { + return duration_cast(time_since_midnight) - hrs_() - mins_() - secs_() - + millisecs_() - microsecs_(); + }; switch (Component) { case datetime_component::YEAR: @@ -89,9 +102,12 @@ struct extract_component_operator { return static_cast(year_month_day(days_since_epoch).day()); case datetime_component::WEEKDAY: return year_month_weekday(days_since_epoch).weekday().iso_encoding(); - case datetime_component::HOUR: return hrs_.count(); - case datetime_component::MINUTE: return mins_.count(); - case datetime_component::SECOND: return secs_.count(); + case datetime_component::HOUR: return hrs_().count(); + case datetime_component::MINUTE: return mins_().count(); + case datetime_component::SECOND: return secs_().count(); + case datetime_component::MILLISECOND: return millisecs_().count(); + case datetime_component::MICROSECOND: return microsecs_().count(); + case datetime_component::NANOSECOND: return nanosecs_().count(); default: return 0; } } @@ -495,6 +511,33 @@ std::unique_ptr extract_second(column_view const& column, cudf::type_id::INT16>(column, stream, mr); } +std::unique_ptr extract_millisecond_fraction(column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return detail::apply_datetime_op< + detail::extract_component_operator, + cudf::type_id::INT16>(column, stream, mr); +} + +std::unique_ptr extract_microsecond_fraction(column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return detail::apply_datetime_op< + detail::extract_component_operator, + cudf::type_id::INT16>(column, stream, mr); +} + +std::unique_ptr extract_nanosecond_fraction(column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return detail::apply_datetime_op< + detail::extract_component_operator, + cudf::type_id::INT16>(column, stream, 
mr); +} + std::unique_ptr last_day_of_month(column_view const& column, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -540,7 +583,7 @@ std::unique_ptr ceil_datetimes(column_view const& column, { CUDF_FUNC_RANGE(); return detail::round_general( - detail::rounding_function::CEIL, freq, column, cudf::default_stream_value, mr); + detail::rounding_function::CEIL, freq, column, cudf::get_default_stream(), mr); } std::unique_ptr floor_datetimes(column_view const& column, @@ -549,7 +592,7 @@ std::unique_ptr floor_datetimes(column_view const& column, { CUDF_FUNC_RANGE(); return detail::round_general( - detail::rounding_function::FLOOR, freq, column, cudf::default_stream_value, mr); + detail::rounding_function::FLOOR, freq, column, cudf::get_default_stream(), mr); } std::unique_ptr round_datetimes(column_view const& column, @@ -558,66 +601,87 @@ std::unique_ptr round_datetimes(column_view const& column, { CUDF_FUNC_RANGE(); return detail::round_general( - detail::rounding_function::ROUND, freq, column, cudf::default_stream_value, mr); + detail::rounding_function::ROUND, freq, column, cudf::get_default_stream(), mr); } std::unique_ptr extract_year(column_view const& column, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::extract_year(column, cudf::default_stream_value, mr); + return detail::extract_year(column, cudf::get_default_stream(), mr); } std::unique_ptr extract_month(column_view const& column, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::extract_month(column, cudf::default_stream_value, mr); + return detail::extract_month(column, cudf::get_default_stream(), mr); } std::unique_ptr extract_day(column_view const& column, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::extract_day(column, cudf::default_stream_value, mr); + return detail::extract_day(column, cudf::get_default_stream(), mr); } std::unique_ptr extract_weekday(column_view const& column, 
rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::extract_weekday(column, cudf::default_stream_value, mr); + return detail::extract_weekday(column, cudf::get_default_stream(), mr); } std::unique_ptr extract_hour(column_view const& column, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::extract_hour(column, cudf::default_stream_value, mr); + return detail::extract_hour(column, cudf::get_default_stream(), mr); } std::unique_ptr extract_minute(column_view const& column, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::extract_minute(column, cudf::default_stream_value, mr); + return detail::extract_minute(column, cudf::get_default_stream(), mr); } std::unique_ptr extract_second(column_view const& column, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::extract_second(column, cudf::default_stream_value, mr); + return detail::extract_second(column, cudf::get_default_stream(), mr); +} + +std::unique_ptr extract_millisecond_fraction(column_view const& column, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::extract_millisecond_fraction(column, cudf::get_default_stream(), mr); +} + +std::unique_ptr extract_microsecond_fraction(column_view const& column, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::extract_microsecond_fraction(column, cudf::get_default_stream(), mr); +} + +std::unique_ptr extract_nanosecond_fraction(column_view const& column, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::extract_nanosecond_fraction(column, cudf::get_default_stream(), mr); } std::unique_ptr last_day_of_month(column_view const& column, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::last_day_of_month(column, cudf::default_stream_value, mr); + return detail::last_day_of_month(column, cudf::get_default_stream(), mr); } std::unique_ptr day_of_year(column_view const& 
column, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::day_of_year(column, cudf::default_stream_value, mr); + return detail::day_of_year(column, cudf::get_default_stream(), mr); } std::unique_ptr add_calendrical_months(cudf::column_view const& timestamp_column, @@ -626,7 +690,7 @@ std::unique_ptr add_calendrical_months(cudf::column_view const& ti { CUDF_FUNC_RANGE(); return detail::add_calendrical_months( - timestamp_column, months_column, cudf::default_stream_value, mr); + timestamp_column, months_column, cudf::get_default_stream(), mr); } std::unique_ptr add_calendrical_months(cudf::column_view const& timestamp_column, @@ -634,27 +698,27 @@ std::unique_ptr add_calendrical_months(cudf::column_view const& ti rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::add_calendrical_months(timestamp_column, months, cudf::default_stream_value, mr); + return detail::add_calendrical_months(timestamp_column, months, cudf::get_default_stream(), mr); } std::unique_ptr is_leap_year(column_view const& column, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::is_leap_year(column, cudf::default_stream_value, mr); + return detail::is_leap_year(column, cudf::get_default_stream(), mr); } std::unique_ptr days_in_month(column_view const& column, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::days_in_month(column, cudf::default_stream_value, mr); + return detail::days_in_month(column, cudf::get_default_stream(), mr); } std::unique_ptr extract_quarter(column_view const& column, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::extract_quarter(column, cudf::default_stream_value, mr); + return detail::extract_quarter(column, cudf::get_default_stream(), mr); } } // namespace datetime diff --git a/cpp/src/dictionary/add_keys.cu b/cpp/src/dictionary/add_keys.cu index 3dea491b6e4..486e7d2d24b 100644 --- a/cpp/src/dictionary/add_keys.cu +++ 
b/cpp/src/dictionary/add_keys.cu @@ -44,11 +44,10 @@ namespace detail { * d2 is now {[a, b, c, d, e, f], [5, 0, 3, 1, 2, 2, 2, 5, 0]} * ``` */ -std::unique_ptr add_keys( - dictionary_column_view const& dictionary_column, - column_view const& new_keys, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr add_keys(dictionary_column_view const& dictionary_column, + column_view const& new_keys, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(!new_keys.has_nulls(), "Keys must not have nulls"); auto old_keys = dictionary_column.keys(); // [a,b,c,d,f] @@ -132,7 +131,7 @@ std::unique_ptr add_keys(dictionary_column_view const& dictionary_column rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::add_keys(dictionary_column, keys, cudf::default_stream_value, mr); + return detail::add_keys(dictionary_column, keys, cudf::get_default_stream(), mr); } } // namespace dictionary diff --git a/cpp/src/dictionary/decode.cu b/cpp/src/dictionary/decode.cu index 22e2ee578a0..01411d06b62 100644 --- a/cpp/src/dictionary/decode.cu +++ b/cpp/src/dictionary/decode.cu @@ -68,7 +68,7 @@ std::unique_ptr decode(dictionary_column_view const& source, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::decode(source, cudf::default_stream_value, mr); + return detail::decode(source, cudf::get_default_stream(), mr); } } // namespace dictionary diff --git a/cpp/src/dictionary/encode.cu b/cpp/src/dictionary/encode.cu index 4e8f992b633..fe8e777b694 100644 --- a/cpp/src/dictionary/encode.cu +++ b/cpp/src/dictionary/encode.cu @@ -92,7 +92,7 @@ std::unique_ptr encode(column_view const& input_column, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::encode(input_column, indices_type, cudf::default_stream_value, mr); + return detail::encode(input_column, indices_type, cudf::get_default_stream(), mr); } } // namespace dictionary 
diff --git a/cpp/src/dictionary/remove_keys.cu b/cpp/src/dictionary/remove_keys.cu index 4506ea98ca4..dcb877da686 100644 --- a/cpp/src/dictionary/remove_keys.cu +++ b/cpp/src/dictionary/remove_keys.cu @@ -56,11 +56,10 @@ namespace { * @param mr Device memory resource used to allocate the returned column's device memory. */ template -std::unique_ptr remove_keys_fn( - dictionary_column_view const& dictionary_column, - KeysKeeper keys_to_keep_fn, - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr remove_keys_fn(dictionary_column_view const& dictionary_column, + KeysKeeper keys_to_keep_fn, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto const keys_view = dictionary_column.keys(); auto const indices_type = dictionary_column.indices().type(); @@ -148,11 +147,10 @@ std::unique_ptr remove_keys_fn( } // namespace -std::unique_ptr remove_keys( - dictionary_column_view const& dictionary_column, - column_view const& keys_to_remove, - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr remove_keys(dictionary_column_view const& dictionary_column, + column_view const& keys_to_remove, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(!keys_to_remove.has_nulls(), "keys_to_remove must not have nulls"); auto const keys_view = dictionary_column.keys(); @@ -166,10 +164,9 @@ std::unique_ptr remove_keys( return remove_keys_fn(dictionary_column, key_matcher, stream, mr); } -std::unique_ptr remove_unused_keys( - dictionary_column_view const& dictionary_column, - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr remove_unused_keys(dictionary_column_view const& dictionary_column, + rmm::cuda_stream_view stream, + 
rmm::mr::device_memory_resource* mr) { // locate the keys to remove auto const keys_size = dictionary_column.keys_size(); @@ -200,14 +197,14 @@ std::unique_ptr remove_keys(dictionary_column_view const& dictionary_col rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::remove_keys(dictionary_column, keys_to_remove, cudf::default_stream_value, mr); + return detail::remove_keys(dictionary_column, keys_to_remove, cudf::get_default_stream(), mr); } std::unique_ptr remove_unused_keys(dictionary_column_view const& dictionary_column, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::remove_unused_keys(dictionary_column, cudf::default_stream_value, mr); + return detail::remove_unused_keys(dictionary_column, cudf::get_default_stream(), mr); } } // namespace dictionary diff --git a/cpp/src/dictionary/replace.cu b/cpp/src/dictionary/replace.cu index 4acc2d124b2..7069993866c 100644 --- a/cpp/src/dictionary/replace.cu +++ b/cpp/src/dictionary/replace.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -123,8 +123,9 @@ std::unique_ptr replace_nulls(dictionary_column_view const& input, // first add the replacement to the keys so only the indices need to be processed auto input_matched = dictionary::detail::add_keys( input, make_column_from_scalar(replacement, 1, stream)->view(), stream, mr); - auto const input_view = dictionary_column_view(input_matched->view()); - auto const scalar_index = get_index(input_view, replacement, stream); + auto const input_view = dictionary_column_view(input_matched->view()); + auto const scalar_index = + get_index(input_view, replacement, stream, rmm::mr::get_current_device_resource()); // now build the new indices by doing replace-null on the updated indices auto const input_indices = input_view.get_indices_annotated(); diff --git a/cpp/src/dictionary/search.cu b/cpp/src/dictionary/search.cu index 3936f7470e5..8e97a387780 100644 --- a/cpp/src/dictionary/search.cu +++ b/cpp/src/dictionary/search.cu @@ -79,7 +79,7 @@ struct find_index_fn { using ScalarType = cudf::scalar_type_t; auto find_key = static_cast(key).value(stream); auto keys_view = column_device_view::create(input.keys(), stream); - auto iter = thrust::equal_range(rmm::exec_policy(cudf::default_stream_value), + auto iter = thrust::equal_range(rmm::exec_policy(cudf::get_default_stream()), keys_view->begin(), keys_view->end(), find_key); @@ -179,7 +179,7 @@ std::unique_ptr get_index(dictionary_column_view const& dictionary, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::get_index(dictionary, key, cudf::default_stream_value, mr); + return detail::get_index(dictionary, key, cudf::get_default_stream(), mr); } } // namespace dictionary diff --git a/cpp/src/dictionary/set_keys.cu b/cpp/src/dictionary/set_keys.cu index 216f00c90e1..075fb6115e3 100644 --- a/cpp/src/dictionary/set_keys.cu +++ b/cpp/src/dictionary/set_keys.cu @@ -116,11 +116,10 @@ struct dispatch_compute_indices { } // namespace // -std::unique_ptr set_keys( - dictionary_column_view 
const& dictionary_column, - column_view const& new_keys, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr set_keys(dictionary_column_view const& dictionary_column, + column_view const& new_keys, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(!new_keys.has_nulls(), "keys parameter must not have nulls"); auto keys = dictionary_column.keys(); @@ -245,14 +244,14 @@ std::unique_ptr set_keys(dictionary_column_view const& dictionary_column rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::set_keys(dictionary_column, keys, cudf::default_stream_value, mr); + return detail::set_keys(dictionary_column, keys, cudf::get_default_stream(), mr); } std::vector> match_dictionaries( cudf::host_span input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::match_dictionaries(input, cudf::default_stream_value, mr); + return detail::match_dictionaries(input, cudf::get_default_stream(), mr); } } // namespace dictionary diff --git a/cpp/src/filling/calendrical_month_sequence.cu b/cpp/src/filling/calendrical_month_sequence.cu index d4b3e209c4a..f45634a615e 100644 --- a/cpp/src/filling/calendrical_month_sequence.cu +++ b/cpp/src/filling/calendrical_month_sequence.cu @@ -43,7 +43,7 @@ std::unique_ptr calendrical_month_sequence(size_type size, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::calendrical_month_sequence(size, init, months, cudf::default_stream_value, mr); + return detail::calendrical_month_sequence(size, init, months, cudf::get_default_stream(), mr); } } // namespace cudf diff --git a/cpp/src/filling/fill.cu b/cpp/src/filling/fill.cu index 2abb0cf9795..dac36032583 100644 --- a/cpp/src/filling/fill.cu +++ b/cpp/src/filling/fill.cu @@ -171,7 +171,8 @@ std::unique_ptr out_of_place_fill_range_dispatch::operator()view()).get_indices_annotated(); // get the index of the key just added - auto 
index_of_value = cudf::dictionary::detail::get_index(target_matched->view(), value, stream); + auto index_of_value = cudf::dictionary::detail::get_index( + target_matched->view(), value, stream, rmm::mr::get_current_device_resource()); // now call fill using just the indices column and the new index auto new_indices = cudf::type_dispatcher(target_indices.type(), @@ -248,7 +249,7 @@ void fill_in_place(mutable_column_view& destination, scalar const& value) { CUDF_FUNC_RANGE(); - return detail::fill_in_place(destination, begin, end, value, cudf::default_stream_value); + return detail::fill_in_place(destination, begin, end, value, cudf::get_default_stream()); } std::unique_ptr fill(column_view const& input, @@ -258,7 +259,7 @@ std::unique_ptr fill(column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::fill(input, begin, end, value, cudf::default_stream_value, mr); + return detail::fill(input, begin, end, value, cudf::get_default_stream(), mr); } } // namespace cudf diff --git a/cpp/src/filling/repeat.cu b/cpp/src/filling/repeat.cu index 90c644933ec..8d86a9d9827 100644 --- a/cpp/src/filling/repeat.cu +++ b/cpp/src/filling/repeat.cu @@ -103,7 +103,6 @@ namespace cudf { namespace detail { std::unique_ptr
repeat(table_view const& input_table, column_view const& count, - bool check_count, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -112,19 +111,12 @@ std::unique_ptr
repeat(table_view const& input_table, if (input_table.num_rows() == 0) { return cudf::empty_like(input_table); } - if (check_count) { cudf::type_dispatcher(count.type(), count_checker{count}, stream); } - auto count_iter = cudf::detail::indexalator_factory::make_input_iterator(count); rmm::device_uvector offsets(count.size(), stream); thrust::inclusive_scan( rmm::exec_policy(stream), count_iter, count_iter + count.size(), offsets.begin()); - if (check_count) { - CUDF_EXPECTS(thrust::is_sorted(rmm::exec_policy(stream), offsets.begin(), offsets.end()), - "count has negative values or the resulting table has too many rows."); - } - size_type output_size{offsets.back_element(stream)}; rmm::device_uvector indices(output_size, stream); thrust::upper_bound(rmm::exec_policy(stream), @@ -162,11 +154,10 @@ std::unique_ptr
repeat(table_view const& input_table, std::unique_ptr
repeat(table_view const& input_table, column_view const& count, - bool check_count, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::repeat(input_table, count, check_count, cudf::default_stream_value, mr); + return detail::repeat(input_table, count, cudf::get_default_stream(), mr); } std::unique_ptr
repeat(table_view const& input_table, @@ -174,7 +165,7 @@ std::unique_ptr
repeat(table_view const& input_table, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::repeat(input_table, count, cudf::default_stream_value, mr); + return detail::repeat(input_table, count, cudf::get_default_stream(), mr); } } // namespace cudf diff --git a/cpp/src/filling/sequence.cu b/cpp/src/filling/sequence.cu index a2ae3b9e70c..284e7c46347 100644 --- a/cpp/src/filling/sequence.cu +++ b/cpp/src/filling/sequence.cu @@ -154,7 +154,7 @@ std::unique_ptr sequence(size_type size, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::sequence(size, init, step, cudf::default_stream_value, mr); + return detail::sequence(size, init, step, cudf::get_default_stream(), mr); } std::unique_ptr sequence(size_type size, @@ -162,7 +162,7 @@ std::unique_ptr sequence(size_type size, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::sequence(size, init, cudf::default_stream_value, mr); + return detail::sequence(size, init, cudf::get_default_stream(), mr); } } // namespace cudf diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index cd54e921a4c..dde0037a8c3 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -196,7 +196,7 @@ std::pair, std::vector> groupby::aggr if (_keys.num_rows() == 0) { return std::pair(empty_like(_keys), empty_results(requests)); } - return dispatch_aggregation(requests, cudf::default_stream_value, mr); + return dispatch_aggregation(requests, cudf::get_default_stream(), mr); } // Compute scan requests @@ -214,13 +214,13 @@ std::pair, std::vector> groupby::scan if (_keys.num_rows() == 0) { return std::pair(empty_like(_keys), empty_results(requests)); } - return sort_scan(requests, cudf::default_stream_value, mr); + return sort_scan(requests, cudf::get_default_stream(), mr); } groupby::groups groupby::get_groups(table_view values, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - auto const stream = cudf::default_stream_value; + auto const 
stream = cudf::get_default_stream(); auto grouped_keys = helper().sorted_keys(stream, mr); auto const& group_offsets = helper().group_offsets(stream); @@ -252,7 +252,7 @@ std::pair, std::unique_ptr
> groupby::replace_nulls "Size mismatch between num_columns and replace_policies."); if (values.is_empty()) { return std::pair(empty_like(_keys), empty_like(values)); } - auto const stream = cudf::default_stream_value; + auto const stream = cudf::get_default_stream(); auto const& group_labels = helper().group_labels(stream); std::vector> results; @@ -298,7 +298,7 @@ std::pair, std::unique_ptr
> groupby::shift( [&](auto i) { return values.column(i).type() == fill_values[i].get().type(); }), "values and fill_value should have the same type."); - auto stream = cudf::default_stream_value; + auto stream = cudf::get_default_stream(); std::vector> results; auto const& group_offsets = helper().group_offsets(stream); std::transform( diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index c07833520ab..8410d499f1a 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -512,18 +512,33 @@ rmm::device_uvector extract_populated_keys(map_type const& map, { rmm::device_uvector populated_keys(num_keys, stream); - auto get_key = [] __device__(auto const& element) { return element.first; }; // first = key - auto get_key_it = thrust::make_transform_iterator(map.data(), get_key); - auto key_used = [unused = map.get_unused_key()] __device__(auto key) { return key != unused; }; - - auto end_it = thrust::copy_if(rmm::exec_policy(stream), - get_key_it, - get_key_it + map.capacity(), - populated_keys.begin(), - key_used); - - populated_keys.resize(std::distance(populated_keys.begin(), end_it), stream); + auto const get_key = [] __device__(auto const& element) { return element.first; }; // first = key + auto const key_used = [unused = map.get_unused_key()] __device__(auto key) { + return key != unused; + }; + auto key_itr = thrust::make_transform_iterator(map.data(), get_key); + + // thrust::copy_if has a bug where it cannot iterate over int-max values + // so if map.capacity() > int-max we'll call thrust::copy_if in chunks instead + auto const copy_size = + std::min(map.capacity(), static_cast(std::numeric_limits::max())); + auto const key_end = key_itr + map.capacity(); + auto pop_keys_itr = populated_keys.begin(); + + std::size_t output_size = 0; + while (key_itr != key_end) { + auto const copy_end = static_cast(std::distance(key_itr, key_end)) <= copy_size + ? 
key_end + : key_itr + copy_size; + auto const end_it = + thrust::copy_if(rmm::exec_policy(stream), key_itr, copy_end, pop_keys_itr, key_used); + auto const copied = std::distance(pop_keys_itr, end_it); + pop_keys_itr += copied; + output_size += copied; + key_itr = copy_end; + } + populated_keys.resize(output_size, stream); return populated_keys; } @@ -653,14 +668,6 @@ std::pair, std::vector> groupby( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto const has_nested_column = - std::any_of(keys.begin(), keys.end(), [](cudf::column_view const& col) { - return cudf::is_nested(col.type()); - }); - if (has_nested_column and include_null_keys == cudf::null_policy::EXCLUDE) { - CUDF_FAIL("Null keys of nested type cannot be excluded."); - } - cudf::detail::result_cache cache(requests.size()); std::unique_ptr
unique_keys = diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp index 55a0b89e446..e3d14f1deb7 100644 --- a/cpp/src/groupby/sort/aggregate.cpp +++ b/cpp/src/groupby/sort/aggregate.cpp @@ -701,7 +701,7 @@ void aggregate_result_functor::operator()(aggregation cons cache.add_result(values, agg, - cudf::detail::tdigest::group_tdigest( + cudf::tdigest::detail::group_tdigest( get_sorted_values(), helper.group_offsets(stream), helper.group_labels(stream), @@ -745,7 +745,7 @@ void aggregate_result_functor::operator()(aggregatio dynamic_cast(agg).max_centroids; cache.add_result(values, agg, - cudf::detail::tdigest::group_merge_tdigest(get_grouped_values(), + cudf::tdigest::detail::group_merge_tdigest(get_grouped_values(), helper.group_offsets(stream), helper.group_labels(stream), helper.num_groups(stream), diff --git a/cpp/src/groupby/sort/common_utils.cuh b/cpp/src/groupby/sort/common_utils.cuh new file mode 100644 index 00000000000..fe5d7c325ca --- /dev/null +++ b/cpp/src/groupby/sort/common_utils.cuh @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +namespace cudf::groupby::detail { +/** + * @brief Functor to compare two rows of a table in given permutation order + * + * This is useful to identify unique elements in a sorted order table, when the permutation order is + * the sorted order of the table. 
+ */ +template +struct permuted_row_equality_comparator { + /** + * @brief Constructs a permuted comparator object which compares two rows of the table in given + * permutation order + * + * @param comparator Equality comparator + * @param permutation The permutation map that specifies the effective ordering of + * `t`. Must be the same size as `t.num_rows()` + */ + permuted_row_equality_comparator(ComparatorT const& comparator, Iterator const permutation) + : _comparator{comparator}, _permutation{permutation} + { + } + + /** + * @brief Returns true if the two rows at the specified indices in the permuted + * order are equivalent. + * + * For example, comparing rows `i` and `j` is equivalent to comparing + * rows `permutation[i]` and `permutation[j]` in the original table. + * + * @param lhs The index of the first row + * @param rhs The index of the second row + * @returns true if the two specified rows in the permuted order are equivalent + */ + __device__ bool operator()(cudf::size_type lhs, cudf::size_type rhs) const + { + return _comparator(_permutation[lhs], _permutation[rhs]); + }; + + private: + ComparatorT const _comparator; + Iterator const _permutation; +}; +} // namespace cudf::groupby::detail diff --git a/cpp/src/groupby/sort/functors.hpp b/cpp/src/groupby/sort/functors.hpp index 748e34a583d..bcc190c745b 100644 --- a/cpp/src/groupby/sort/functors.hpp +++ b/cpp/src/groupby/sort/functors.hpp @@ -13,6 +13,8 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ +#pragma once + #include #include #include diff --git a/cpp/src/groupby/sort/group_nunique.cu b/cpp/src/groupby/sort/group_nunique.cu index b719698b6b5..c411e654913 100644 --- a/cpp/src/groupby/sort/group_nunique.cu +++ b/cpp/src/groupby/sort/group_nunique.cu @@ -16,7 +16,7 @@ #include #include -#include +#include #include #include @@ -33,82 +33,45 @@ namespace groupby { namespace detail { namespace { -template +template struct is_unique_iterator_fn { + using comparator_type = + typename cudf::experimental::row::equality::device_row_comparator; + Nullate nulls; column_device_view const v; - element_equality_comparator equal; + comparator_type equal; null_policy null_handling; size_type const* group_offsets; size_type const* group_labels; is_unique_iterator_fn(Nullate nulls, column_device_view const& v, + comparator_type const& equal, null_policy null_handling, size_type const* group_offsets, size_type const* group_labels) : nulls{nulls}, v{v}, - equal{nulls, v, v}, + equal{equal}, null_handling{null_handling}, group_offsets{group_offsets}, group_labels{group_labels} { } - __device__ size_type operator()(size_type i) + __device__ size_type operator()(size_type i) const { - bool is_input_countable = + auto const is_input_countable = !nulls || (null_handling == null_policy::INCLUDE || v.is_valid_nocheck(i)); - bool is_unique = is_input_countable && - (group_offsets[group_labels[i]] == i || // first element or - (not equal.template operator()(i, i - 1))); // new unique value in sorted + auto const is_unique = + is_input_countable && (group_offsets[group_labels[i]] == i || // first element or + (not equal(i, i - 1))); // new unique value in sorted return static_cast(is_unique); } }; - -struct nunique_functor { - template - std::enable_if_t(), std::unique_ptr> operator()( - column_view const& values, - cudf::device_span group_labels, - size_type const num_groups, - cudf::device_span group_offsets, - null_policy null_handling, - rmm::cuda_stream_view stream, - 
rmm::mr::device_memory_resource* mr) - { - auto result = make_numeric_column( - data_type(type_to_id()), num_groups, mask_state::UNALLOCATED, stream, mr); - - if (num_groups == 0) { return result; } - - auto values_view = column_device_view::create(values, stream); - auto is_unique_iterator = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - is_unique_iterator_fn{nullate::DYNAMIC{values.has_nulls()}, - *values_view, - null_handling, - group_offsets.data(), - group_labels.data()}); - thrust::reduce_by_key(rmm::exec_policy(stream), - group_labels.begin(), - group_labels.end(), - is_unique_iterator, - thrust::make_discard_iterator(), - result->mutable_view().begin()); - - return result; - } - - template - std::enable_if_t(), std::unique_ptr> operator()( - Args&&...) - { - CUDF_FAIL("list_view group_nunique not supported yet"); - } -}; } // namespace + std::unique_ptr group_nunique(column_view const& values, cudf::device_span group_labels, size_type const num_groups, @@ -121,15 +84,33 @@ std::unique_ptr group_nunique(column_view const& values, CUDF_EXPECTS(static_cast(values.size()) == group_labels.size(), "Size of values column should be same as that of group labels"); - return type_dispatcher(values.type(), - nunique_functor{}, - values, - group_labels, - num_groups, - group_offsets, - null_handling, - stream, - mr); + auto result = make_numeric_column( + data_type(type_to_id()), num_groups, mask_state::UNALLOCATED, stream, mr); + + if (num_groups == 0) { return result; } + + auto const values_view = table_view{{values}}; + auto const comparator = cudf::experimental::row::equality::self_comparator{values_view, stream}; + auto const d_equal = comparator.equal_to( + cudf::nullate::DYNAMIC{cudf::has_nested_nulls(values_view)}, null_equality::EQUAL); + + auto const d_values_view = column_device_view::create(values, stream); + auto const is_unique_iterator = + thrust::make_transform_iterator(thrust::counting_iterator(0), + 
is_unique_iterator_fn{nullate::DYNAMIC{values.has_nulls()}, + *d_values_view, + d_equal, + null_handling, + group_offsets.data(), + group_labels.data()}); + thrust::reduce_by_key(rmm::exec_policy(stream), + group_labels.begin(), + group_labels.end(), + is_unique_iterator, + thrust::make_discard_iterator(), + result->mutable_view().begin()); + + return result; } } // namespace detail diff --git a/cpp/src/groupby/sort/group_rank_scan.cu b/cpp/src/groupby/sort/group_rank_scan.cu index cce84384ef7..149f026ffe6 100644 --- a/cpp/src/groupby/sort/group_rank_scan.cu +++ b/cpp/src/groupby/sort/group_rank_scan.cu @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "common_utils.cuh" + #include #include #include @@ -21,7 +23,7 @@ #include #include #include -#include +#include #include #include @@ -39,36 +41,6 @@ namespace groupby { namespace detail { namespace { -/** - * @brief Functor to compare two rows of a table in given permutation order - * This is useful to identify unique elements in a sorted order table, when the permutation order is - * the sorted order of the table. - * - */ -template -struct permuted_comparator { - /** - * @brief comparator object which compares two rows of the table in given permutation order - * - * @param device_table Device table to compare - * @param permutation The permutation order, integer type column. 
- * @param has_nulls whether the table has nulls - */ - permuted_comparator(table_device_view device_table, Iterator const permutation, bool has_nulls) - : comparator(nullate::DYNAMIC{has_nulls}, device_table, device_table, null_equality::EQUAL), - permutation(permutation) - { - } - __device__ bool operator()(size_type index1, size_type index2) const - { - return comparator(permutation[index1], permutation[index2]); - }; - - private: - row_equality_comparator comparator; - Iterator const permutation; -}; - /** * @brief generate grouped row ranks or dense ranks using a row comparison then scan the results * @@ -99,32 +71,29 @@ std::unique_ptr rank_generator(column_view const& grouped_values, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto const flattened = cudf::structs::detail::flatten_nested_columns( - table_view{{grouped_values}}, {}, {}, structs::detail::column_nullability::MATCH_INCOMING); - auto const d_flat_order = table_device_view::create(flattened, stream); - auto sorted_index_order = value_order.begin(); - auto comparator = permuted_comparator(*d_flat_order, sorted_index_order, has_nulls); + auto const comparator = + cudf::experimental::row::equality::self_comparator{table_view{{grouped_values}}, stream}; + auto const d_equal = comparator.equal_to(cudf::nullate::DYNAMIC{has_nulls}, null_equality::EQUAL); + auto const permuted_equal = + permuted_row_equality_comparator(d_equal, value_order.begin()); - auto ranks = make_fixed_width_column(data_type{type_to_id()}, - flattened.flattened_columns().num_rows(), - mask_state::UNALLOCATED, - stream, - mr); + auto ranks = make_fixed_width_column( + data_type{type_to_id()}, grouped_values.size(), mask_state::UNALLOCATED, stream, mr); auto mutable_ranks = ranks->mutable_view(); auto unique_identifier = [labels = group_labels.begin(), offsets = group_offsets.begin(), - comparator, + permuted_equal, resolver] __device__(size_type row_index) { auto const group_start = 
offsets[labels[row_index]]; if constexpr (forward) { // First value of equal values is 1. - return resolver(row_index == group_start || !comparator(row_index, row_index - 1), + return resolver(row_index == group_start || !permuted_equal(row_index, row_index - 1), row_index - group_start); } else { auto const group_end = offsets[labels[row_index] + 1]; // Last value of equal values is 1. - return resolver(row_index + 1 == group_end || !comparator(row_index, row_index + 1), + return resolver(row_index + 1 == group_end || !permuted_equal(row_index, row_index + 1), row_index - group_start); } }; diff --git a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh index 93d5e6c032c..58ee06fcfef 100644 --- a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh +++ b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh @@ -25,7 +25,6 @@ #include #include #include -#include #include #include diff --git a/cpp/src/groupby/sort/scan.cpp b/cpp/src/groupby/sort/scan.cpp index 5d345273782..743ca5e8065 100644 --- a/cpp/src/groupby/sort/scan.cpp +++ b/cpp/src/groupby/sort/scan.cpp @@ -178,9 +178,9 @@ void scan_result_functor::operator()(aggregation const& agg) stream, mr); } - result = std::move(cudf::detail::scatter( - table_view{{*result}}, *gather_map, table_view{{*result}}, false, stream, mr) - ->release()[0]); + result = std::move( + cudf::detail::scatter(table_view{{*result}}, *gather_map, table_view{{*result}}, stream, mr) + ->release()[0]); if (rank_agg._null_handling == null_policy::EXCLUDE) { result->set_null_mask(cudf::detail::copy_bitmask(get_grouped_values(), stream, mr)); } diff --git a/cpp/src/groupby/sort/sort_helper.cu b/cpp/src/groupby/sort/sort_helper.cu index a0abaf71160..2bf63cb42fc 100644 --- a/cpp/src/groupby/sort/sort_helper.cu +++ b/cpp/src/groupby/sort/sort_helper.cu @@ -14,6 +14,8 @@ * limitations under the License. 
*/ +#include "common_utils.cuh" + #include #include #include @@ -26,7 +28,7 @@ #include #include #include -#include +#include #include #include @@ -44,48 +46,6 @@ #include #include -namespace { -/** - * @brief Compares two `table` rows for equality as if the table were - * ordered according to a specified permutation map. - */ -struct permuted_row_equality_comparator { - cudf::row_equality_comparator _comparator; - cudf::size_type const* _map; - - /** - * @brief Construct a permuted_row_equality_comparator. - * - * @param t The `table` whose rows will be compared - * @param map The permutation map that specifies the effective ordering of - * `t`. Must be the same size as `t.num_rows()` - */ - permuted_row_equality_comparator(cudf::table_device_view const& t, - cudf::size_type const* map, - bool nullable = true) - : _comparator(cudf::nullate::DYNAMIC{nullable}, t, t, cudf::null_equality::EQUAL), _map{map} - { - } - - /** - * @brief Returns true if the two rows at the specified indices in the permuted - * order are equivalent. - * - * For example, comparing rows `i` and `j` is - * equivalent to comparing rows `map[i]` and `map[j]` in the original table. 
- * - * @param lhs The index of the first row - * @param rhs The index of the second row - * @returns true if the two specified rows in the permuted order are equivalent - */ - __device__ inline bool operator()(cudf::size_type lhs, cudf::size_type rhs) - { - return _comparator(_map[lhs], _map[rhs]); - } -}; - -} // namespace - namespace cudf { namespace groupby { namespace detail { @@ -94,19 +54,13 @@ namespace sort { sort_groupby_helper::sort_groupby_helper(table_view const& keys, null_policy include_null_keys, sorted keys_pre_sorted) - : _unflattened_keys(keys), + : _keys(keys), _num_keys(-1), _keys_pre_sorted(keys_pre_sorted), _include_null_keys(include_null_keys) { using namespace cudf::structs::detail; - _flattened = flatten_nested_columns(keys, {}, {}, column_nullability::FORCE); - _keys = _flattened; - auto is_supported_key_type = [](auto col) { return cudf::is_equality_comparable(col.type()); }; - CUDF_EXPECTS(std::all_of(_keys.begin(), _keys.end(), is_supported_key_type), - "Unsupported groupby key type does not support equality comparison"); - // Cannot depend on caller's sorting if the column contains nulls, // and null values are to be excluded. // Re-sort the data, to filter out nulls more easily. 
@@ -191,16 +145,17 @@ sort_groupby_helper::index_vector const& sort_groupby_helper::group_offsets( _group_offsets = std::make_unique(num_keys(stream) + 1, stream); - auto device_input_table = table_device_view::create(_keys, stream); - auto sorted_order = key_sort_order(stream).data(); + auto const comparator = cudf::experimental::row::equality::self_comparator{_keys, stream}; + auto const d_key_equal = comparator.equal_to( + cudf::nullate::DYNAMIC{cudf::has_nested_nulls(_keys)}, null_equality::EQUAL); + auto const sorted_order = key_sort_order(stream).data(); decltype(_group_offsets->begin()) result_end; - result_end = thrust::unique_copy( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_keys(stream)), - _group_offsets->begin(), - permuted_row_equality_comparator(*device_input_table, sorted_order, has_nulls(_keys))); + result_end = thrust::unique_copy(rmm::exec_policy(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(num_keys(stream)), + _group_offsets->begin(), + permuted_row_equality_comparator(d_key_equal, sorted_order)); size_type num_groups = thrust::distance(_group_offsets->begin(), result_end); _group_offsets->set_element(num_groups, num_keys(stream), stream); @@ -244,7 +199,6 @@ column_view sort_groupby_helper::unsorted_keys_labels(rmm::cuda_stream_view stre cudf::detail::scatter(table_view({group_labels_view}), scatter_map, table_view({temp_labels->view()}), - false, stream, rmm::mr::get_current_device_resource()); @@ -316,7 +270,7 @@ std::unique_ptr
sort_groupby_helper::unique_keys(rmm::cuda_stream_view st auto gather_map_it = thrust::make_transform_iterator( group_offsets(stream).begin(), [idx_data] __device__(size_type i) { return idx_data[i]; }); - return cudf::detail::gather(_unflattened_keys, + return cudf::detail::gather(_keys, gather_map_it, gather_map_it + num_groups(stream), out_of_bounds_policy::DONT_CHECK, @@ -327,7 +281,7 @@ std::unique_ptr
sort_groupby_helper::unique_keys(rmm::cuda_stream_view st std::unique_ptr
sort_groupby_helper::sorted_keys(rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return cudf::detail::gather(_unflattened_keys, + return cudf::detail::gather(_keys, key_sort_order(stream), cudf::out_of_bounds_policy::DONT_CHECK, cudf::detail::negative_index_policy::NOT_ALLOWED, diff --git a/cpp/src/hash/concurrent_unordered_map.cuh b/cpp/src/hash/concurrent_unordered_map.cuh index c2081c596a1..f99aabc56bf 100644 --- a/cpp/src/hash/concurrent_unordered_map.cuh +++ b/cpp/src/hash/concurrent_unordered_map.cuh @@ -159,7 +159,7 @@ class concurrent_unordered_map { * storage */ static auto create(size_type capacity, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, const mapped_type unused_element = std::numeric_limits::max(), const key_type unused_key = std::numeric_limits::max(), const Hasher& hash_function = hasher(), @@ -421,8 +421,7 @@ class concurrent_unordered_map { } } - void assign_async(const concurrent_unordered_map& other, - rmm::cuda_stream_view stream = cudf::default_stream_value) + void assign_async(const concurrent_unordered_map& other, rmm::cuda_stream_view stream) { if (other.m_capacity <= m_capacity) { m_capacity = other.m_capacity; @@ -440,7 +439,7 @@ class concurrent_unordered_map { stream.value())); } - void clear_async(rmm::cuda_stream_view stream = cudf::default_stream_value) + void clear_async(rmm::cuda_stream_view stream) { constexpr int block_size = 128; init_hashtbl<<<((m_capacity - 1) / block_size) + 1, block_size, 0, stream.value()>>>( @@ -455,7 +454,7 @@ class concurrent_unordered_map { } } - void prefetch(const int dev_id, rmm::cuda_stream_view stream = cudf::default_stream_value) + void prefetch(const int dev_id, rmm::cuda_stream_view stream) { cudaPointerAttributes hashtbl_values_ptr_attributes; cudaError_t status = cudaPointerGetAttributes(&hashtbl_values_ptr_attributes, m_hashtbl_values); @@ -475,7 +474,7 @@ class concurrent_unordered_map { * * @param stream CUDA stream 
used for device memory operations and kernel launches. */ - void destroy(rmm::cuda_stream_view stream = cudf::default_stream_value) + void destroy(rmm::cuda_stream_view stream) { m_allocator.deallocate(m_hashtbl_values, m_capacity, stream); delete this; @@ -516,7 +515,7 @@ class concurrent_unordered_map { const Hasher& hash_function, const Equality& equal, const allocator_type& allocator, - rmm::cuda_stream_view stream = cudf::default_stream_value) + rmm::cuda_stream_view stream) : m_hf(hash_function), m_equal(equal), m_allocator(allocator), diff --git a/cpp/src/hash/hash_allocator.cuh b/cpp/src/hash/hash_allocator.cuh index 2da0a4fb4bd..207f46ae543 100644 --- a/cpp/src/hash/hash_allocator.cuh +++ b/cpp/src/hash/hash_allocator.cuh @@ -14,8 +14,7 @@ * limitations under the License. */ -#ifndef HASH_ALLOCATOR_CUH -#define HASH_ALLOCATOR_CUH +#pragma once #include @@ -26,42 +25,6 @@ #include #include -template -struct managed_allocator { - using value_type = T; - rmm::mr::device_memory_resource* mr = new rmm::mr::managed_memory_resource; - - managed_allocator() = default; - - template - constexpr managed_allocator(const managed_allocator&) noexcept - { - } - - T* allocate(std::size_t n, rmm::cuda_stream_view stream = cudf::default_stream_value) const - { - return static_cast(mr->allocate(n * sizeof(T), stream)); - } - - void deallocate(T* p, - std::size_t n, - rmm::cuda_stream_view stream = cudf::default_stream_value) const - { - mr->deallocate(p, n * sizeof(T), stream); - } -}; - -template -bool operator==(const managed_allocator&, const managed_allocator&) -{ - return true; -} -template -bool operator!=(const managed_allocator&, const managed_allocator&) -{ - return false; -} - template struct default_allocator { using value_type = T; @@ -74,14 +37,14 @@ struct default_allocator { { } - T* allocate(std::size_t n, rmm::cuda_stream_view stream = cudf::default_stream_value) const + T* allocate(std::size_t n, rmm::cuda_stream_view stream = cudf::get_default_stream()) 
const { return static_cast(mr->allocate(n * sizeof(T), stream)); } void deallocate(T* p, std::size_t n, - rmm::cuda_stream_view stream = cudf::default_stream_value) const + rmm::cuda_stream_view stream = cudf::get_default_stream()) const { mr->deallocate(p, n * sizeof(T), stream); } @@ -97,5 +60,3 @@ bool operator!=(const default_allocator&, const default_allocator&) { return false; } - -#endif diff --git a/cpp/src/hash/hashing.cu b/cpp/src/hash/hashing.cu index e5fac1e7c2c..150017d9117 100644 --- a/cpp/src/hash/hashing.cu +++ b/cpp/src/hash/hashing.cu @@ -74,7 +74,7 @@ std::unique_ptr hash(table_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::hash(input, hash_function, seed, cudf::default_stream_value, mr); + return detail::hash(input, hash_function, seed, cudf::get_default_stream(), mr); } } // namespace cudf diff --git a/cpp/src/hash/helper_functions.cuh b/cpp/src/hash/helper_functions.cuh index 3b8d8528894..70fc47538c9 100644 --- a/cpp/src/hash/helper_functions.cuh +++ b/cpp/src/hash/helper_functions.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020, NVIDIA CORPORATION. + * Copyright (c) 2017-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,8 +14,7 @@ * limitations under the License. */ -#ifndef HELPER_FUNCTIONS_CUH -#define HELPER_FUNCTIONS_CUH +#pragma once #include @@ -242,5 +241,3 @@ __host__ __device__ bool operator!=(const cycle_iterator_adapter& lhs, { return !lhs.equal(rhs); } - -#endif // HELPER_FUNCTIONS_CUH diff --git a/cpp/src/hash/managed.cuh b/cpp/src/hash/managed.cuh index c5aab78589e..d85a12c69a9 100644 --- a/cpp/src/hash/managed.cuh +++ b/cpp/src/hash/managed.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, NVIDIA CORPORATION. + * Copyright (c) 2017-2022, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,8 +14,7 @@ * limitations under the License. */ -#ifndef MANAGED_CUH -#define MANAGED_CUH +#pragma once #include @@ -43,5 +42,3 @@ inline bool isPtrManaged(cudaPointerAttributes attr) return attr.isManaged; #endif } - -#endif // MANAGED_CUH diff --git a/cpp/src/interop/dlpack.cpp b/cpp/src/interop/dlpack.cpp index 7b300924dd5..58afc8e9015 100644 --- a/cpp/src/interop/dlpack.cpp +++ b/cpp/src/interop/dlpack.cpp @@ -299,13 +299,13 @@ std::unique_ptr
from_dlpack(DLManagedTensor const* managed_tensor, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::from_dlpack(managed_tensor, cudf::default_stream_value, mr); + return detail::from_dlpack(managed_tensor, cudf::get_default_stream(), mr); } DLManagedTensor* to_dlpack(table_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::to_dlpack(input, cudf::default_stream_value, mr); + return detail::to_dlpack(input, cudf::get_default_stream(), mr); } } // namespace cudf diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu index 86ea6f4427e..2d4501ec9f7 100644 --- a/cpp/src/interop/from_arrow.cu +++ b/cpp/src/interop/from_arrow.cu @@ -450,7 +450,7 @@ std::unique_ptr
from_arrow(arrow::Table const& input_table, { CUDF_FUNC_RANGE(); - return detail::from_arrow(input_table, cudf::default_stream_value, mr); + return detail::from_arrow(input_table, cudf::get_default_stream(), mr); } } // namespace cudf diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index eeb27c2ac05..fb203e6c3c1 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -154,7 +154,7 @@ std::shared_ptr dispatch_to_arrow::operator()( auto count = thrust::make_counting_iterator(0); - thrust::for_each(rmm::exec_policy(cudf::default_stream_value), + thrust::for_each(rmm::exec_policy(cudf::get_default_stream()), count, count + input.size(), [in = input.begin(), out = buf.data()] __device__(auto in_idx) { @@ -416,7 +416,7 @@ std::shared_ptr to_arrow(table_view input, arrow::MemoryPool* ar_mr) { CUDF_FUNC_RANGE(); - return detail::to_arrow(input, metadata, cudf::default_stream_value, ar_mr); + return detail::to_arrow(input, metadata, cudf::get_default_stream(), ar_mr); } } // namespace cudf diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp index fd794b2e66c..fd0cbeced3a 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cpp +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -31,46 +31,23 @@ #include NVCOMP_ZSTD_HEADER #endif -#if NVCOMP_MAJOR_VERSION > 2 or (NVCOMP_MAJOR_VERSION == 2 and NVCOMP_MINOR_VERSION >= 3) -#define NVCOMP_HAS_ZSTD_DECOMP 1 -#else -#define NVCOMP_HAS_ZSTD_DECOMP 0 -#endif +#define NVCOMP_HAS_ZSTD_DECOMP(MAJOR, MINOR, PATCH) (MAJOR > 2 or (MAJOR == 2 and MINOR >= 3)) -#if NVCOMP_MAJOR_VERSION > 2 or (NVCOMP_MAJOR_VERSION == 2 and NVCOMP_MINOR_VERSION >= 4) -#define NVCOMP_HAS_ZSTD_COMP 1 -#else -#define NVCOMP_HAS_ZSTD_COMP 0 -#endif +#define NVCOMP_HAS_ZSTD_COMP(MAJOR, MINOR, PATCH) (MAJOR > 2 or (MAJOR == 2 and MINOR >= 4)) -#if NVCOMP_MAJOR_VERSION > 2 or (NVCOMP_MAJOR_VERSION == 2 and NVCOMP_MINOR_VERSION >= 3) -#define NVCOMP_HAS_DEFLATE 1 -#else -#define NVCOMP_HAS_DEFLATE 0 
-#endif +#define NVCOMP_HAS_DEFLATE(MAJOR, MINOR, PATCH) (MAJOR > 2 or (MAJOR == 2 and MINOR >= 5)) -#if NVCOMP_MAJOR_VERSION > 2 or (NVCOMP_MAJOR_VERSION == 2 and NVCOMP_MINOR_VERSION > 3) or \ - (NVCOMP_MAJOR_VERSION == 2 and NVCOMP_MINOR_VERSION == 3 and NVCOMP_PATCH_VERSION >= 1) -#define NVCOMP_HAS_TEMPSIZE_EX 1 -#else -#define NVCOMP_HAS_TEMPSIZE_EX 0 -#endif +#define NVCOMP_HAS_TEMPSIZE_EX(MAJOR, MINOR, PATCH) \ + (MAJOR > 2 or (MAJOR == 2 and MINOR > 3) or (MAJOR == 2 and MINOR == 3 and PATCH >= 1)) // ZSTD is stable for nvcomp 2.3.2 or newer -#if NVCOMP_MAJOR_VERSION > 2 or (NVCOMP_MAJOR_VERSION == 2 and NVCOMP_MINOR_VERSION > 3) or \ - (NVCOMP_MAJOR_VERSION == 2 and NVCOMP_MINOR_VERSION == 3 and NVCOMP_PATCH_VERSION >= 2) -#define NVCOMP_ZSTD_IS_STABLE 1 -#else -#define NVCOMP_ZSTD_IS_STABLE 0 -#endif +#define NVCOMP_ZSTD_DECOMP_IS_STABLE(MAJOR, MINOR, PATCH) \ + (MAJOR > 2 or (MAJOR == 2 and MINOR > 3) or (MAJOR == 2 and MINOR == 3 and PATCH >= 2)) // Issue https://github.com/NVIDIA/spark-rapids/issues/6614 impacts nvCOMP 2.4.0 ZSTD decompression // on compute 6.x -#if NVCOMP_MAJOR_VERSION == 2 and NVCOMP_MINOR_VERSION == 4 and NVCOMP_PATCH_VERSION == 0 -#define NVCOMP_ZSTD_IS_DISABLED_ON_PASCAL 1 -#else -#define NVCOMP_ZSTD_IS_DISABLED_ON_PASCAL 0 -#endif +#define NVCOMP_ZSTD_IS_DISABLED_ON_PASCAL(MAJOR, MINOR, PATCH) \ + (MAJOR == 2 and MINOR == 4 and PATCH == 0) namespace cudf::io::nvcomp { @@ -79,12 +56,12 @@ template std::optional batched_decompress_get_temp_size_ex(compression_type compression, Args&&... 
args) { -#if NVCOMP_HAS_TEMPSIZE_EX +#if NVCOMP_HAS_TEMPSIZE_EX(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION) switch (compression) { case compression_type::SNAPPY: return nvcompBatchedSnappyDecompressGetTempSizeEx(std::forward(args)...); case compression_type::ZSTD: -#if NVCOMP_HAS_ZSTD_DECOMP +#if NVCOMP_HAS_ZSTD_DECOMP(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION) return nvcompBatchedZstdDecompressGetTempSizeEx(std::forward(args)...); #else return std::nullopt; @@ -104,16 +81,18 @@ auto batched_decompress_get_temp_size(compression_type compression, Args&&... ar case compression_type::SNAPPY: return nvcompBatchedSnappyDecompressGetTempSize(std::forward(args)...); case compression_type::ZSTD: -#if NVCOMP_HAS_ZSTD_DECOMP +#if NVCOMP_HAS_ZSTD_DECOMP(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION) return nvcompBatchedZstdDecompressGetTempSize(std::forward(args)...); #else - CUDF_FAIL("Unsupported compression type"); + CUDF_FAIL("Decompression error: " + + nvcomp::is_decompression_disabled(nvcomp::compression_type::ZSTD).value()); #endif case compression_type::DEFLATE: -#if NVCOMP_HAS_DEFLATE +#if NVCOMP_HAS_DEFLATE(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION) return nvcompBatchedDeflateDecompressGetTempSize(std::forward(args)...); #else - CUDF_FAIL("Unsupported compression type"); + CUDF_FAIL("Decompression error: " + + nvcomp::is_decompression_disabled(nvcomp::compression_type::DEFLATE).value()); #endif default: CUDF_FAIL("Unsupported compression type"); } @@ -127,16 +106,18 @@ auto batched_decompress_async(compression_type compression, Args&&... 
args) case compression_type::SNAPPY: return nvcompBatchedSnappyDecompressAsync(std::forward(args)...); case compression_type::ZSTD: -#if NVCOMP_HAS_ZSTD_DECOMP +#if NVCOMP_HAS_ZSTD_DECOMP(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION) return nvcompBatchedZstdDecompressAsync(std::forward(args)...); #else - CUDF_FAIL("Unsupported compression type"); + CUDF_FAIL("Decompression error: " + + nvcomp::is_decompression_disabled(nvcomp::compression_type::ZSTD).value()); #endif case compression_type::DEFLATE: -#if NVCOMP_HAS_DEFLATE +#if NVCOMP_HAS_DEFLATE(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION) return nvcompBatchedDeflateDecompressAsync(std::forward(args)...); #else - CUDF_FAIL("Unsupported compression type"); + CUDF_FAIL("Decompression error: " + + nvcomp::is_decompression_disabled(nvcomp::compression_type::DEFLATE).value()); #endif default: CUDF_FAIL("Unsupported compression type"); } @@ -163,22 +144,6 @@ size_t batched_decompress_temp_size(compression_type compression, return temp_size; } -void check_is_zstd_enabled() -{ - CUDF_EXPECTS(NVCOMP_HAS_ZSTD_DECOMP, "nvCOMP 2.3 or newer is required for Zstandard compression"); - CUDF_EXPECTS(NVCOMP_ZSTD_IS_STABLE or cudf::io::detail::nvcomp_integration::is_all_enabled(), - "Zstandard compression is experimental, you can enable it through " - "`LIBCUDF_NVCOMP_POLICY` environment variable."); - -#if NVCOMP_ZSTD_IS_DISABLED_ON_PASCAL - int device; - int cc_major; - CUDF_CUDA_TRY(cudaGetDevice(&device)); - CUDF_CUDA_TRY(cudaDeviceGetAttribute(&cc_major, cudaDevAttrComputeCapabilityMajor, device)); - CUDF_EXPECTS(cc_major != 6, "Zstandard decompression is disabled on Pascal GPUs"); -#endif -} - void batched_decompress(compression_type compression, device_span const> inputs, device_span const> outputs, @@ -187,8 +152,6 @@ void batched_decompress(compression_type compression, size_t max_total_uncomp_size, rmm::cuda_stream_view stream) { - if (compression == compression_type::ZSTD) { 
check_is_zstd_enabled(); } - auto const num_chunks = inputs.size(); // cuDF inflate inputs converted to nvcomp inputs @@ -228,20 +191,22 @@ auto batched_compress_temp_size(compression_type compression, batch_size, max_uncompressed_chunk_bytes, nvcompBatchedSnappyDefaultOpts, &temp_size); break; case compression_type::DEFLATE: -#if NVCOMP_HAS_DEFLATE +#if NVCOMP_HAS_DEFLATE(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION) nvcomp_status = nvcompBatchedDeflateCompressGetTempSize( batch_size, max_uncompressed_chunk_bytes, nvcompBatchedDeflateDefaultOpts, &temp_size); break; #else - CUDF_FAIL("Unsupported compression type"); + CUDF_FAIL("Compression error: " + + nvcomp::is_compression_disabled(nvcomp::compression_type::DEFLATE).value()); #endif case compression_type::ZSTD: -#if NVCOMP_HAS_ZSTD_COMP +#if NVCOMP_HAS_ZSTD_COMP(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION) nvcomp_status = nvcompBatchedZstdCompressGetTempSize( batch_size, max_uncompressed_chunk_bytes, nvcompBatchedZstdDefaultOpts, &temp_size); break; #else - CUDF_FAIL("Unsupported compression type"); + CUDF_FAIL("Compression error: " + + nvcomp::is_compression_disabled(nvcomp::compression_type::ZSTD).value()); #endif default: CUDF_FAIL("Unsupported compression type"); } @@ -266,20 +231,22 @@ size_t compress_max_output_chunk_size(compression_type compression, capped_uncomp_bytes, nvcompBatchedSnappyDefaultOpts, &max_comp_chunk_size); break; case compression_type::DEFLATE: -#if NVCOMP_HAS_DEFLATE +#if NVCOMP_HAS_DEFLATE(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION) status = nvcompBatchedDeflateCompressGetMaxOutputChunkSize( capped_uncomp_bytes, nvcompBatchedDeflateDefaultOpts, &max_comp_chunk_size); break; #else - CUDF_FAIL("Unsupported compression type"); + CUDF_FAIL("Compression error: " + + nvcomp::is_compression_disabled(nvcomp::compression_type::DEFLATE).value()); #endif case compression_type::ZSTD: -#if NVCOMP_HAS_ZSTD_COMP +#if 
NVCOMP_HAS_ZSTD_COMP(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION) status = nvcompBatchedZstdCompressGetMaxOutputChunkSize( capped_uncomp_bytes, nvcompBatchedZstdDefaultOpts, &max_comp_chunk_size); break; #else - CUDF_FAIL("Unsupported compression type"); + CUDF_FAIL("Compression error: " + + nvcomp::is_compression_disabled(nvcomp::compression_type::ZSTD).value()); #endif default: CUDF_FAIL("Unsupported compression type"); } @@ -316,7 +283,7 @@ static void batched_compress_async(compression_type compression, stream.value()); break; case compression_type::DEFLATE: -#if NVCOMP_HAS_DEFLATE +#if NVCOMP_HAS_DEFLATE(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION) nvcomp_status = nvcompBatchedDeflateCompressAsync(device_uncompressed_ptrs, device_uncompressed_bytes, max_uncompressed_chunk_bytes, @@ -329,10 +296,11 @@ static void batched_compress_async(compression_type compression, stream.value()); break; #else - CUDF_FAIL("Unsupported compression type"); + CUDF_FAIL("Compression error: " + + nvcomp::is_compression_disabled(nvcomp::compression_type::DEFLATE).value()); #endif case compression_type::ZSTD: -#if NVCOMP_HAS_ZSTD_COMP +#if NVCOMP_HAS_ZSTD_COMP(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION) nvcomp_status = nvcompBatchedZstdCompressAsync(device_uncompressed_ptrs, device_uncompressed_bytes, max_uncompressed_chunk_bytes, @@ -345,7 +313,8 @@ static void batched_compress_async(compression_type compression, stream.value()); break; #else - CUDF_FAIL("Unsupported compression type"); + CUDF_FAIL("Compression error: " + + nvcomp::is_compression_disabled(nvcomp::compression_type::ZSTD).value()); #endif default: CUDF_FAIL("Unsupported compression type"); } @@ -390,18 +359,109 @@ void batched_compress(compression_type compression, update_compression_results(actual_compressed_data_sizes, results, stream); } -bool is_compression_enabled(compression_type compression) +feature_status_parameters::feature_status_parameters() + 
: lib_major_version{NVCOMP_MAJOR_VERSION}, + lib_minor_version{NVCOMP_MINOR_VERSION}, + lib_patch_version{NVCOMP_PATCH_VERSION}, + are_all_integrations_enabled{detail::nvcomp_integration::is_all_enabled()}, + are_stable_integrations_enabled{detail::nvcomp_integration::is_stable_enabled()} +{ + int device; + CUDF_CUDA_TRY(cudaGetDevice(&device)); + CUDF_CUDA_TRY( + cudaDeviceGetAttribute(&compute_capability_major, cudaDevAttrComputeCapabilityMajor, device)); +} + +std::optional is_compression_disabled(compression_type compression, + feature_status_parameters params) { switch (compression) { - case compression_type::DEFLATE: - // See https://github.com/rapidsai/cudf/issues/11812 - return false; - case compression_type::SNAPPY: return detail::nvcomp_integration::is_stable_enabled(); - case compression_type::ZSTD: - return NVCOMP_HAS_ZSTD_COMP and detail::nvcomp_integration::is_all_enabled(); - default: return false; + case compression_type::DEFLATE: { + if (not NVCOMP_HAS_DEFLATE( + params.lib_major_version, params.lib_minor_version, params.lib_patch_version)) { + return "nvCOMP 2.5 or newer is required for Deflate compression"; + } + if (not params.are_all_integrations_enabled) { + return "DEFLATE compression is experimental, you can enable it through " + "`LIBCUDF_NVCOMP_POLICY` environment variable."; + } + return std::nullopt; + } + case compression_type::SNAPPY: { + if (not params.are_stable_integrations_enabled) { + return "Snappy compression has been disabled through the `LIBCUDF_NVCOMP_POLICY` " + "environment variable."; + } + return std::nullopt; + } + case compression_type::ZSTD: { + if (not NVCOMP_HAS_ZSTD_COMP( + params.lib_major_version, params.lib_minor_version, params.lib_patch_version)) { + return "nvCOMP 2.4 or newer is required for Zstandard compression"; + } + if (not params.are_stable_integrations_enabled) { + return "Zstandard compression is experimental, you can enable it through " + "`LIBCUDF_NVCOMP_POLICY` environment variable."; + } + return 
std::nullopt; + } + default: return "Unsupported compression type"; + } + return "Unsupported compression type"; +} + +std::optional is_zstd_decomp_disabled(feature_status_parameters const& params) +{ + if (not NVCOMP_HAS_ZSTD_DECOMP( + params.lib_major_version, params.lib_minor_version, params.lib_patch_version)) { + return "nvCOMP 2.3 or newer is required for Zstandard decompression"; + } + + if (NVCOMP_ZSTD_DECOMP_IS_STABLE( + params.lib_major_version, params.lib_minor_version, params.lib_patch_version)) { + if (not params.are_stable_integrations_enabled) { + return "Zstandard decompression has been disabled through the `LIBCUDF_NVCOMP_POLICY` " + "environment variable."; + } + } else if (not params.are_all_integrations_enabled) { + return "Zstandard decompression is experimental, you can enable it through " + "`LIBCUDF_NVCOMP_POLICY` environment variable."; + } + + if (NVCOMP_ZSTD_IS_DISABLED_ON_PASCAL( + params.lib_major_version, params.lib_minor_version, params.lib_patch_version) and + params.compute_capability_major == 6) { + return "Zstandard decompression is disabled on Pascal GPUs"; + } + return std::nullopt; +} + +std::optional is_decompression_disabled(compression_type compression, + feature_status_parameters params) +{ + switch (compression) { + case compression_type::DEFLATE: { + if (not NVCOMP_HAS_DEFLATE( + params.lib_major_version, params.lib_minor_version, params.lib_patch_version)) { + return "nvCOMP 2.5 or newer is required for Deflate decompression"; + } + if (not params.are_all_integrations_enabled) { + return "DEFLATE decompression is experimental, you can enable it through " + "`LIBCUDF_NVCOMP_POLICY` environment variable."; + } + return std::nullopt; + } + case compression_type::SNAPPY: { + if (not params.are_stable_integrations_enabled) { + return "Snappy decompression has been disabled through the `LIBCUDF_NVCOMP_POLICY` " + "environment variable."; + } + return std::nullopt; + } + case compression_type::ZSTD: return 
is_zstd_decomp_disabled(params); + default: return "Unsupported compression type"; } - return false; + return "Unsupported compression type"; } size_t compress_input_alignment_bits(compression_type compression) @@ -430,10 +490,11 @@ std::optional compress_max_allowed_chunk_size(compression_type compressi case compression_type::DEFLATE: return 64 * 1024; case compression_type::SNAPPY: return std::nullopt; case compression_type::ZSTD: -#if NVCOMP_HAS_ZSTD_COMP +#if NVCOMP_HAS_ZSTD_COMP(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION) return nvcompZstdCompressionMaxAllowedChunkSize; #else - CUDF_FAIL("Unsupported compression type"); + CUDF_FAIL("Compression error: " + + nvcomp::is_compression_disabled(nvcomp::compression_type::ZSTD).value()); #endif default: return std::nullopt; } diff --git a/cpp/src/io/comp/nvcomp_adapter.hpp b/cpp/src/io/comp/nvcomp_adapter.hpp index a13cb031163..a6bde7957c7 100644 --- a/cpp/src/io/comp/nvcomp_adapter.hpp +++ b/cpp/src/io/comp/nvcomp_adapter.hpp @@ -18,6 +18,8 @@ #include "gpuinflate.hpp" +#include + #include #include @@ -30,14 +32,52 @@ namespace cudf::io::nvcomp { enum class compression_type { SNAPPY, ZSTD, DEFLATE }; /** - * @brief Whether the given compression type is enabled through nvCOMP. + * @brief Set of parameters that impact whether the use nvCOMP features is enabled. 
+ */ +struct feature_status_parameters { + int lib_major_version; + int lib_minor_version; + int lib_patch_version; + bool are_all_integrations_enabled; + bool are_stable_integrations_enabled; + int compute_capability_major; + + feature_status_parameters(); + feature_status_parameters( + int major, int minor, int patch, bool all_enabled, bool stable_enabled, int cc_major) + : lib_major_version{major}, + lib_minor_version{minor}, + lib_patch_version{patch}, + are_all_integrations_enabled{all_enabled}, + are_stable_integrations_enabled{stable_enabled}, + compute_capability_major{cc_major} + { + } +}; + +/** + * @brief If a compression type is disabled through nvCOMP, returns the reason as a string. + * + * Result cab depend on nvCOMP version and environment variables. + * + * @param compression Compression type + * @param params Optional parameters to query status with different configurations + * @returns Reason for the feature disablement, `std::nullopt` if the feature is enabled + */ +[[nodiscard]] std::optional is_compression_disabled( + compression_type compression, feature_status_parameters params = feature_status_parameters()); + +/** + * @brief If a decompression type is disabled through nvCOMP, returns the reason as a string. * - * Result depends on nvCOMP version and environment variables. + * Result can depend on nvCOMP version and environment variables. * * @param compression Compression type - * @returns true if nvCOMP use is enabled; false otherwise + * @param params Optional parameters to query status with different configurations + * @returns Reason for the feature disablement, `std::nullopt` if the feature is enabled */ -[[nodiscard]] bool is_compression_enabled(compression_type compression); +[[nodiscard]] std::optional is_decompression_disabled( + compression_type compression, feature_status_parameters params = feature_status_parameters()); /** * @brief Device batch decompression of given type. 
diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index d669dea3115..075e9e2c965 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -134,7 +134,6 @@ std::vector get_column_names(std::vector const& header, if (header.size() <= 1) { return col_names; } std::vector first_row = header; - int num_cols = 0; bool quotation = false; for (size_t pos = 0, prev = 0; pos < first_row.size(); ++pos) { @@ -163,17 +162,16 @@ std::vector get_column_names(std::vector const& header, const string new_col_name(first_row.data() + prev, col_name_len); col_names.push_back(removeQuotes(new_col_name, parse_opts.quotechar)); - - // Stop parsing when we hit the line terminator; relevant when there is - // a blank line following the header. In this case, first_row includes - // multiple line terminators at the end, as the new recStart belongs to - // a line that comes after the blank line(s) - if (!quotation && first_row[pos] == parse_opts.terminator) { break; } } else { // This is the first data row, add the automatically generated name - col_names.push_back(prefix + std::to_string(num_cols)); + col_names.push_back(prefix + std::to_string(col_names.size())); } - num_cols++; + + // Stop parsing when we hit the line terminator; relevant when there is + // a blank line following the header. 
In this case, first_row includes + // multiple line terminators at the end, as the new recStart belongs to + // a line that comes after the blank line(s) + if (!quotation && first_row[pos] == parse_opts.terminator) { break; } // Skip adjacent delimiters if delim_whitespace is set while (parse_opts.multi_delimiter && pos < first_row.size() && @@ -540,8 +538,7 @@ void infer_column_types(parse_options const& parse_opts, auto const& stats = column_stats[inf_col_idx++]; unsigned long long int_count_total = stats.big_int_count + stats.negative_small_int_count + stats.positive_small_int_count; - - if (stats.null_count == num_records) { + if (stats.null_count == num_records or stats.total_count() == 0) { // Entire column is NULL; allocate the smallest amount of memory column_types[col_idx] = data_type(cudf::type_id::INT8); } else if (stats.string_count > 0L) { @@ -679,32 +676,37 @@ table_with_metadata read_csv(cudf::io::datasource* source, auto const& data = data_row_offsets.first; auto const& row_offsets = data_row_offsets.second; - // Exclude the end-of-data row from number of rows with actual data - auto num_records = std::max(row_offsets.size(), 1ul) - 1; - auto column_flags = std::vector(); - auto column_names = std::vector(); - auto num_actual_columns = static_cast(reader_opts.get_names().size()); - auto num_active_columns = num_actual_columns; - - // Check if the user gave us a list of column names - if (not reader_opts.get_names().empty()) { - column_flags.resize(reader_opts.get_names().size(), - column_parse::enabled | column_parse::inferred); - column_names = reader_opts.get_names(); - } else { - column_names = get_column_names( - header, parse_opts.view(), reader_opts.get_header(), reader_opts.get_prefix()); - - num_actual_columns = num_active_columns = column_names.size(); - - column_flags.resize(num_actual_columns, column_parse::enabled | column_parse::inferred); - + auto const unique_use_cols_indexes = std::set(reader_opts.get_use_cols_indexes().cbegin(), + 
reader_opts.get_use_cols_indexes().cend()); + + auto const detected_column_names = + get_column_names(header, parse_opts.view(), reader_opts.get_header(), reader_opts.get_prefix()); + auto const opts_have_all_col_names = + not reader_opts.get_names().empty() and + ( + // no data to detect (the number of) columns + detected_column_names.empty() or + // number of user specified names matches what is detected + reader_opts.get_names().size() == detected_column_names.size() or + // Columns are not selected by indices; read first reader_opts.get_names().size() columns + unique_use_cols_indexes.empty()); + auto column_names = opts_have_all_col_names ? reader_opts.get_names() : detected_column_names; + + auto const num_actual_columns = static_cast(column_names.size()); + auto num_active_columns = num_actual_columns; + auto column_flags = std::vector( + num_actual_columns, column_parse::enabled | column_parse::inferred); + + // User did not pass column names to override names in the file + // Process names from the file to remove empty and duplicated strings + if (not opts_have_all_col_names) { std::vector col_loop_order(column_names.size()); auto unnamed_it = std::copy_if( thrust::make_counting_iterator(0), thrust::make_counting_iterator(column_names.size()), col_loop_order.begin(), [&column_names](auto col_idx) -> bool { return not column_names[col_idx].empty(); }); + // Rename empty column names to "Unnamed: col_index" std::copy_if(thrust::make_counting_iterator(0), thrust::make_counting_iterator(column_names.size()), @@ -759,24 +761,44 @@ table_with_metadata read_csv(cudf::io::datasource* source, } // User can specify which columns should be parsed - if (!reader_opts.get_use_cols_indexes().empty() || !reader_opts.get_use_cols_names().empty()) { + auto const unique_use_cols_names = std::unordered_set(reader_opts.get_use_cols_names().cbegin(), + reader_opts.get_use_cols_names().cend()); + auto const is_column_selection_used = + not unique_use_cols_names.empty() or not 
unique_use_cols_indexes.empty(); + + // Reset flags and output column count; columns will be reactivated based on the selection options + if (is_column_selection_used) { std::fill(column_flags.begin(), column_flags.end(), column_parse::disabled); + num_active_columns = 0; + } - for (const auto index : reader_opts.get_use_cols_indexes()) { + // Column selection via column indexes + if (not unique_use_cols_indexes.empty()) { + // Users can pass names for the selected columns only, if selecting column by their indices + auto const are_opts_col_names_used = + not reader_opts.get_names().empty() and not opts_have_all_col_names; + CUDF_EXPECTS(not are_opts_col_names_used or + reader_opts.get_names().size() == unique_use_cols_indexes.size(), + "Specify names of all columns in the file, or names of all selected columns"); + + for (auto const index : unique_use_cols_indexes) { column_flags[index] = column_parse::enabled | column_parse::inferred; + if (are_opts_col_names_used) { + column_names[index] = reader_opts.get_names()[num_active_columns]; + } + ++num_active_columns; } - num_active_columns = std::unordered_set(reader_opts.get_use_cols_indexes().begin(), - reader_opts.get_use_cols_indexes().end()) - .size(); + } - for (const auto& name : reader_opts.get_use_cols_names()) { - const auto it = std::find(column_names.begin(), column_names.end(), name); - if (it != column_names.end()) { - auto curr_it = it - column_names.begin(); - if (column_flags[curr_it] == column_parse::disabled) { - column_flags[curr_it] = column_parse::enabled | column_parse::inferred; - num_active_columns++; - } + // Column selection via column names + if (not unique_use_cols_names.empty()) { + for (auto const& name : unique_use_cols_names) { + auto const it = std::find(column_names.cbegin(), column_names.cend(), name); + CUDF_EXPECTS(it != column_names.end(), "Nonexistent column selected"); + auto const col_idx = std::distance(column_names.cbegin(), it); + if (column_flags[col_idx] == 
column_parse::disabled) { + column_flags[col_idx] = column_parse::enabled | column_parse::inferred; + ++num_active_columns; } } } @@ -813,6 +835,8 @@ table_with_metadata read_csv(cudf::io::datasource* source, // Return empty table rather than exception if nothing to load if (num_active_columns == 0) { return {std::make_unique
(), {}}; } + // Exclude the end-of-data row from number of rows with actual data + auto const num_records = std::max(row_offsets.size(), 1ul) - 1; auto const column_types = determine_column_types( reader_opts, parse_opts, column_names, data, row_offsets, num_records, column_flags, stream); diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index 2fae7b4c75a..ed2f412f291 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -279,21 +279,21 @@ struct column_to_strings_fn { // void write_chunked_begin(data_sink* out_sink, table_view const& table, - table_metadata const* metadata, + host_span user_column_names, csv_writer_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { if (options.is_enabled_include_header()) { - // need to generate column names if metadata is not provided + // need to generate column names if names are not provided std::vector generated_col_names; - if (metadata == nullptr) { + if (user_column_names.empty()) { generated_col_names.resize(table.num_columns()); thrust::tabulate(generated_col_names.begin(), generated_col_names.end(), [](auto idx) { return std::to_string(idx); }); } - auto const& column_names = (metadata == nullptr) ? generated_col_names : metadata->column_names; + auto const& column_names = user_column_names.empty() ? 
generated_col_names : user_column_names; CUDF_EXPECTS(column_names.size() == static_cast(table.num_columns()), "Mismatch between number of column headers and table columns."); @@ -346,7 +346,6 @@ void write_chunked_begin(data_sink* out_sink, void write_chunked(data_sink* out_sink, strings_column_view const& str_column_view, - table_metadata const* metadata, csv_writer_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -365,8 +364,11 @@ void write_chunked(data_sink* out_sink, CUDF_EXPECTS(str_column_view.size() > 0, "Unexpected empty strings column."); cudf::string_scalar newline{options.get_line_terminator()}; - auto p_str_col_w_nl = - cudf::strings::detail::join_strings(str_column_view, newline, string_scalar("", false), stream); + auto p_str_col_w_nl = cudf::strings::detail::join_strings(str_column_view, + newline, + string_scalar("", false), + stream, + rmm::mr::get_current_device_resource()); strings_column_view strings_column{p_str_col_w_nl->view()}; auto total_num_bytes = strings_column.chars_size(); @@ -399,7 +401,7 @@ void write_chunked(data_sink* out_sink, void write_csv(data_sink* out_sink, table_view const& table, - table_metadata const* metadata, + host_span user_column_names, csv_writer_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -407,7 +409,7 @@ void write_csv(data_sink* out_sink, // write header: column names separated by delimiter: // (even for tables with no rows) // - write_chunked_begin(out_sink, table, metadata, options, stream, mr); + write_chunked_begin(out_sink, table, user_column_names, options, stream, mr); if (table.num_rows() > 0) { // no need to check same-size columns constraint; auto-enforced by table_view @@ -471,12 +473,14 @@ void write_csv(data_sink* out_sink, delimiter_str, options.get_na_rep(), strings::separator_on_nulls::YES, - stream); + stream, + rmm::mr::get_current_device_resource()); cudf::string_scalar narep{options.get_na_rep()}; 
- return cudf::strings::detail::replace_nulls(str_table_view.column(0), narep, stream); + return cudf::strings::detail::replace_nulls( + str_table_view.column(0), narep, stream, rmm::mr::get_current_device_resource()); }(); - write_chunked(out_sink, str_concat_col->view(), metadata, options, stream, mr); + write_chunked(out_sink, str_concat_col->view(), options, stream, mr); } } } diff --git a/cpp/src/io/fst/logical_stack.cuh b/cpp/src/io/fst/logical_stack.cuh index 9502922a379..b23a3d756df 100644 --- a/cpp/src/io/fst/logical_stack.cuh +++ b/cpp/src/io/fst/logical_stack.cuh @@ -267,7 +267,7 @@ void sparse_stack_op_to_top_of_stack(StackSymbolItT d_symbols, StackSymbolT const empty_stack_symbol, StackSymbolT const read_symbol, std::size_t const num_symbols_out, - rmm::cuda_stream_view stream = cudf::default_stream_value) + rmm::cuda_stream_view stream = cudf::get_default_stream()) { rmm::device_buffer temp_storage{}; diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index aabaa941daf..1a5a43d2b90 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -34,6 +34,8 @@ #include #include +#include + namespace cudf { namespace io { // Returns builder for csv_reader_options @@ -156,7 +158,7 @@ table_with_metadata read_avro(avro_reader_options const& options, CUDF_EXPECTS(datasources.size() == 1, "Only a single source is currently supported."); - return avro::read_avro(std::move(datasources[0]), options, cudf::default_stream_value, mr); + return avro::read_avro(std::move(datasources[0]), options, cudf::get_default_stream(), mr); } compression_type infer_compression_type(compression_type compression, source_info const& info) @@ -198,7 +200,7 @@ table_with_metadata read_json(json_reader_options options, rmm::mr::device_memor options.get_byte_range_offset(), options.get_byte_range_size_with_padding()); - return detail::json::read_json(datasources, options, cudf::default_stream_value, mr); + return detail::json::read_json(datasources, options, 
cudf::get_default_stream(), mr); } table_with_metadata read_csv(csv_reader_options options, rmm::mr::device_memory_resource* mr) @@ -216,7 +218,7 @@ table_with_metadata read_csv(csv_reader_options options, rmm::mr::device_memory_ return cudf::io::detail::csv::read_csv( // std::move(datasources[0]), options, - cudf::default_stream_value, + cudf::get_default_stream(), mr); } @@ -231,9 +233,9 @@ void write_csv(csv_writer_options const& options, rmm::mr::device_memory_resourc return csv::write_csv( // sinks[0].get(), options.get_table(), - options.get_metadata(), + options.get_names(), options, - cudf::default_stream_value, + cudf::get_default_stream(), mr); } @@ -241,7 +243,7 @@ namespace detail_orc = cudf::io::detail::orc; raw_orc_statistics read_raw_orc_statistics(source_info const& src_info) { - auto stream = cudf::default_stream_value; + auto stream = cudf::get_default_stream(); // Get source to read statistics from std::unique_ptr source; if (src_info.type() == io_type::FILEPATH) { @@ -337,6 +339,40 @@ parsed_orc_statistics read_parsed_orc_statistics(source_info const& src_info) return result; } +namespace { +orc_column_schema make_orc_column_schema(host_span orc_schema, + uint32_t column_id, + std::string column_name) +{ + auto const& orc_col_schema = orc_schema[column_id]; + std::vector children; + children.reserve(orc_col_schema.subtypes.size()); + std::transform( + orc_col_schema.subtypes.cbegin(), + orc_col_schema.subtypes.cend(), + cudf::detail::make_counting_transform_iterator(0, + [&names = orc_col_schema.fieldNames](size_t i) { + return i < names.size() ? 
names[i] + : std::string{}; + }), + std::back_inserter(children), + [&](auto& type, auto name) { return make_orc_column_schema(orc_schema, type, name); }); + + return {std::move(column_name), orc_schema[column_id].kind, std::move(children)}; +} +}; // namespace + +orc_metadata read_orc_metadata(source_info const& src_info) +{ + auto sources = make_datasources(src_info); + + CUDF_EXPECTS(sources.size() == 1, "Only a single source is currently supported."); + auto const footer = orc::metadata(sources.front().get(), cudf::detail::default_stream_value).ff; + + return {{make_orc_column_schema(footer.types, 0, "")}, + static_cast(footer.numberOfRows), + static_cast(footer.stripes.size())}; +} /** * @copydoc cudf::io::read_orc @@ -347,9 +383,9 @@ table_with_metadata read_orc(orc_reader_options const& options, rmm::mr::device_ auto datasources = make_datasources(options.get_source()); auto reader = std::make_unique( - std::move(datasources), options, cudf::default_stream_value, mr); + std::move(datasources), options, cudf::get_default_stream(), mr); - return reader->read(options); + return reader->read(options, cudf::get_default_stream()); } /** @@ -365,7 +401,7 @@ void write_orc(orc_writer_options const& options, rmm::mr::device_memory_resourc CUDF_EXPECTS(sinks.size() == 1, "Multiple sinks not supported for ORC writing"); auto writer = std::make_unique( - std::move(sinks[0]), options, io_detail::SingleWriteMode::YES, cudf::default_stream_value, mr); + std::move(sinks[0]), options, io_detail::SingleWriteMode::YES, cudf::get_default_stream(), mr); writer->write(options.get_table()); } @@ -382,7 +418,7 @@ orc_chunked_writer::orc_chunked_writer(chunked_orc_writer_options const& options CUDF_EXPECTS(sinks.size() == 1, "Multiple sinks not supported for ORC writing"); writer = std::make_unique( - std::move(sinks[0]), options, io_detail::SingleWriteMode::NO, cudf::default_stream_value, mr); + std::move(sinks[0]), options, io_detail::SingleWriteMode::NO, 
cudf::get_default_stream(), mr); } /** @@ -417,7 +453,7 @@ table_with_metadata read_parquet(parquet_reader_options const& options, auto datasources = make_datasources(options.get_source()); auto reader = std::make_unique( - std::move(datasources), options, cudf::default_stream_value, mr); + std::move(datasources), options, cudf::get_default_stream(), mr); return reader->read(options); } @@ -458,13 +494,52 @@ std::unique_ptr> write_parquet(parquet_writer_options const auto sinks = make_datasinks(options.get_sink()); auto writer = std::make_unique( - std::move(sinks), options, io_detail::SingleWriteMode::YES, cudf::default_stream_value, mr); + std::move(sinks), options, io_detail::SingleWriteMode::YES, cudf::get_default_stream(), mr); writer->write(options.get_table(), options.get_partitions()); return writer->close(options.get_column_chunks_file_paths()); } +/** + * @copydoc cudf::io::chunked_parquet_reader::chunked_parquet_reader + */ +chunked_parquet_reader::chunked_parquet_reader(std::size_t chunk_read_limit, + parquet_reader_options const& options, + rmm::mr::device_memory_resource* mr) + : reader{std::make_unique(chunk_read_limit, + make_datasources(options.get_source()), + options, + cudf::get_default_stream(), + mr)} +{ +} + +/** + * @copydoc cudf::io::chunked_parquet_reader::~chunked_parquet_reader + */ +chunked_parquet_reader::~chunked_parquet_reader() = default; + +/** + * @copydoc cudf::io::chunked_parquet_reader::has_next + */ +bool chunked_parquet_reader::has_next() const +{ + CUDF_FUNC_RANGE(); + CUDF_EXPECTS(reader != nullptr, "Reader has not been constructed properly."); + return reader->has_next(); +} + +/** + * @copydoc cudf::io::chunked_parquet_reader::read_chunk + */ +table_with_metadata chunked_parquet_reader::read_chunk() const +{ + CUDF_FUNC_RANGE(); + CUDF_EXPECTS(reader != nullptr, "Reader has not been constructed properly."); + return reader->read_chunk(); +} + /** * @copydoc cudf::io::parquet_chunked_writer::parquet_chunked_writer */ @@ 
-476,7 +551,7 @@ parquet_chunked_writer::parquet_chunked_writer(chunked_parquet_writer_options co auto sinks = make_datasinks(options.get_sink()); writer = std::make_unique( - std::move(sinks), options, io_detail::SingleWriteMode::NO, cudf::default_stream_value, mr); + std::move(sinks), options, io_detail::SingleWriteMode::NO, cudf::get_default_stream(), mr); } /** diff --git a/cpp/src/io/json/experimental/byte_range_info.cu b/cpp/src/io/json/experimental/byte_range_info.cu new file mode 100644 index 00000000000..d6e30d090a5 --- /dev/null +++ b/cpp/src/io/json/experimental/byte_range_info.cu @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include + +namespace cudf::io::detail::json::experimental { + +// Extract the first character position in the string. +size_type find_first_delimiter(device_span d_data, + char const delimiter, + rmm::cuda_stream_view stream) +{ + auto const first_delimiter_position = + thrust::find(rmm::exec_policy(stream), d_data.begin(), d_data.end(), delimiter); + return first_delimiter_position != d_data.end() ? 
first_delimiter_position - d_data.begin() : -1; +} + +} // namespace cudf::io::detail::json::experimental diff --git a/cpp/src/io/json/experimental/read_json.cpp b/cpp/src/io/json/experimental/read_json.cpp index c0eaa43e68f..87d196131ca 100644 --- a/cpp/src/io/json/experimental/read_json.cpp +++ b/cpp/src/io/json/experimental/read_json.cpp @@ -19,27 +19,135 @@ #include #include +#include #include #include namespace cudf::io::detail::json::experimental { -std::vector ingest_raw_input(host_span> sources, - compression_type compression) +size_t sources_size(host_span> const sources, + size_t range_offset, + size_t range_size) { - auto const total_source_size = - std::accumulate(sources.begin(), sources.end(), 0ul, [](size_t sum, auto& source) { - return sum + source->size(); - }); - auto buffer = std::vector(total_source_size); + return std::accumulate(sources.begin(), sources.end(), 0ul, [=](size_t sum, auto& source) { + auto const size = source->size(); + // TODO take care of 0, 0, or *, 0 case. + return sum + + (range_size == 0 or range_offset + range_size > size ? size - range_offset : range_size); + }); +} + +std::vector ingest_raw_input(host_span> const& sources, + compression_type compression, + size_t range_offset, + size_t range_size) +{ + CUDF_FUNC_RANGE(); + // Iterate through the user defined sources and read the contents into the local buffer + auto const total_source_size = sources_size(sources, range_offset, range_size); + auto buffer = std::vector(total_source_size); size_t bytes_read = 0; for (const auto& source : sources) { - bytes_read += source->host_read(0, source->size(), buffer.data() + bytes_read); + if (!source->is_empty()) { + auto data_size = (range_size != 0) ? range_size : source->size(); + auto destination = buffer.data() + bytes_read; + bytes_read += source->host_read(range_offset, data_size, destination); + } } - return (compression == compression_type::NONE) ? 
buffer : decompress(compression, buffer); + if (compression == compression_type::NONE) { + return buffer; + } else { + return decompress(compression, buffer); + } +} + +size_type find_first_delimiter_in_chunk(host_span> sources, + json_reader_options const& reader_opts, + char const delimiter, + rmm::cuda_stream_view stream) +{ + auto const buffer = ingest_raw_input(sources, + reader_opts.get_compression(), + reader_opts.get_byte_range_offset(), + reader_opts.get_byte_range_size()); + auto d_data = rmm::device_uvector(buffer.size(), stream); + CUDF_CUDA_TRY(cudaMemcpyAsync(d_data.data(), + buffer.data(), + buffer.size() * sizeof(decltype(buffer)::value_type), + cudaMemcpyHostToDevice, + stream.value())); + return find_first_delimiter(d_data, delimiter, stream); +} + +size_type find_first_delimiter_in_chunk(host_span buffer, + char const delimiter, + rmm::cuda_stream_view stream) +{ + auto d_data = rmm::device_uvector(buffer.size(), stream); + CUDF_CUDA_TRY(cudaMemcpyAsync(d_data.data(), + buffer.data(), + buffer.size() * sizeof(decltype(buffer)::value_type), + cudaMemcpyHostToDevice, + stream.value())); + return find_first_delimiter(d_data, delimiter, stream); +} + +bool should_load_whole_source(json_reader_options const& reader_opts) +{ + return reader_opts.get_byte_range_offset() == 0 and // + reader_opts.get_byte_range_size() == 0; +} + +/** + * @brief Get the byte range between record starts and ends starting from the given range. + * + * if get_byte_range_offset == 0, then we can skip the first delimiter search + * if get_byte_range_offset != 0, then we need to search for the first delimiter in given range. + * if not found, skip this chunk, if found, then search for first delimiter in next range until we + * find a delimiter. Use this as actual range for parsing. 
+ * + * @param sources Data sources to read from + * @param reader_opts JSON reader options with range offset and range size + * @param stream CUDA stream used for device memory operations and kernel launches + * @return Byte range for parsing + */ +auto get_record_range_raw_input(host_span> sources, + json_reader_options const& reader_opts, + rmm::cuda_stream_view stream) +{ + auto buffer = ingest_raw_input(sources, + reader_opts.get_compression(), + reader_opts.get_byte_range_offset(), + reader_opts.get_byte_range_size()); + if (should_load_whole_source(reader_opts)) return buffer; + auto first_delim_pos = reader_opts.get_byte_range_offset() == 0 + ? 0 + : find_first_delimiter_in_chunk(buffer, '\n', stream); + if (first_delim_pos == -1) { + return std::vector{}; + } else { + first_delim_pos = first_delim_pos + reader_opts.get_byte_range_offset(); + // Find next delimiter + decltype(first_delim_pos) next_delim_pos = -1; + auto const total_source_size = sources_size(sources, 0, 0); + auto current_offset = reader_opts.get_byte_range_offset() + reader_opts.get_byte_range_size(); + while (current_offset < total_source_size and next_delim_pos == -1) { + buffer = ingest_raw_input( + sources, reader_opts.get_compression(), current_offset, reader_opts.get_byte_range_size()); + next_delim_pos = find_first_delimiter_in_chunk(buffer, '\n', stream); + if (next_delim_pos == -1) { current_offset += reader_opts.get_byte_range_size(); } + } + if (next_delim_pos == -1) { + next_delim_pos = total_source_size; + } else { + next_delim_pos = next_delim_pos + current_offset; + } + return ingest_raw_input( + sources, reader_opts.get_compression(), first_delim_pos, next_delim_pos - first_delim_pos); + } } table_with_metadata read_json(host_span> sources, @@ -47,10 +155,14 @@ table_with_metadata read_json(host_span> sources, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - CUDF_EXPECTS(reader_opts.get_byte_range_offset() == 0 and reader_opts.get_byte_range_size() == 
0, - "specifying a byte range is not yet supported"); + CUDF_FUNC_RANGE(); + if (not should_load_whole_source(reader_opts)) { + CUDF_EXPECTS(reader_opts.is_enabled_lines(), + "specifying a byte range is supported only for json lines"); + } + + auto const buffer = get_record_range_raw_input(sources, reader_opts, stream); - auto const buffer = ingest_raw_input(sources, reader_opts.get_compression()); auto data = host_span(reinterpret_cast(buffer.data()), buffer.size()); try { diff --git a/cpp/src/io/json/experimental/read_json.hpp b/cpp/src/io/json/experimental/read_json.hpp index c9f74b2cc41..48e104c4254 100644 --- a/cpp/src/io/json/experimental/read_json.hpp +++ b/cpp/src/io/json/experimental/read_json.hpp @@ -33,4 +33,13 @@ table_with_metadata read_json(host_span> sources, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); -} +size_type find_first_delimiter(device_span d_data, + char const delimiter, + rmm::cuda_stream_view stream); + +size_type find_first_delimiter_in_chunk(host_span> sources, + json_reader_options const& reader_opts, + char const delimiter, + rmm::cuda_stream_view stream); + +} // namespace cudf::io::detail::json::experimental diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index d54bb5c8ea9..0ac3efb407e 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -73,7 +73,7 @@ auto print_vec = [](auto const& cpu, auto const name, auto converter) { void print_tree(host_span input, tree_meta_t const& d_gpu_tree, - rmm::cuda_stream_view stream = cudf::default_stream_value) + rmm::cuda_stream_view stream) { print_vec(cudf::detail::make_std_vector_async(d_gpu_tree.node_categories, stream), "node_categories", @@ -278,11 +278,11 @@ std::vector copy_strings_to_host(device_span input, auto const scv = cudf::strings_column_view(col); auto const h_chars = cudf::detail::make_std_vector_sync( cudf::device_span(scv.chars().data(), scv.chars().size()), - cudf::default_stream_value); + 
cudf::get_default_stream()); auto const h_offsets = cudf::detail::make_std_vector_sync( cudf::device_span( scv.offsets().data() + scv.offset(), scv.size() + 1), - cudf::default_stream_value); + cudf::get_default_stream()); // build std::string vector from chars and offsets std::vector host_data; @@ -403,7 +403,7 @@ void make_device_json_column(device_span input, std::string name = ""; auto parent_col_id = column_parent_ids[this_col_id]; if (parent_col_id == parent_node_sentinel || column_categories[parent_col_id] == NC_LIST) { - name = "element"; + name = list_child_name; } else if (column_categories[parent_col_id] == NC_FN) { auto field_name_col_id = parent_col_id; parent_col_id = column_parent_ids[parent_col_id]; @@ -525,14 +525,15 @@ void make_device_json_column(device_span input, auto parent_node_id = ordered_parent_node_ids[i]; if (parent_node_id != parent_node_sentinel and node_categories[parent_node_id] == NC_LIST) { // unique item - if (i == 0 || + if (i == 0 or (col_ids[i - 1] != col_ids[i] or ordered_parent_node_ids[i - 1] != parent_node_id)) { // scatter to list_offset d_columns_data[original_col_ids[parent_node_id]] .child_offsets[row_offsets[parent_node_id]] = ordered_row_offsets[i]; } // TODO: verify if this code is right. check with more test cases. - if (i == num_nodes - 1 || (col_ids[i] != col_ids[i + 1])) { + if (i == num_nodes - 1 or + (col_ids[i] != col_ids[i + 1] or ordered_parent_node_ids[i + 1] != parent_node_id)) { // last value of list child_offset is its size. d_columns_data[original_col_ids[parent_node_id]] .child_offsets[row_offsets[parent_node_id] + 1] = ordered_row_offsets[i] + 1; @@ -689,19 +690,24 @@ std::pair, std::vector> device_json_co size_type num_rows = json_col.child_offsets.size() - 1; std::vector column_names{}; column_names.emplace_back("offsets"); - column_names.emplace_back(json_col.child_columns.begin()->first); + column_names.emplace_back( + json_col.child_columns.empty() ? 
list_child_name : json_col.child_columns.begin()->first); // Note: json_col modified here, reuse the memory auto offsets_column = std::make_unique( data_type{type_id::INT32}, num_rows + 1, json_col.child_offsets.release()); // Create children column auto [child_column, names] = - device_json_column_to_cudf_column(json_col.child_columns.begin()->second, - d_input, - options, - get_child_schema(json_col.child_columns.begin()->first), - stream, - mr); + json_col.child_columns.empty() + ? std::pair, + std::vector>{std::make_unique(), {}} + : device_json_column_to_cudf_column( + json_col.child_columns.begin()->second, + d_input, + options, + get_child_schema(json_col.child_columns.begin()->first), + stream, + mr); column_names.back().children = names; auto [result_bitmask, null_count] = make_validity(json_col); return {make_lists_column(num_rows, @@ -717,16 +723,13 @@ std::pair, std::vector> device_json_co } } -table_with_metadata device_parse_nested_json(host_span input, +table_with_metadata device_parse_nested_json(device_span d_input, cudf::io::json_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - // Allocate device memory for the JSON input & copy over to device - rmm::device_uvector d_input = cudf::detail::make_device_uvector_async(input, stream); - auto gpu_tree = [&]() { // Parse the JSON and get the token stream const auto [tokens_gpu, token_indices_gpu] = get_token_stream(d_input, options, stream); @@ -734,7 +737,8 @@ table_with_metadata device_parse_nested_json(host_span input, return get_tree_representation(tokens_gpu, token_indices_gpu, stream); }(); // IILE used to free memory of token data. 
#ifdef NJP_DEBUG_PRINT - print_tree(input, gpu_tree, stream); + auto h_input = cudf::detail::make_host_vector_async(d_input, stream); + print_tree(h_input, gpu_tree, stream); #endif auto [gpu_col_id, gpu_row_offsets] = records_orient_tree_traversal(d_input, gpu_tree, stream); @@ -836,5 +840,17 @@ table_with_metadata device_parse_nested_json(host_span input, {{}, out_column_names}}; } +table_with_metadata device_parse_nested_json(host_span input, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + + // Allocate device memory for the JSON input & copy over to device + rmm::device_uvector d_input = cudf::detail::make_device_uvector_async(input, stream); + + return device_parse_nested_json(device_span{d_input}, options, stream, mr); +} } // namespace detail } // namespace cudf::io::json diff --git a/cpp/src/io/json/json_gpu.cu b/cpp/src/io/json/json_gpu.cu index dbfcca7d37a..8b6c0f9d528 100644 --- a/cpp/src/io/json/json_gpu.cu +++ b/cpp/src/io/json/json_gpu.cu @@ -156,6 +156,7 @@ struct field_descriptor { cudf::size_type column; char const* value_begin; char const* value_end; + bool is_quoted; }; /** @@ -178,7 +179,10 @@ __device__ field_descriptor next_field_descriptor(const char* begin, auto const desc_pre_trim = col_map.capacity() == 0 // No key - column and begin are trivial - ? field_descriptor{field_idx, begin, cudf::io::gpu::seek_field_end(begin, end, opts, true)} + ? 
field_descriptor{field_idx, + begin, + cudf::io::gpu::seek_field_end(begin, end, opts, true), + false} : [&]() { auto const key_range = get_next_key(begin, end, opts.quotechar); auto const key_hash = cudf::detail::MurmurHash3_32{}( @@ -189,14 +193,23 @@ __device__ field_descriptor next_field_descriptor(const char* begin, // Skip the colon between the key and the value auto const value_begin = thrust::find(thrust::seq, key_range.second, end, ':') + 1; - return field_descriptor{ - column, value_begin, cudf::io::gpu::seek_field_end(value_begin, end, opts, true)}; + return field_descriptor{column, + value_begin, + cudf::io::gpu::seek_field_end(value_begin, end, opts, true), + false}; }(); // Modify start & end to ignore whitespace and quotechars auto const trimmed_value_range = - trim_whitespaces_quotes(desc_pre_trim.value_begin, desc_pre_trim.value_end, opts.quotechar); - return {desc_pre_trim.column, trimmed_value_range.first, trimmed_value_range.second}; + trim_whitespaces(desc_pre_trim.value_begin, desc_pre_trim.value_end); + bool const is_quoted = + thrust::distance(trimmed_value_range.first, trimmed_value_range.second) >= 2 and + *trimmed_value_range.first == opts.quotechar and + *thrust::prev(trimmed_value_range.second) == opts.quotechar; + return {desc_pre_trim.column, + trimmed_value_range.first + static_cast(is_quoted), + trimmed_value_range.second - static_cast(is_quoted), + is_quoted}; } /** @@ -255,13 +268,14 @@ __global__ void convert_data_to_columns_kernel(parse_options_view opts, auto const desc = next_field_descriptor(current, row_data_range.second, opts, input_field_index, col_map); auto const value_len = static_cast(std::max(desc.value_end - desc.value_begin, 0L)); + auto const is_quoted = static_cast(desc.is_quoted); current = desc.value_end + 1; using string_index_pair = thrust::pair; - // Empty fields are not legal values - if (!serialized_trie_contains(opts.trie_na, {desc.value_begin, value_len})) { + if (!serialized_trie_contains(opts.trie_na, + 
{desc.value_begin - is_quoted, value_len + is_quoted * 2})) { // Type dispatcher does not handle strings if (column_types[desc.column].id() == type_id::STRING) { auto str_list = static_cast(output_columns[desc.column]); @@ -345,7 +359,7 @@ __global__ void detect_data_types_kernel( atomicAdd(&column_infos[desc.column].null_count, -1); } // Don't need counts to detect strings, any field in quotes is deduced to be a string - if (*(desc.value_begin - 1) == opts.quotechar && *desc.value_end == opts.quotechar) { + if (desc.is_quoted) { atomicAdd(&column_infos[desc.column].string_count, 1); continue; } diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index dbf026c351e..50755724c51 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -29,19 +29,25 @@ #include +#include + #include #include #include #include +#include #include #include #include #include #include +#include +#include #include #include #include +#include #include #include #include @@ -79,7 +85,7 @@ struct node_ranges { __device__ auto operator()(size_type i) -> thrust::tuple { // Whether a token expects to be followed by its respective end-of-* token partner - auto is_begin_of_section = [] __device__(PdaTokenT const token) { + auto const is_begin_of_section = [] __device__(PdaTokenT const token) { switch (token) { case token_t::StringBegin: case token_t::ValueBegin: @@ -88,7 +94,7 @@ struct node_ranges { }; }; // The end-of-* partner token for a given beginning-of-* token - auto end_of_partner = [] __device__(PdaTokenT const token) { + auto const end_of_partner = [] __device__(PdaTokenT const token) { switch (token) { case token_t::StringBegin: return token_t::StringEnd; case token_t::ValueBegin: return token_t::ValueEnd; @@ -98,8 +104,8 @@ struct node_ranges { }; // Includes quote char for end-of-string token or Skips the quote char for // beginning-of-field-name token - auto get_token_index = [include_quote_char = include_quote_char] __device__( - PdaTokenT 
const token, SymbolOffsetT const token_index) { + auto const get_token_index = [include_quote_char = include_quote_char] __device__( + PdaTokenT const token, SymbolOffsetT const token_index) { constexpr SymbolOffsetT quote_char_size = 1; switch (token) { // Strip off quote char included for StringBegin @@ -125,6 +131,81 @@ struct node_ranges { } }; +/** + * @brief Returns stable sorted keys and its sorted order + * + * Uses cub stable radix sort. The order is internally generated, hence it saves a copy and memory. + * Since the key and order is returned, using double buffer helps to avoid extra copy to user + * provided output iterator. + * + * @tparam IndexType sorted order type + * @tparam KeyType key type + * @param keys keys to sort + * @param stream CUDA stream used for device memory operations and kernel launches. + * @return Sorted keys and indices producing that sorted order + */ +template +std::pair, rmm::device_uvector> stable_sorted_key_order( + cudf::device_span keys, rmm::cuda_stream_view stream) +{ + CUDF_FUNC_RANGE(); + + // Determine temporary device storage requirements + rmm::device_uvector keys_buffer1(keys.size(), stream); + rmm::device_uvector keys_buffer2(keys.size(), stream); + rmm::device_uvector order_buffer1(keys.size(), stream); + rmm::device_uvector order_buffer2(keys.size(), stream); + cub::DoubleBuffer order_buffer(order_buffer1.data(), order_buffer2.data()); + cub::DoubleBuffer keys_buffer(keys_buffer1.data(), keys_buffer2.data()); + size_t temp_storage_bytes = 0; + cub::DeviceRadixSort::SortPairs( + nullptr, temp_storage_bytes, keys_buffer, order_buffer, keys.size()); + rmm::device_buffer d_temp_storage(temp_storage_bytes, stream); + + thrust::copy(rmm::exec_policy(stream), keys.begin(), keys.end(), keys_buffer1.begin()); + thrust::sequence(rmm::exec_policy(stream), order_buffer1.begin(), order_buffer1.end()); + + cub::DeviceRadixSort::SortPairs(d_temp_storage.data(), + temp_storage_bytes, + keys_buffer, + order_buffer, + 
keys.size(), + 0, + sizeof(KeyType) * 8, + stream.value()); + + return std::pair{keys_buffer.Current() == keys_buffer1.data() ? std::move(keys_buffer1) + : std::move(keys_buffer2), + order_buffer.Current() == order_buffer1.data() ? std::move(order_buffer1) + : std::move(order_buffer2)}; +} + +/** + * @brief Propagate parent node to siblings from first sibling. + * + * @param node_levels Node levels of each node + * @param parent_node_ids parent node ids initialized for first child of each push node, + * and other siblings are initialized to -1. + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +void propagate_parent_to_siblings(cudf::device_span node_levels, + cudf::device_span parent_node_ids, + rmm::cuda_stream_view stream) +{ + CUDF_FUNC_RANGE(); + auto [sorted_node_levels, sorted_order] = stable_sorted_key_order(node_levels, stream); + // instead of gather, using permutation_iterator, which is ~17% faster + + thrust::inclusive_scan_by_key( + rmm::exec_policy(stream), + sorted_node_levels.begin(), + sorted_node_levels.end(), + thrust::make_permutation_iterator(parent_node_ids.begin(), sorted_order.begin()), + thrust::make_permutation_iterator(parent_node_ids.begin(), sorted_order.begin()), + thrust::equal_to{}, + thrust::maximum{}); +} + // Generates a tree representation of the given tokens, token_indices. 
tree_meta_t get_tree_representation(device_span tokens, device_span token_indices, @@ -133,7 +214,7 @@ tree_meta_t get_tree_representation(device_span tokens, { CUDF_FUNC_RANGE(); // Whether a token does represent a node in the tree representation - auto is_node = [] __device__(PdaTokenT const token) -> bool { + auto const is_node = [] __device__(PdaTokenT const token) -> bool { switch (token) { case token_t::StructBegin: case token_t::ListBegin: @@ -146,7 +227,7 @@ tree_meta_t get_tree_representation(device_span tokens, }; // Whether the token pops from the parent node stack - auto does_pop = [] __device__(PdaTokenT const token) -> bool { + auto const does_pop = [] __device__(PdaTokenT const token) -> bool { switch (token) { case token_t::StructMemberEnd: case token_t::StructEnd: @@ -156,7 +237,7 @@ tree_meta_t get_tree_representation(device_span tokens, }; // Whether the token pushes onto the parent node stack - auto does_push = [] __device__(PdaTokenT const token) -> bool { + auto const does_push = [] __device__(PdaTokenT const token) -> bool { switch (token) { case token_t::FieldNameBegin: case token_t::StructBegin: @@ -165,55 +246,126 @@ tree_meta_t get_tree_representation(device_span tokens, }; }; - auto num_tokens = tokens.size(); - auto is_node_it = thrust::make_transform_iterator( - tokens.begin(), - [is_node] __device__(auto t) -> size_type { return static_cast(is_node(t)); }); - auto num_nodes = thrust::count_if( - rmm::exec_policy(stream), tokens.begin(), tokens.begin() + num_tokens, is_node); + // Look for ErrorBegin and report the point of error. 
+ if (auto const error_count = + thrust::count(rmm::exec_policy(stream), tokens.begin(), tokens.end(), token_t::ErrorBegin); + error_count > 0) { + auto const error_location = + thrust::find(rmm::exec_policy(stream), tokens.begin(), tokens.end(), token_t::ErrorBegin); + SymbolOffsetT error_index; + CUDF_CUDA_TRY( + cudaMemcpyAsync(&error_index, + token_indices.data() + thrust::distance(tokens.begin(), error_location), + sizeof(SymbolOffsetT), + cudaMemcpyDeviceToHost, + stream.value())); + stream.synchronize(); + CUDF_FAIL("JSON Parser encountered an invalid format at location " + + std::to_string(error_index)); + } + + auto const num_tokens = tokens.size(); + auto const num_nodes = + thrust::count_if(rmm::exec_policy(stream), tokens.begin(), tokens.end(), is_node); + + // Node levels: transform_exclusive_scan, copy_if. + rmm::device_uvector node_levels(num_nodes, stream, mr); + { + rmm::device_uvector token_levels(num_tokens, stream); + auto const push_pop_it = thrust::make_transform_iterator( + tokens.begin(), [does_push, does_pop] __device__(PdaTokenT const token) -> size_type { + return does_push(token) - does_pop(token); + }); + thrust::exclusive_scan( + rmm::exec_policy(stream), push_pop_it, push_pop_it + num_tokens, token_levels.begin()); + + auto const node_levels_end = thrust::copy_if(rmm::exec_policy(stream), + token_levels.begin(), + token_levels.end(), + tokens.begin(), + node_levels.begin(), + is_node); + CUDF_EXPECTS(thrust::distance(node_levels.begin(), node_levels_end) == num_nodes, + "node level count mismatch"); + } + + // Node parent ids: + // previous push node_id transform, stable sort by level, segmented scan with Max, reorder. + rmm::device_uvector parent_node_ids(num_nodes, stream, mr); + // This block of code is generalized logical stack algorithm. TODO: make this a seperate function. 
+ { + rmm::device_uvector node_token_ids(num_nodes, stream); + thrust::copy_if(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + num_tokens, + tokens.begin(), + node_token_ids.begin(), + is_node); + + // previous push node_id + // if previous node is a push, then i-1 + // if previous node is FE, then i-2 (returns FB's index) + // if previous node is SMB and its previous node is a push, then i-2 + // eg. `{ SMB FB FE VB VE SME` -> `{` index as FB's parent. + // else -1 + auto const first_childs_parent_token_id = [tokens_gpu = + tokens.begin()] __device__(auto i) -> NodeIndexT { + if (i <= 0) { return -1; } + if (tokens_gpu[i - 1] == token_t::StructBegin or tokens_gpu[i - 1] == token_t::ListBegin) { + return i - 1; + } else if (tokens_gpu[i - 1] == token_t::FieldNameEnd) { + return i - 2; + } else if (tokens_gpu[i - 1] == token_t::StructMemberBegin and + (tokens_gpu[i - 2] == token_t::StructBegin || + tokens_gpu[i - 2] == token_t::ListBegin)) { + return i - 2; + } else { + return -1; + } + }; + + thrust::transform( + rmm::exec_policy(stream), + node_token_ids.begin(), + node_token_ids.end(), + parent_node_ids.begin(), + [node_ids_gpu = node_token_ids.begin(), num_nodes, first_childs_parent_token_id] __device__( + NodeIndexT const tid) -> NodeIndexT { + auto const pid = first_childs_parent_token_id(tid); + return pid < 0 + ? parent_node_sentinel + : thrust::lower_bound(thrust::seq, node_ids_gpu, node_ids_gpu + num_nodes, pid) - + node_ids_gpu; + // parent_node_sentinel is -1, useful for segmented max operation below + }); + } + // Propagate parent node to siblings from first sibling - inplace. + propagate_parent_to_siblings( + cudf::device_span{node_levels.data(), node_levels.size()}, + parent_node_ids, + stream); // Node categories: copy_if with transform. 
rmm::device_uvector node_categories(num_nodes, stream, mr); - auto node_categories_it = + auto const node_categories_it = thrust::make_transform_output_iterator(node_categories.begin(), token_to_node{}); - auto node_categories_end = thrust::copy_if(rmm::exec_policy(stream), - tokens.begin(), - tokens.begin() + num_tokens, - node_categories_it, - is_node); + auto const node_categories_end = thrust::copy_if( + rmm::exec_policy(stream), tokens.begin(), tokens.end(), node_categories_it, is_node); CUDF_EXPECTS(node_categories_end - node_categories_it == num_nodes, "node category count mismatch"); - // Node levels: transform_exclusive_scan, copy_if. - rmm::device_uvector token_levels(num_tokens, stream); - auto push_pop_it = thrust::make_transform_iterator( - tokens.begin(), [does_push, does_pop] __device__(PdaTokenT const token) -> size_type { - return does_push(token) - does_pop(token); - }); - thrust::exclusive_scan( - rmm::exec_policy(stream), push_pop_it, push_pop_it + num_tokens, token_levels.begin()); - - rmm::device_uvector node_levels(num_nodes, stream, mr); - auto node_levels_end = thrust::copy_if(rmm::exec_policy(stream), - token_levels.begin(), - token_levels.begin() + num_tokens, - tokens.begin(), - node_levels.begin(), - is_node); - CUDF_EXPECTS(node_levels_end - node_levels.begin() == num_nodes, "node level count mismatch"); - // Node ranges: copy_if with transform. 
rmm::device_uvector node_range_begin(num_nodes, stream, mr); rmm::device_uvector node_range_end(num_nodes, stream, mr); - auto node_range_tuple_it = + auto const node_range_tuple_it = thrust::make_zip_iterator(node_range_begin.begin(), node_range_end.begin()); // Whether the tokenizer stage should keep quote characters for string values // If the tokenizer keeps the quote characters, they may be stripped during type casting constexpr bool include_quote_char = true; - auto node_range_out_it = thrust::make_transform_output_iterator( + auto const node_range_out_it = thrust::make_transform_output_iterator( node_range_tuple_it, node_ranges{tokens, token_indices, include_quote_char}); - auto node_range_out_end = + auto const node_range_out_end = thrust::copy_if(rmm::exec_policy(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(0) + num_tokens, @@ -223,69 +375,6 @@ tree_meta_t get_tree_representation(device_span tokens, }); CUDF_EXPECTS(node_range_out_end - node_range_out_it == num_nodes, "node range count mismatch"); - // Node parent ids: previous push token_id transform, stable sort, segmented scan with Max, - // reorder, copy_if. This one is sort of logical stack. But more generalized. - // TODO: make it own function. - rmm::device_uvector parent_token_ids(num_tokens, stream); - rmm::device_uvector initial_order(num_tokens, stream); - // TODO re-write the algorithm to work only on nodes, not tokens. - - thrust::sequence(rmm::exec_policy(stream), initial_order.begin(), initial_order.end()); - thrust::tabulate(rmm::exec_policy(stream), - parent_token_ids.begin(), - parent_token_ids.end(), - [does_push, tokens_gpu = tokens.begin()] __device__(auto i) -> size_type { - return (i > 0) && does_push(tokens_gpu[i - 1]) ? i - 1 : -1; - // -1, not sentinel used here because of max operation below - }); - - auto out_pid = thrust::make_zip_iterator(parent_token_ids.data(), initial_order.data()); - // Uses radix sort for builtin types. 
- thrust::stable_sort_by_key(rmm::exec_policy(stream), - token_levels.data(), - token_levels.data() + token_levels.size(), - out_pid); - - // SegmentedScan Max. - thrust::inclusive_scan_by_key(rmm::exec_policy(stream), - token_levels.data(), - token_levels.data() + token_levels.size(), - parent_token_ids.data(), - parent_token_ids.data(), - thrust::equal_to{}, - thrust::maximum{}); - - // scatter to restore the original order. - { - rmm::device_uvector temp_storage(num_tokens, stream); - thrust::scatter(rmm::exec_policy(stream), - parent_token_ids.begin(), - parent_token_ids.end(), - initial_order.begin(), - temp_storage.begin()); - thrust::copy( - rmm::exec_policy(stream), temp_storage.begin(), temp_storage.end(), parent_token_ids.begin()); - } - - rmm::device_uvector node_ids_gpu(num_tokens, stream); - thrust::exclusive_scan( - rmm::exec_policy(stream), is_node_it, is_node_it + num_tokens, node_ids_gpu.begin()); - - rmm::device_uvector parent_node_ids(num_nodes, stream, mr); - auto parent_node_ids_it = thrust::make_transform_iterator( - parent_token_ids.begin(), - [node_ids_gpu = node_ids_gpu.begin()] __device__(size_type const pid) -> NodeIndexT { - return pid < 0 ? 
parent_node_sentinel : node_ids_gpu[pid]; - }); - auto parent_node_ids_end = thrust::copy_if(rmm::exec_policy(stream), - parent_node_ids_it, - parent_node_ids_it + parent_token_ids.size(), - tokens.begin(), - parent_node_ids.begin(), - is_node); - CUDF_EXPECTS(parent_node_ids_end - parent_node_ids.begin() == num_nodes, - "parent node id gather mismatch"); - return {std::move(node_categories), std::move(parent_node_ids), std::move(node_levels), @@ -312,38 +401,45 @@ rmm::device_uvector hash_node_type_with_field_name(device_span>; using hash_map_type = cuco::static_map; - auto num_nodes = d_tree.node_categories.size(); + + auto const num_nodes = d_tree.node_categories.size(); + auto const num_fields = thrust::count(rmm::exec_policy(stream), + d_tree.node_categories.begin(), + d_tree.node_categories.end(), + node_t::NC_FN); constexpr size_type empty_node_index_sentinel = -1; - hash_map_type key_map{compute_hash_table_size(num_nodes), // TODO reduce oversubscription + hash_map_type key_map{compute_hash_table_size(num_fields, 40), // 40% occupancy in hash map cuco::sentinel::empty_key{empty_node_index_sentinel}, cuco::sentinel::empty_value{empty_node_index_sentinel}, hash_table_allocator_type{default_allocator{}, stream}, stream.value()}; - auto d_hasher = [d_input = d_input.data(), - node_range_begin = d_tree.node_range_begin.data(), - node_range_end = d_tree.node_range_end.data()] __device__(auto node_id) { + auto const d_hasher = [d_input = d_input.data(), + node_range_begin = d_tree.node_range_begin.data(), + node_range_end = d_tree.node_range_end.data()] __device__(auto node_id) { auto const field_name = cudf::string_view(d_input + node_range_begin[node_id], node_range_end[node_id] - node_range_begin[node_id]); return cudf::detail::default_hash{}(field_name); }; - auto d_equal = [d_input = d_input.data(), - node_range_begin = d_tree.node_range_begin.data(), - node_range_end = d_tree.node_range_end.data()] __device__(auto node_id1, - auto node_id2) { + auto const 
d_equal = [d_input = d_input.data(), + node_range_begin = d_tree.node_range_begin.data(), + node_range_end = d_tree.node_range_end.data()] __device__(auto node_id1, + auto node_id2) { auto const field_name1 = cudf::string_view( d_input + node_range_begin[node_id1], node_range_end[node_id1] - node_range_begin[node_id1]); auto const field_name2 = cudf::string_view( d_input + node_range_begin[node_id2], node_range_end[node_id2] - node_range_begin[node_id2]); return field_name1 == field_name2; }; - auto is_field_name_node = [node_categories = d_tree.node_categories.data()] __device__( - auto node_id) { return node_categories[node_id] == node_t::NC_FN; }; // key-value pairs: uses node_id itself as node_type. (unique node_id for a field name due to // hashing) - auto iter = cudf::detail::make_counting_transform_iterator( + auto const iter = cudf::detail::make_counting_transform_iterator( 0, [] __device__(size_type i) { return cuco::make_pair(i, i); }); + auto const is_field_name_node = [node_categories = + d_tree.node_categories.data()] __device__(auto node_id) { + return node_categories[node_id] == node_t::NC_FN; + }; key_map.insert_if(iter, iter + num_nodes, thrust::counting_iterator(0), // stencil @@ -351,9 +447,10 @@ rmm::device_uvector hash_node_type_with_field_name(device_span size_type { - auto it = key_map.find(node_id, d_hasher, d_equal); + auto const it = key_map.find(node_id, d_hasher, d_equal); return (it == key_map.end()) ? size_type{0} : it->second.load(cuda::std::memory_order_relaxed); }; @@ -373,211 +470,225 @@ rmm::device_uvector hash_node_type_with_field_name(device_span translate_sorted_parent_node_indices( - device_span scatter_indices, +// Two level hashing algorithm +// 1. Convert node_category+fieldname to node_type. (passed as argument) +// a. Create a hashmap to hash field name and assign unique node id as values. +// b. Convert the node categories to node types. 
+// Node type is defined as node category enum value if it is not a field node, +// otherwise it is the unique node id assigned by the hashmap (value shifted by #NUM_CATEGORY). +// 2. Set operation on entire path of each node +// a. Create a hash map with hash of {node_level, node_type} of its node and the entire parent +// until root. +// b. While creating hashmap, transform node id to unique node ids that are inserted into the +// hash map. This mimicks set operation with hash map. This unique node ids are set ids. +// c. Return this converted set ids, which are the hash map keys/values, and unique set ids. +std::pair, rmm::device_uvector> hash_node_path( + device_span node_levels, + device_span node_type, device_span parent_node_ids, - rmm::cuda_stream_view stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - auto const num_nodes = scatter_indices.size(); - auto const gather_indices = cudf::detail::scatter_to_gather( - scatter_indices.begin(), scatter_indices.end(), num_nodes, stream); + auto const num_nodes = parent_node_ids.size(); + rmm::device_uvector col_id(num_nodes, stream, mr); - rmm::device_uvector parent_indices(num_nodes, stream); - // gather, except parent sentinels - thrust::transform(rmm::exec_policy(stream), - parent_node_ids.begin(), - parent_node_ids.end(), - parent_indices.begin(), - [gather_indices = gather_indices.data()] __device__(auto parent_node_id) { - return (parent_node_id == parent_node_sentinel) - ? 
parent_node_sentinel - : gather_indices[parent_node_id]; - }); - return parent_indices; -}; + using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor>; + using hash_map_type = + cuco::static_map; + + constexpr size_type empty_node_index_sentinel = -1; + hash_map_type key_map{compute_hash_table_size(num_nodes), // TODO reduce oversubscription + cuco::sentinel::empty_key{empty_node_index_sentinel}, + cuco::sentinel::empty_value{empty_node_index_sentinel}, + cuco::sentinel::erased_key{-2}, + hash_table_allocator_type{default_allocator{}, stream}, + stream.value()}; + // path compression is not used since extra writes make all map operations slow. + auto const d_hasher = [node_level = node_levels.begin(), + node_type = node_type.begin(), + parent_node_ids = parent_node_ids.begin()] __device__(auto node_id) { + auto hash = + cudf::detail::hash_combine(cudf::detail::default_hash{}(node_level[node_id]), + cudf::detail::default_hash{}(node_type[node_id])); + node_id = parent_node_ids[node_id]; + while (node_id != parent_node_sentinel) { + hash = cudf::detail::hash_combine( + hash, cudf::detail::default_hash{}(node_level[node_id])); + hash = cudf::detail::hash_combine( + hash, cudf::detail::default_hash{}(node_type[node_id])); + node_id = parent_node_ids[node_id]; + } + return hash; + }; + + rmm::device_uvector node_hash(num_nodes, stream); + thrust::tabulate(rmm::exec_policy(stream), node_hash.begin(), node_hash.end(), d_hasher); + auto const d_hashed_cache = [node_hash = node_hash.begin()] __device__(auto node_id) { + return node_hash[node_id]; + }; + + auto const d_equal = [node_level = node_levels.begin(), + node_type = node_type.begin(), + parent_node_ids = parent_node_ids.begin(), + d_hashed_cache] __device__(auto node_id1, auto node_id2) { + if (node_id1 == node_id2) return true; + if (d_hashed_cache(node_id1) != d_hashed_cache(node_id2)) return false; + auto const is_equal_level = [node_level, node_type](auto node_id1, auto node_id2) { + if (node_id1 == 
node_id2) return true; + return node_level[node_id1] == node_level[node_id2] and + node_type[node_id1] == node_type[node_id2]; + }; + // if both nodes have same node types at all levels, it will check until it has common parent + // or root. + while (node_id1 != parent_node_sentinel and node_id2 != parent_node_sentinel and + node_id1 != node_id2 and is_equal_level(node_id1, node_id2)) { + node_id1 = parent_node_ids[node_id1]; + node_id2 = parent_node_ids[node_id2]; + } + return node_id1 == node_id2; + }; + + // insert and convert node ids to unique set ids + auto const num_inserted = thrust::count_if( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_nodes), + [d_hashed_cache, + d_equal, + view = key_map.get_device_mutable_view(), + uq_node_id = col_id.begin()] __device__(auto node_id) mutable { + auto it = view.insert_and_find(cuco::make_pair(node_id, node_id), d_hashed_cache, d_equal); + uq_node_id[node_id] = (it.first)->first.load(cuda::std::memory_order_relaxed); + return it.second; + }); + + auto const num_columns = num_inserted; // key_map.get_size() is not updated. + rmm::device_uvector unique_keys(num_columns, stream); + key_map.retrieve_all(unique_keys.begin(), thrust::make_discard_iterator(), stream.value()); + + return {std::move(col_id), std::move(unique_keys)}; +} /** - * @brief Generates column id and parent column id for each node from the node_level sorted inputs + * @brief Generates column id and parent column id for each node * - * 4. Per-Level Processing: Propagate parent node ids for each level. - * For each level, - * a. gather col_id from previous level results. input=col_id, gather_map is parent_indices. - * b. stable sort by {parent_col_id, node_type} - * c. scan sum of unique {parent_col_id, node_type} - * d. scatter the col_id back to stable node_level order (using scatter_indices) + * 1. Generate col_id: + * a. Set operation on entire path of each node, translate each node id to set id. 
+ * (two level hashing) + * b. gather unique set ids. + * c. sort and use binary search to generate column ids. + * d. Translate parent node ids to parent column ids. * - * pre-condition: All input arguments are stable sorted by node_level (stable in node_id order) - * post-condition: Returned column_id, parent_col_id are level sorted. - * @param node_type Unique id to identify node type, field with different name has different id. - * @param parent_indices Parent node indices in the sorted node_level order - * @param d_level_boundaries The boundaries of each level in the sorted node_level order + * All inputs and outputs are in node_id order. + * @param d_input JSON string in device memory + * @param d_tree Tree representation of the JSON * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return column_id, parent_column_id */ std::pair, rmm::device_uvector> generate_column_id( - device_span node_type, // level sorted - device_span parent_indices, // level sorted - device_span d_level_boundaries, + device_span d_input, + tree_meta_t const& d_tree, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); + auto const num_nodes = d_tree.node_categories.size(); - auto const num_nodes = node_type.size(); - rmm::device_uvector col_id(num_nodes, stream, mr); - rmm::device_uvector parent_col_id(num_nodes, stream); - if (num_nodes == 0) { return {std::move(col_id), std::move(parent_col_id)}; } - rmm::device_uvector scatter_indices(num_nodes, stream); - thrust::sequence(rmm::exec_policy(stream), scatter_indices.begin(), scatter_indices.end()); - // scatter 1 to level_boundaries alone, useful for scan later - thrust::scatter(rmm::exec_policy(stream), - thrust::make_constant_iterator(1), - thrust::make_constant_iterator(1) + d_level_boundaries.size() - 1, - d_level_boundaries.begin(), - col_id.begin()); - auto 
level_boundaries = cudf::detail::make_std_vector_async(d_level_boundaries, stream); - // Initialize First level node's node col_id to 0 - thrust::fill(rmm::exec_policy(stream), col_id.begin(), col_id.begin() + level_boundaries[0], 0); - // Initialize First level node's parent_col_id to parent_node_sentinel sentinel - thrust::fill(rmm::exec_policy(stream), - parent_col_id.begin(), - parent_col_id.begin() + level_boundaries[0], - parent_node_sentinel); - - // Per-level processing - auto const num_levels = level_boundaries.size(); - for (size_t level = 1; level < num_levels; level++) { - // Gather the each node's parent's column id for the nodes of the current level - thrust::gather(rmm::exec_policy(stream), - parent_indices.data() + level_boundaries[level - 1], - parent_indices.data() + level_boundaries[level], - col_id.data(), - parent_col_id.data() + level_boundaries[level - 1]); - - // To invoke Radix sort for keys {parent_col_id, node_type} instead of merge sort, - // we need to split to 2 Radix sorts. - // Secondary sort on node_type - - thrust::stable_sort_by_key( - rmm::exec_policy(stream), - node_type.data() + level_boundaries[level - 1], - node_type.data() + level_boundaries[level], - thrust::make_zip_iterator(parent_col_id.begin() + level_boundaries[level - 1], - scatter_indices.begin())); - // Primary sort on parent_col_id - thrust::stable_sort_by_key( - rmm::exec_policy(stream), - parent_col_id.begin() + level_boundaries[level - 1], - parent_col_id.begin() + level_boundaries[level], - thrust::make_zip_iterator(node_type.data() + level_boundaries[level - 1], - scatter_indices.begin())); - - auto start_it = thrust::make_zip_iterator(parent_col_id.begin() + level_boundaries[level - 1], - node_type.data() + level_boundaries[level - 1]); - auto adjacent_pair_it = thrust::make_zip_iterator(start_it - 1, start_it); - // Compares two adjacent items, beginning with the first and second item from the current level. - // Writes flags to the index of the rhs item. 
- // First index holds next col_id from previous level. - thrust::transform(rmm::exec_policy(stream), - adjacent_pair_it + 1, - adjacent_pair_it + level_boundaries[level] - level_boundaries[level - 1], - col_id.data() + level_boundaries[level - 1] + 1, - [] __device__(auto adjacent_pair) -> size_type { - auto const lhs = thrust::get<0>(adjacent_pair); - auto const rhs = thrust::get<1>(adjacent_pair); - return lhs != rhs ? 1 : 0; - }); - - // includes previous level last col_id to continue the index. - thrust::inclusive_scan(rmm::exec_policy(stream), - col_id.data() + level_boundaries[level - 1], - col_id.data() + level_boundaries[level] + (level != num_levels - 1), - // +1 only for not-last-levels, for next level start col_id - col_id.data() + level_boundaries[level - 1]); - - // scatter to restore original order. - auto const num_nodes_per_level = level_boundaries[level] - level_boundaries[level - 1]; - { - rmm::device_uvector tmp_col_id(num_nodes_per_level, stream); - rmm::device_uvector tmp_parent_col_id(num_nodes_per_level, stream); - thrust::scatter(rmm::exec_policy(stream), - thrust::make_zip_iterator(col_id.begin() + level_boundaries[level - 1], - parent_col_id.data() + level_boundaries[level - 1]), - thrust::make_zip_iterator(col_id.begin() + level_boundaries[level], - parent_col_id.data() + level_boundaries[level]), - scatter_indices.begin(), - thrust::make_zip_iterator(tmp_col_id.begin(), tmp_parent_col_id.begin())); - thrust::copy(rmm::exec_policy(stream), - tmp_col_id.begin(), - tmp_col_id.end(), - col_id.begin() + level_boundaries[level - 1]); - thrust::copy(rmm::exec_policy(stream), - tmp_parent_col_id.begin(), - tmp_parent_col_id.end(), - parent_col_id.begin() + level_boundaries[level - 1]); - } - thrust::sequence(rmm::exec_policy(stream), - scatter_indices.begin(), - scatter_indices.begin() + num_nodes_per_level); - } + // Two level hashing: + // one for field names -> node_type and, + // another for {node_level, node_category} + field hash for the 
entire path + // which is {node_level, node_type} recursively using parent_node_id + auto [col_id, unique_keys] = [&]() { + // Convert node_category + field_name to node_type. + rmm::device_uvector node_type = + hash_node_type_with_field_name(d_input, d_tree, stream); + + // hash entire path from node to root. + return hash_node_path(d_tree.node_levels, node_type, d_tree.parent_node_ids, stream, mr); + }(); + + thrust::sort(rmm::exec_policy(stream), unique_keys.begin(), unique_keys.end()); + thrust::lower_bound(rmm::exec_policy(stream), + unique_keys.begin(), + unique_keys.end(), + col_id.begin(), + col_id.end(), + col_id.begin()); + + rmm::device_uvector parent_col_id(num_nodes, stream, mr); + thrust::transform(rmm::exec_policy(stream), + d_tree.parent_node_ids.begin(), + d_tree.parent_node_ids.end(), + parent_col_id.begin(), + [col_id = col_id.begin()] __device__(auto node_id) { + return node_id >= 0 ? col_id[node_id] : parent_node_sentinel; + }); return {std::move(col_id), std::move(parent_col_id)}; } /** * @brief Computes row indices of each node in the hierarchy. - * 5. Generate row_offset. - * a. stable_sort by parent_col_id. - * b. scan_by_key {parent_col_id} (required only on nodes who's parent is list) - * c. propagate to non-list leaves from parent list node by recursion + * 2. Generate row_offset. + * a. Extract only list children + * b. stable_sort by parent_col_id. + * c. scan_by_key {parent_col_id} (done only on nodes who's parent is list) + * d. propagate to non-list leaves from parent list node by recursion * * pre-condition: - * scatter_indices is a sequence, representing node_id. * d_tree.node_categories, d_tree.parent_node_ids, parent_col_id are in order of node_id. * post-condition: row_offsets is in order of node_id. - * parent_col_id and scatter_indices are sorted by parent_col_id. (unused after this function) - * @param scatter_indices node_id + * parent_col_id is moved and reused inside this function. 
* @param parent_col_id parent node's column id * @param d_tree Tree representation of the JSON string * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * @return row_offsets */ -rmm::device_uvector compute_row_offsets(device_span scatter_indices, - rmm::device_uvector&& parent_col_id, - tree_meta_t& d_tree, +rmm::device_uvector compute_row_offsets(rmm::device_uvector&& parent_col_id, + tree_meta_t const& d_tree, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); auto const num_nodes = d_tree.node_categories.size(); - // TODO generate scatter_indices sequences here itself - thrust::stable_sort_by_key( - rmm::exec_policy(stream), parent_col_id.begin(), parent_col_id.end(), scatter_indices.begin()); + + rmm::device_uvector scatter_indices(num_nodes, stream); + thrust::sequence(rmm::exec_policy(stream), scatter_indices.begin(), scatter_indices.end()); + + // Extract only list children. (nodes who's parent is a list/root) + auto const list_parent_end = + thrust::remove_if(rmm::exec_policy(stream), + thrust::make_zip_iterator(parent_col_id.begin(), scatter_indices.begin()), + thrust::make_zip_iterator(parent_col_id.end(), scatter_indices.end()), + d_tree.parent_node_ids.begin(), + [node_categories = d_tree.node_categories.begin()] __device__(auto pnid) { + return !(pnid == parent_node_sentinel || node_categories[pnid] == NC_LIST); + }); + auto const num_list_parent = thrust::distance( + thrust::make_zip_iterator(parent_col_id.begin(), scatter_indices.begin()), list_parent_end); + + thrust::stable_sort_by_key(rmm::exec_policy(stream), + parent_col_id.begin(), + parent_col_id.begin() + num_list_parent, + scatter_indices.begin()); + rmm::device_uvector row_offsets(num_nodes, stream, mr); // TODO is it possible to generate list child_offsets too here? 
- thrust::exclusive_scan_by_key( - rmm::exec_policy(stream), - parent_col_id.begin(), // TODO: is there any way to limit this to list parents alone? - parent_col_id.end(), - thrust::make_constant_iterator(1), - row_offsets.begin()); + // write only 1st child offset to parent node id child_offsets? + thrust::exclusive_scan_by_key(rmm::exec_policy(stream), + parent_col_id.begin(), + parent_col_id.begin() + num_list_parent, + thrust::make_constant_iterator(1), + row_offsets.begin()); // Using scatter instead of sort. auto& temp_storage = parent_col_id; // reuse parent_col_id as temp storage thrust::scatter(rmm::exec_policy(stream), row_offsets.begin(), - row_offsets.end(), + row_offsets.begin() + num_list_parent, scatter_indices.begin(), temp_storage.begin()); row_offsets = std::move(temp_storage); @@ -601,126 +712,37 @@ rmm::device_uvector compute_row_offsets(device_span scatte }, [node_categories = d_tree.node_categories.data(), parent_node_ids = d_tree.parent_node_ids.begin()] __device__(size_type node_id) { - auto parent_node_id = parent_node_ids[node_id]; + auto const parent_node_id = parent_node_ids[node_id]; return parent_node_id != parent_node_sentinel and !(node_categories[parent_node_id] == node_t::NC_LIST); }); return row_offsets; } -/** -@note -This algorithm assigns a unique column id to each node in the tree. -The row offset is the row index of the node in that column id. -Algorithm: -1. Convert node_category+fieldname to node_type. - a. Create a hashmap to hash field name and assign unique node id as values. - b. Convert the node categories to node types. - Node type is defined as node category enum value if it is not a field node, - otherwise it is the unique node id assigned by the hashmap (value shifted by #NUM_CATEGORY). -2. Preprocessing: Translate parent node ids after sorting by level. - a. sort by level - b. get gather map of sorted indices - c. translate parent_node_ids to new sorted indices -3. Find level boundaries. 
- copy_if index of first unique values of sorted levels. -4. Per-Level Processing: Propagate parent node ids for each level. - For each level, - a. gather col_id from previous level results. input=col_id, gather_map is parent_indices. - b. stable sort by {parent_col_id, node_type} - c. scan sum of unique {parent_col_id, node_type} - d. scatter the col_id back to stable node_level order (using scatter_indices) - Restore original node_id order -5. Generate row_offset. - a. stable_sort by parent_col_id. - b. scan_by_key {parent_col_id} (required only on nodes whose parent is a list) - c. propagate to non-list leaves from parent list node by recursion -**/ +// This algorithm assigns a unique column id to each node in the tree. +// The row offset is the row index of the node in that column id. +// Algorithm: +// 1. Generate col_id: +// a. Set operation on entire path of each node, translate each node id to set id. +// b. gather unique set ids. +// c. sort and use binary search to generate column ids. +// d. Translate parent node ids to parent column ids. +// 2. Generate row_offset. +// a. filter only list childs +// a. stable_sort by parent_col_id. +// b. scan_by_key {parent_col_id} (done only on nodes whose parent is a list) +// c. propagate to non-list leaves from parent list node by recursion std::tuple, rmm::device_uvector> records_orient_tree_traversal(device_span d_input, - tree_meta_t& d_tree, + tree_meta_t const& d_tree, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - // 1. Convert node_category + field_name to node_type. - - auto num_nodes = d_tree.node_categories.size(); - rmm::device_uvector node_type = - hash_node_type_with_field_name(d_input, d_tree, stream); - // TODO two-level hashing: one for field names - // and another for {node-level, node_category} + field hash for the entire path - - // 2. Preprocessing: Translate parent node ids after sorting by level. - // a. sort by level - // b. 
get gather map of sorted indices - // c. translate parent_node_ids to sorted indices - - rmm::device_uvector scatter_indices(num_nodes, stream); - thrust::sequence(rmm::exec_policy(stream), scatter_indices.begin(), scatter_indices.end()); - - rmm::device_uvector parent_node_ids(d_tree.parent_node_ids, stream); // make a copy - auto out_pid = - thrust::make_zip_iterator(scatter_indices.data(), parent_node_ids.data(), node_type.data()); - // Uses cub radix sort. sort by level - thrust::stable_sort_by_key(rmm::exec_policy(stream), - d_tree.node_levels.data(), - d_tree.node_levels.data() + num_nodes, - out_pid); - - rmm::device_uvector parent_indices = - translate_sorted_parent_node_indices(scatter_indices, parent_node_ids, stream); - // TODO optimize memory usage: parent_node_ids is no longer needed - - // 3. Find level boundaries. - auto level_boundaries = [&]() { - if (d_tree.node_levels.is_empty()) return rmm::device_uvector{0, stream}; - // Already node_levels is sorted - auto max_level = d_tree.node_levels.back_element(stream); - rmm::device_uvector level_boundaries(max_level + 1, stream); - // TODO try reduce_by_key - auto level_end = - thrust::copy_if(rmm::exec_policy(stream), - thrust::make_counting_iterator(1), - thrust::make_counting_iterator(num_nodes + 1), - level_boundaries.begin(), - [num_nodes, node_levels = d_tree.node_levels.begin()] __device__(auto index) { - return index == num_nodes || node_levels[index] != node_levels[index - 1]; - }); - CUDF_EXPECTS(thrust::distance(level_boundaries.begin(), level_end) == max_level + 1, - "num_levels != max_level + 1"); - return level_boundaries; - }; - - // 4. Per-Level Processing: Propagate parent node ids for each level. 
- auto [col_id, parent_col_id] = generate_column_id(node_type, // level sorted - parent_indices, // level sorted - level_boundaries(), - stream, - mr); - - // restore original order of col_id, parent_col_id and used d_tree members - { - rmm::device_uvector tmp_col_id(num_nodes, stream); - rmm::device_uvector tmp_parent_col_id(num_nodes, stream); - rmm::device_uvector tmp_node_levels(num_nodes, stream); - thrust::scatter( - rmm::exec_policy(stream), - thrust::make_zip_iterator(col_id.begin(), parent_col_id.begin(), d_tree.node_levels.begin()), - thrust::make_zip_iterator(col_id.end(), parent_col_id.end(), d_tree.node_levels.end()), - scatter_indices.begin(), - thrust::make_zip_iterator( - tmp_col_id.begin(), tmp_parent_col_id.begin(), tmp_node_levels.begin())); - col_id = std::move(tmp_col_id); - parent_col_id = std::move(tmp_parent_col_id); - d_tree.node_levels = std::move(tmp_node_levels); - thrust::sequence(rmm::exec_policy(stream), scatter_indices.begin(), scatter_indices.end()); - } + auto [new_col_id, new_parent_col_id] = generate_column_id(d_input, d_tree, stream, mr); - // 5. Generate row_offset. 
- auto row_offsets = - compute_row_offsets(scatter_indices, std::move(parent_col_id), d_tree, stream, mr); - return std::tuple{std::move(col_id), std::move(row_offsets)}; + auto row_offsets = compute_row_offsets(std::move(new_parent_col_id), d_tree, stream, mr); + return std::tuple{std::move(new_col_id), std::move(row_offsets)}; } } // namespace detail diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 10d209b2ea6..35c09c89d8b 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -104,6 +104,9 @@ enum node_t : NodeT { */ enum class json_col_t : char { ListColumn, StructColumn, StringColumn, Unknown }; +// Default name for a list's child column +constexpr auto list_child_name{"element"}; + /** * @brief Intermediate representation of data from a nested JSON input */ @@ -319,7 +322,7 @@ tree_meta_t get_tree_representation( std::tuple, rmm::device_uvector> records_orient_tree_traversal( device_span d_input, - tree_meta_t& d_tree, + tree_meta_t const& d_tree, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 5d60a564b9b..0c35930c2e4 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1162,9 +1162,6 @@ void make_json_column(json_column& root_column, // Range of encapsulating function that parses to internal columnar data representation CUDF_FUNC_RANGE(); - // Default name for a list's child column - std::string const list_child_name = "element"; - // Parse the JSON and get the token stream const auto [d_tokens_gpu, d_token_indices_gpu] = get_token_stream(d_input, options, stream, mr); @@ -1286,7 +1283,7 @@ void make_json_column(json_column& root_column, * (b) a list, the selected child column corresponds to single child column of * the list column. In this case, the child column may not exist yet. 
*/ - auto get_selected_column = [&list_child_name](std::stack& current_data_path) { + auto get_selected_column = [](std::stack& current_data_path) { json_column* selected_col = current_data_path.top().current_selected_col; // If the node does not have a selected column yet @@ -1543,7 +1540,7 @@ auto parsing_options(cudf::io::json_reader_options const& options) { auto parse_opts = cudf::io::parse_options{',', '\n', '\"', '.'}; - auto const stream = cudf::default_stream_value; + auto const stream = cudf::get_default_stream(); parse_opts.dayfirst = options.is_enabled_dayfirst(); parse_opts.keepquotes = options.is_enabled_keep_quotes(); parse_opts.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); @@ -1680,7 +1677,8 @@ std::pair, std::vector> json_column_to size_type num_rows = json_col.child_offsets.size(); std::vector column_names{}; column_names.emplace_back("offsets"); - column_names.emplace_back(json_col.child_columns.begin()->first); + column_names.emplace_back( + json_col.child_columns.empty() ? list_child_name : json_col.child_columns.begin()->first); rmm::device_uvector d_offsets = cudf::detail::make_device_uvector_async(json_col.child_offsets, stream, mr); @@ -1688,12 +1686,15 @@ std::pair, std::vector> json_column_to std::make_unique(data_type{type_id::INT32}, num_rows, d_offsets.release()); // Create children column auto [child_column, names] = - json_column_to_cudf_column(json_col.child_columns.begin()->second, - d_input, - options, - get_child_schema(json_col.child_columns.begin()->first), - stream, - mr); + json_col.child_columns.empty() + ? 
std::pair, + std::vector>{std::make_unique(), {}} + : json_column_to_cudf_column(json_col.child_columns.begin()->second, + d_input, + options, + get_child_schema(json_col.child_columns.begin()->first), + stream, + mr); column_names.back().children = names; auto [result_bitmask, null_count] = make_validity(json_col); return {make_lists_column(num_rows - 1, diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 48b2af81fcd..4bbe91b61d2 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -26,6 +26,7 @@ #include #include +#include #include #include #include @@ -222,6 +223,7 @@ std::vector ingest_raw_input(std::vector> c size_t range_size, size_t range_size_padded) { + CUDF_FUNC_RANGE(); // Iterate through the user defined sources and read the contents into the local buffer size_t total_source_size = 0; for (const auto& source : sources) { @@ -313,6 +315,7 @@ rmm::device_uvector upload_data_to_device(json_reader_options const& reade rmm::device_uvector& rec_starts, rmm::cuda_stream_view stream) { + CUDF_FUNC_RANGE(); size_t end_offset = h_data.size(); // Trim lines that are outside range @@ -592,6 +595,7 @@ table_with_metadata read_json(std::vector>& sources, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + CUDF_FUNC_RANGE(); if (reader_opts.is_enabled_experimental()) { return experimental::read_json(sources, reader_opts, stream, mr); } diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu index 0b5de26adfc..898df3ef0f9 100644 --- a/cpp/src/io/orc/dict_enc.cu +++ b/cpp/src/io/orc/dict_enc.cu @@ -14,9 +14,9 @@ * limitations under the License. 
*/ -#include "orc_common.hpp" #include "orc_gpu.hpp" +#include #include #include diff --git a/cpp/src/io/orc/orc.hpp b/cpp/src/io/orc/orc.hpp index 2018024f566..44882b71925 100644 --- a/cpp/src/io/orc/orc.hpp +++ b/cpp/src/io/orc/orc.hpp @@ -16,11 +16,10 @@ #pragma once -#include "orc_common.hpp" - #include #include #include +#include #include #include @@ -37,6 +36,9 @@ namespace cudf { namespace io { namespace orc { + +static constexpr uint32_t block_header_size = 3; + struct PostScript { uint64_t footerLength = 0; // the length of the footer section in bytes CompressionKind compression = NONE; // the kind of generic compression used diff --git a/cpp/src/io/orc/orc_gpu.hpp b/cpp/src/io/orc/orc_gpu.hpp index c7a7a423cf2..1e4e36ee91c 100644 --- a/cpp/src/io/orc/orc_gpu.hpp +++ b/cpp/src/io/orc/orc_gpu.hpp @@ -19,8 +19,8 @@ #include "timezone.cuh" #include "orc.hpp" -#include "orc_common.hpp" +#include #include #include #include diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 7fb83b2a24e..0623e35741d 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -379,8 +379,10 @@ rmm::device_buffer reader::impl::decompress_stripe_data( device_span> inflate_out_view{inflate_out.data(), num_compressed_blocks}; switch (decompressor.compression()) { case compression_type::ZLIB: - // See https://github.com/rapidsai/cudf/issues/11812 - if (false) { + if (nvcomp::is_decompression_disabled(nvcomp::compression_type::DEFLATE)) { + gpuinflate( + inflate_in_view, inflate_out_view, inflate_res, gzip_header_included::NO, stream); + } else { nvcomp::batched_decompress(nvcomp::compression_type::DEFLATE, inflate_in_view, inflate_out_view, @@ -388,13 +390,12 @@ rmm::device_buffer reader::impl::decompress_stripe_data( max_uncomp_block_size, total_decomp_size, stream); - } else { - gpuinflate( - inflate_in_view, inflate_out_view, inflate_res, gzip_header_included::NO, stream); } break; case compression_type::SNAPPY: - if 
(nvcomp_integration::is_stable_enabled()) { + if (nvcomp::is_decompression_disabled(nvcomp::compression_type::SNAPPY)) { + gpu_unsnap(inflate_in_view, inflate_out_view, inflate_res, stream); + } else { nvcomp::batched_decompress(nvcomp::compression_type::SNAPPY, inflate_in_view, inflate_out_view, @@ -402,11 +403,13 @@ rmm::device_buffer reader::impl::decompress_stripe_data( max_uncomp_block_size, total_decomp_size, stream); - } else { - gpu_unsnap(inflate_in_view, inflate_out_view, inflate_res, stream); } break; case compression_type::ZSTD: + if (auto const reason = nvcomp::is_decompression_disabled(nvcomp::compression_type::ZSTD); + reason) { + CUDF_FAIL("Decompression error: " + reason.value()); + } nvcomp::batched_decompress(nvcomp::compression_type::ZSTD, inflate_in_view, inflate_out_view, @@ -522,8 +525,8 @@ void update_null_mask(cudf::detail::hostdevice_2dvector& chunks parent_mask_len, mask_state::ALL_NULL, rmm::cuda_stream_view(stream), mr); auto merged_mask = static_cast(merged_null_mask.data()); uint32_t* dst_idx_ptr = dst_idx.data(); - // Copy child valid bits from child column to valid indexes, this will merge both child and - // parent null masks + // Copy child valid bits from child column to valid indexes, this will merge both child + // and parent null masks thrust::for_each(rmm::exec_policy(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(0) + dst_idx.size(), @@ -964,7 +967,6 @@ table_with_metadata reader::impl::read(size_type skip_rows, // Association between each ORC column and its cudf::column _col_meta.orc_col_map.emplace_back(_metadata.get_num_cols(), -1); std::vector nested_col; - bool is_data_empty = false; // Get a list of column data types std::vector column_types; @@ -988,7 +990,6 @@ table_with_metadata reader::impl::read(size_type skip_rows, // Map each ORC column to its column _col_meta.orc_col_map[level][col.id] = column_types.size() - 1; - // TODO: Once MAP type is supported in cuDF, update this for MAP as 
well if (col_type == type_id::LIST or col_type == type_id::STRUCT) nested_col.emplace_back(col); } @@ -1048,6 +1049,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, size_t num_rowgroups = 0; int stripe_idx = 0; + bool is_level_data_empty = true; std::vector, size_t>> read_tasks; for (auto const& stripe_source_mapping : selected_stripes) { // Iterate through the source files selected stripes @@ -1067,21 +1069,16 @@ table_with_metadata reader::impl::read(size_type skip_rows, stream_info, level == 0); - if (total_data_size == 0) { - CUDF_EXPECTS(stripe_info->indexLength == 0, "Invalid index rowgroup stream data"); - // In case ROW GROUP INDEX is not present and all columns are structs with no null - // stream, there is nothing to read at this level. - auto fn_check_dtype = [](auto dtype) { return dtype.id() == type_id::STRUCT; }; - CUDF_EXPECTS(std::all_of(column_types.begin(), column_types.end(), fn_check_dtype), - "Expected streams data within stripe"); - is_data_empty = true; - } + auto const is_stripe_data_empty = total_data_size == 0; + if (not is_stripe_data_empty) { is_level_data_empty = false; } + CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0, + "Invalid index rowgroup stream data"); stripe_data.emplace_back(total_data_size, stream); auto dst_base = static_cast(stripe_data.back().data()); // Coalesce consecutive streams into one read - while (not is_data_empty and stream_count < stream_info.size()) { + while (not is_stripe_data_empty and stream_count < stream_info.size()) { const auto d_dst = dst_base + stream_info[stream_count].dst_pos; const auto offset = stream_info[stream_count].offset; auto len = stream_info[stream_count].length; @@ -1159,7 +1156,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, if (chunk.type_kind == orc::TIMESTAMP) { chunk.timestamp_type_id = _timestamp_type.id(); } - if (not is_data_empty) { + if (not is_stripe_data_empty) { for (int k = 0; k < gpu::CI_NUM_STREAMS; k++) { 
chunk.streams[k] = dst_base + stream_info[chunk.strm_id[k]].dst_pos; } @@ -1196,7 +1193,8 @@ table_with_metadata reader::impl::read(size_type skip_rows, }); } // Setup row group descriptors if using indexes - if (_metadata.per_file_metadata[0].ps.compression != orc::NONE and not is_data_empty) { + if (_metadata.per_file_metadata[0].ps.compression != orc::NONE and + not is_level_data_empty) { auto decomp_data = decompress_stripe_data(chunks, stripe_data, *_metadata.per_file_metadata[0].decompressor, @@ -1239,7 +1237,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, out_buffers[level].emplace_back(column_types[i], n_rows, is_nullable, stream, _mr); } - if (not is_data_empty) { + if (not is_level_data_empty) { decode_stream_data(chunks, num_dict_entries, skip_rows, @@ -1253,7 +1251,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, // Extract information to process nested child columns if (nested_col.size()) { - if (not is_data_empty) { + if (not is_level_data_empty) { scan_null_counts(chunks, null_count_prefix_sums[level], stream); } row_groups.device_to_host(stream, true); diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu index bbff689082e..1303dd126ef 100644 --- a/cpp/src/io/orc/stats_enc.cu +++ b/cpp/src/io/orc/stats_enc.cu @@ -14,9 +14,9 @@ * limitations under the License. */ -#include "orc_common.hpp" #include "orc_gpu.hpp" +#include #include #include diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index c9cc0f04b3c..bf883986c84 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -14,12 +14,13 @@ * limitations under the License. 
*/ -#include +#include "orc_gpu.hpp" + +#include #include -#include -#include "orc_common.hpp" -#include "orc_gpu.hpp" +#include +#include namespace cudf { namespace io { diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index ef4bdd421fb..9032e3d2502 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -14,9 +14,9 @@ * limitations under the License. */ -#include "orc_common.hpp" #include "orc_gpu.hpp" +#include #include #include #include @@ -1179,8 +1179,9 @@ __global__ void __launch_bounds__(256) num_blocks = (ss.stream_size > 0) ? (ss.stream_size - 1) / comp_blk_size + 1 : 1; for (uint32_t b = t; b < num_blocks; b += 256) { uint32_t blk_size = min(comp_blk_size, ss.stream_size - min(b * comp_blk_size, ss.stream_size)); - inputs[ss.first_block + b] = {src + b * comp_blk_size, blk_size}; - auto const dst_offset = b * (padded_block_header_size + padded_comp_block_size); + inputs[ss.first_block + b] = {src + b * comp_blk_size, blk_size}; + auto const dst_offset = + padded_block_header_size + b * (padded_block_header_size + padded_comp_block_size); outputs[ss.first_block + b] = {dst + dst_offset, max_comp_blk_size}; results[ss.first_block + b] = {0, compression_status::FAILURE}; } @@ -1234,7 +1235,9 @@ __global__ void __launch_bounds__(1024) ? 
results[ss.first_block + b].bytes_written : src_len; uint32_t blk_size24{}; - if (results[ss.first_block + b].status == compression_status::SUCCESS) { + // Only use the compressed block if it's smaller than the uncompressed + // If compression failed, dst_len == src_len, so the uncompressed block will be used + if (src_len < dst_len) { // Copy from uncompressed source src = inputs[ss.first_block + b].data(); results[ss.first_block + b].bytes_written = src_len; @@ -1332,11 +1335,11 @@ void CompressOrcDataStreams(uint8_t* compressed_data, if (compression == SNAPPY) { try { - if (nvcomp::is_compression_enabled(nvcomp::compression_type::SNAPPY)) { + if (nvcomp::is_compression_disabled(nvcomp::compression_type::SNAPPY)) { + gpu_snap(comp_in, comp_out, comp_res, stream); + } else { nvcomp::batched_compress( nvcomp::compression_type::SNAPPY, comp_in, comp_out, comp_res, stream); - } else { - gpu_snap(comp_in, comp_out, comp_res, stream); } } catch (...) { // There was an error in compressing so set an error status for each block @@ -1348,12 +1351,18 @@ void CompressOrcDataStreams(uint8_t* compressed_data, // Since SNAPPY is the default compression (may not be explicitly requested), fall back to // writing without compression } - } else if (compression == ZLIB and - nvcomp::is_compression_enabled(nvcomp::compression_type::DEFLATE)) { + } else if (compression == ZLIB) { + if (auto const reason = nvcomp::is_compression_disabled(nvcomp::compression_type::DEFLATE); + reason) { + CUDF_FAIL("Compression error: " + reason.value()); + } nvcomp::batched_compress( nvcomp::compression_type::DEFLATE, comp_in, comp_out, comp_res, stream); - } else if (compression == ZSTD and - nvcomp::is_compression_enabled(nvcomp::compression_type::ZSTD)) { + } else if (compression == ZSTD) { + if (auto const reason = nvcomp::is_compression_disabled(nvcomp::compression_type::ZSTD); + reason) { + CUDF_FAIL("Compression error: " + reason.value()); + } 
nvcomp::batched_compress(nvcomp::compression_type::ZSTD, comp_in, comp_out, comp_res, stream); } else if (compression != NONE) { CUDF_FAIL("Unsupported compression type"); diff --git a/cpp/src/io/orc/stripe_init.cu b/cpp/src/io/orc/stripe_init.cu index bd65089810e..381a734021c 100644 --- a/cpp/src/io/orc/stripe_init.cu +++ b/cpp/src/io/orc/stripe_init.cu @@ -14,9 +14,9 @@ * limitations under the License. */ -#include "orc_common.hpp" #include "orc_gpu.hpp" +#include #include #include diff --git a/cpp/src/io/orc/timezone.cuh b/cpp/src/io/orc/timezone.cuh index 2eb20af7898..9b98aa13bac 100644 --- a/cpp/src/io/orc/timezone.cuh +++ b/cpp/src/io/orc/timezone.cuh @@ -115,7 +115,7 @@ class timezone_table { public: // Safe to use the default stream, device_uvectors will not change after they are created empty - timezone_table() : ttimes{0, cudf::default_stream_value}, offsets{0, cudf::default_stream_value} + timezone_table() : ttimes{0, cudf::get_default_stream()}, offsets{0, cudf::get_default_stream()} { } timezone_table(int32_t gmt_offset, diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index a5e9e9da4cb..c0ae58a64d9 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -118,9 +118,9 @@ constexpr size_t compression_block_size(orc::CompressionKind compression) if (compression == orc::CompressionKind::NONE) { return 0; } auto const ncomp_type = to_nvcomp_compression_type(compression); - auto const nvcomp_limit = nvcomp::is_compression_enabled(ncomp_type) - ? nvcomp::compress_max_allowed_chunk_size(ncomp_type) - : std::nullopt; + auto const nvcomp_limit = nvcomp::is_compression_disabled(ncomp_type) + ? 
std::nullopt + : nvcomp::compress_max_allowed_chunk_size(ncomp_type); constexpr size_t max_block_size = 256 * 1024; return std::min(nvcomp_limit.value_or(max_block_size), max_block_size); @@ -537,7 +537,7 @@ constexpr size_t RLE_stream_size(TypeKind kind, size_t count) auto uncomp_block_alignment(CompressionKind compression_kind) { if (compression_kind == NONE or - not nvcomp::is_compression_enabled(to_nvcomp_compression_type(compression_kind))) { + nvcomp::is_compression_disabled(to_nvcomp_compression_type(compression_kind))) { return 1u; } @@ -547,7 +547,7 @@ auto uncomp_block_alignment(CompressionKind compression_kind) auto comp_block_alignment(CompressionKind compression_kind) { if (compression_kind == NONE or - not nvcomp::is_compression_enabled(to_nvcomp_compression_type(compression_kind))) { + nvcomp::is_compression_disabled(to_nvcomp_compression_type(compression_kind))) { return 1u; } @@ -2161,7 +2161,8 @@ void writer::impl::write(table_view const& table) auto dec_chunk_sizes = decimal_chunk_sizes(orc_table, segmentation, stream); - auto const uncomp_block_align = uncomp_block_alignment(compression_kind_); + auto const uncompressed_block_align = uncomp_block_alignment(compression_kind_); + auto const compressed_block_align = comp_block_alignment(compression_kind_); auto streams = create_streams(orc_table.columns, segmentation, decimal_column_sizes(dec_chunk_sizes.rg_sizes)); auto enc_data = encode_columns(orc_table, @@ -2169,7 +2170,7 @@ void writer::impl::write(table_view const& table) std::move(dec_chunk_sizes), segmentation, streams, - uncomp_block_align, + uncompressed_block_align, stream); // Assemble individual disparate column chunks into contiguous data streams @@ -2187,9 +2188,9 @@ void writer::impl::write(table_view const& table) auto const max_compressed_block_size = max_compression_output_size(compression_kind_, compression_blocksize_); auto const padded_max_compressed_block_size = - util::round_up_unsafe(max_compressed_block_size, 
uncomp_block_align); + util::round_up_unsafe(max_compressed_block_size, compressed_block_align); auto const padded_block_header_size = - util::round_up_unsafe(block_header_size, uncomp_block_align); + util::round_up_unsafe(block_header_size, compressed_block_align); auto stream_output = [&]() { size_t max_stream_size = 0; @@ -2238,7 +2239,7 @@ void writer::impl::write(table_view const& table) compression_kind_, compression_blocksize_, max_compressed_block_size, - comp_block_alignment(compression_kind_), + compressed_block_align, strm_descs, enc_data.streams, comp_results, diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index 671e34ac73d..999cad76d5d 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -14,7 +14,7 @@ * limitations under the License. */ -#include +#include "parquet_gpu.cuh" #include #include diff --git a/cpp/src/io/parquet/compact_protocol_writer.cpp b/cpp/src/io/parquet/compact_protocol_writer.cpp index 28baad9c51c..f5ae262fa3f 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.cpp +++ b/cpp/src/io/parquet/compact_protocol_writer.cpp @@ -150,7 +150,7 @@ size_t CompactProtocolWriter::write(const SchemaElement& s) // if (isset.STRING or isset.MAP or isset.LIST or isset.ENUM or isset.DECIMAL or isset.DATE or // isset.TIME or isset.TIMESTAMP or isset.INTEGER or isset.UNKNOWN or isset.JSON or isset.BSON) // { - if (isset.TIMESTAMP) { c.field_struct(10, s.logical_type); } + if (isset.TIMESTAMP or isset.TIME) { c.field_struct(10, s.logical_type); } return c.value(); } diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 531733a7df7..c580aa5bbc0 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -18,8 +18,10 @@ #include #include +#include #include #include +#include #include #include @@ -52,6 +54,8 @@ namespace io { namespace parquet { namespace gpu { +namespace { + struct page_state_s { const uint8_t* data_start; const 
uint8_t* data_end; @@ -146,11 +150,18 @@ __device__ uint32_t InitLevelSection(page_state_s* s, s->initial_rle_value[lvl] = 0; s->lvl_start[lvl] = cur; } else if (encoding == Encoding::RLE) { - if (cur + 4 < end) { - uint32_t run; + // V2 only uses RLE encoding, so only perform check here + if (s->page.def_lvl_bytes || s->page.rep_lvl_bytes) { + len = lvl == level_type::DEFINITION ? s->page.def_lvl_bytes : s->page.rep_lvl_bytes; + } else if (cur + 4 < end) { len = 4 + (cur[0]) + (cur[1] << 8) + (cur[2] << 16) + (cur[3] << 24); cur += 4; - run = get_vlq32(cur, end); + } else { + len = 0; + s->error = 2; + } + if (!s->error) { + uint32_t run = get_vlq32(cur, end); s->initial_rle_run[lvl] = run; if (!(run & 1)) { int v = (cur < end) ? cur[0] : 0; @@ -163,9 +174,6 @@ __device__ uint32_t InitLevelSection(page_state_s* s, } s->lvl_start[lvl] = cur; if (cur > end) { s->error = 2; } - } else { - len = 0; - s->error = 2; } } else if (encoding == Encoding::BIT_PACKED) { len = (s->page.num_input_values * level_bits + 7) >> 3; @@ -176,7 +184,7 @@ __device__ uint32_t InitLevelSection(page_state_s* s, s->error = 3; len = 0; } - return (uint32_t)len; + return static_cast(len); } /** @@ -277,13 +285,18 @@ __device__ void gpuDecodeStream( * 31) * @param[in] t Warp1 thread ID (0..31) * - * @return The new output position + * @return A pair containing the new output position, and the total length of strings decoded (this + * will only be valid on thread 0 and if sizes_only is true) */ -__device__ int gpuDecodeDictionaryIndices(volatile page_state_s* s, int target_pos, int t) +template +__device__ cuda::std::pair gpuDecodeDictionaryIndices(volatile page_state_s* s, + int target_pos, + int t) { const uint8_t* end = s->data_end; int dict_bits = s->dict_bits; int pos = s->dict_pos; + int str_len = 0; while (pos < target_pos) { int is_literal, batch_len; @@ -328,8 +341,11 @@ __device__ int gpuDecodeDictionaryIndices(volatile page_state_s* s, int target_p __syncwarp(); is_literal = 
shuffle(is_literal); batch_len = shuffle(batch_len); + + // compute dictionary index. + int dict_idx = 0; if (t < batch_len) { - int dict_idx = s->dict_val; + dict_idx = s->dict_val; if (is_literal) { int32_t ofs = (t - ((batch_len + 7) & ~7)) * dict_bits; const uint8_t* p = s->data_start + (ofs >> 3); @@ -349,11 +365,36 @@ __device__ int gpuDecodeDictionaryIndices(volatile page_state_s* s, int target_p dict_idx &= (1 << dict_bits) - 1; } } - s->dict_idx[(pos + t) & (non_zero_buffer_size - 1)] = dict_idx; + + // if we're not computing sizes, store off the dictionary index + if constexpr (!sizes_only) { s->dict_idx[(pos + t) & (non_zero_buffer_size - 1)] = dict_idx; } + } + + // if we're computing sizes, add the length(s) + if constexpr (sizes_only) { + int const len = [&]() { + if (t >= batch_len) { return 0; } + // we may end up decoding more indices than we asked for. so don't include those in the + // size calculation + if (pos + t >= target_pos) { return 0; } + // TODO: refactor this with gpuGetStringData / gpuGetStringSize + uint32_t const dict_pos = (s->dict_bits > 0) ? dict_idx * sizeof(string_index_pair) : 0; + if (target_pos && dict_pos < (uint32_t)s->dict_size) { + const auto* src = reinterpret_cast(s->dict_base + dict_pos); + return src->second; + } + return 0; + }(); + + using WarpReduce = cub::WarpReduce; + __shared__ typename WarpReduce::TempStorage temp_storage; + // note: str_len will only be valid on thread 0. 
+ str_len += WarpReduce(temp_storage).Sum(len); } + pos += batch_len; } - return pos; + return {pos, str_len}; } /** @@ -420,17 +461,20 @@ __device__ int gpuDecodeRleBooleans(volatile page_state_s* s, int target_pos, in } /** - * @brief Parses the length and position of strings + * @brief Parses the length and position of strings and returns total length of all strings + * processed * * @param[in,out] s Page state input/output * @param[in] target_pos Target output position * @param[in] t Thread ID * - * @return The new output position + * @return Total length of strings processed */ -__device__ void gpuInitStringDescriptors(volatile page_state_s* s, int target_pos, int t) +__device__ size_type gpuInitStringDescriptors(volatile page_state_s* s, int target_pos, int t) { - int pos = s->dict_pos; + int pos = s->dict_pos; + int total_len = 0; + // This step is purely serial if (!t) { const uint8_t* cur = s->data_start; @@ -449,21 +493,26 @@ __device__ void gpuInitStringDescriptors(volatile page_state_s* s, int target_po s->dict_idx[pos & (non_zero_buffer_size - 1)] = k; s->str_len[pos & (non_zero_buffer_size - 1)] = len; k += len; + total_len += len; pos++; } s->dict_val = k; __threadfence_block(); } + + return total_len; } /** - * @brief Output a string descriptor + * @brief Retrieves string information for a string at the specified source position * - * @param[in,out] s Page state input/output + * @param[in] s Page state input * @param[in] src_pos Source position - * @param[in] dstv Pointer to row output data (string descriptor or 32-bit hash) + * + * @return A pair containing a pointer to the string and its length */ -inline __device__ void gpuOutputString(volatile page_state_s* s, int src_pos, void* dstv) +inline __device__ cuda::std::pair gpuGetStringData(volatile page_state_s* s, + int src_pos) { const char* ptr = nullptr; size_t len = 0; @@ -486,6 +535,20 @@ inline __device__ void gpuOutputString(volatile page_state_s* s, int src_pos, vo len = s->str_len[src_pos 
& (non_zero_buffer_size - 1)]; } } + + return {ptr, len}; +} + +/** + * @brief Output a string descriptor + * + * @param[in,out] s Page state input/output + * @param[in] src_pos Source position + * @param[in] dstv Pointer to row output data (string descriptor or 32-bit hash) + */ +inline __device__ void gpuOutputString(volatile page_state_s* s, int src_pos, void* dstv) +{ + auto [ptr, len] = gpuGetStringData(s, src_pos); if (s->dtype_len == 4) { // Output hash. This hash value is used if the option to convert strings to // categoricals is enabled. The seed value is chosen arbitrarily. @@ -814,14 +877,17 @@ static __device__ void gpuOutputGeneric(volatile page_state_s* s, * @param[in, out] s The local page state to be filled in * @param[in] p The global page to be copied from * @param[in] chunks The global list of chunks - * @param[in] num_rows Maximum number of rows to read * @param[in] min_row Crop all rows below min_row + * @param[in] num_rows Maximum number of rows to read + * @param[in] is_decode_step If we are setting up for the decode step (instead of the preprocess + * step) */ static __device__ bool setupLocalPageInfo(page_state_s* const s, PageInfo const* p, device_span chunks, size_t min_row, - size_t num_rows) + size_t num_rows, + bool is_decode_step) { int t = threadIdx.x; int chunk_idx; @@ -872,15 +938,15 @@ static __device__ bool setupLocalPageInfo(page_state_s* const s, case BOOLEAN: s->dtype_len = 1; // Boolean are stored as 1 byte on the output break; - case INT32: + case INT32: [[fallthrough]]; case FLOAT: s->dtype_len = 4; break; case INT64: if (s->col.ts_clock_rate) { int32_t units = 0; - if (s->col.converted_type == TIME_MILLIS or s->col.converted_type == TIMESTAMP_MILLIS) { + // Duration types are not included because no scaling is done when reading + if (s->col.converted_type == TIMESTAMP_MILLIS) { units = cudf::timestamp_ms::period::den; - } else if (s->col.converted_type == TIME_MICROS or - s->col.converted_type == TIMESTAMP_MICROS) { + } 
else if (s->col.converted_type == TIMESTAMP_MICROS) { units = cudf::timestamp_us::period::den; } else if (s->col.logical_type.TIMESTAMP.unit.isset.NANOS) { units = cudf::timestamp_ns::period::den; @@ -890,7 +956,7 @@ static __device__ bool setupLocalPageInfo(page_state_s* const s, : (s->col.ts_clock_rate / units); } } - // Fall through to DOUBLE + [[fallthrough]]; case DOUBLE: s->dtype_len = 8; break; case INT96: s->dtype_len = 12; break; case BYTE_ARRAY: s->dtype_len = sizeof(string_index_pair); break; @@ -906,25 +972,41 @@ static __device__ bool setupLocalPageInfo(page_state_s* const s, : s->dtype_len <= sizeof(int64_t) ? sizeof(int64_t) : sizeof(__int128_t); } else if (data_type == INT32) { - if (dtype_len_out == 1) s->dtype_len = 1; // INT8 output - if (dtype_len_out == 2) s->dtype_len = 2; // INT16 output + if (dtype_len_out == 1) { + // INT8 output + s->dtype_len = 1; + } else if (dtype_len_out == 2) { + // INT16 output + s->dtype_len = 2; + } else if (s->col.converted_type == TIME_MILLIS) { + // INT64 output + s->dtype_len = 8; + } } else if (data_type == BYTE_ARRAY && dtype_len_out == 4) { s->dtype_len = 4; // HASH32 output } else if (data_type == INT96) { s->dtype_len = 8; // Convert to 64-bit timestamp } - // first row within the page to output - if (page_start_row >= min_row) { - s->first_row = 0; - } else { - s->first_row = (int32_t)min(min_row - page_start_row, (size_t)s->page.num_rows); - } - // # of rows within the page to output - s->num_rows = s->page.num_rows; - if ((page_start_row + s->first_row) + s->num_rows > min_row + num_rows) { - s->num_rows = - (int32_t)max((int64_t)(min_row + num_rows - (page_start_row + s->first_row)), INT64_C(0)); + // NOTE: s->page.num_rows, s->col.chunk_row, s->first_row and s->num_rows will be + // invalid/bogus during first pass of the preprocess step for nested types. this is ok + // because we ignore these values in that stage. 
+ { + auto const max_row = min_row + num_rows; + + // if we are totally outside the range of the input, do nothing + if ((page_start_row > max_row) || (page_start_row + s->page.num_rows < min_row)) { + s->first_row = 0; + s->num_rows = 0; + } + // otherwise + else { + s->first_row = page_start_row >= min_row ? 0 : min_row - page_start_row; + auto const max_page_rows = s->page.num_rows - s->first_row; + s->num_rows = (page_start_row + s->first_row) + max_page_rows <= max_row + ? max_page_rows + : max_row - (page_start_row + s->first_row); + } } // during the decoding step we need to offset the global output buffers @@ -932,7 +1014,11 @@ static __device__ bool setupLocalPageInfo(page_state_s* const s, // is responsible for. // - for flat schemas, we can do this directly by using row counts // - for nested schemas, these offsets are computed during the preprocess step - if (s->col.column_data_base != nullptr) { + // + // NOTE: in a chunked read situation, s->col.column_data_base and s->col.valid_map_base + // will be aliased to memory that has been freed when we get here in the non-decode step, so + // we cannot check against nullptr. we'll just check a flag directly. + if (is_decode_step) { int max_depth = s->col.max_nesting_depth; for (int idx = 0; idx < max_depth; idx++) { PageNestingInfo* pni = &s->page.nesting[idx]; @@ -942,12 +1028,13 @@ static __device__ bool setupLocalPageInfo(page_state_s* const s, if (s->col.max_level[level_type::REPETITION] == 0) { output_offset = page_start_row >= min_row ? page_start_row - min_row : 0; } - // for schemas with lists, we've already got the exactly value precomputed + // for schemas with lists, we've already got the exact value precomputed else { output_offset = pni->page_start_value; } pni->data_out = static_cast(s->col.column_data_base[idx]); + if (pni->data_out != nullptr) { // anything below max depth with a valid data pointer must be a list, so the // element size is the size of the offset type. 
@@ -1024,6 +1111,7 @@ static __device__ bool setupLocalPageInfo(page_state_s* const s, s->page.skipped_leaf_values = 0; s->input_value_count = 0; s->input_row_count = 0; + s->input_leaf_count = 0; s->row_index_lower_bound = -1; } @@ -1052,13 +1140,14 @@ static __device__ bool setupLocalPageInfo(page_state_s* const s, // if we're in the decoding step, jump directly to the first // value we care about - if (s->col.column_data_base != nullptr) { + if (is_decode_step) { s->input_value_count = s->page.skipped_values > -1 ? s->page.skipped_values : 0; } else { - s->input_value_count = 0; - s->input_leaf_count = 0; - s->page.skipped_values = -1; - s->page.skipped_leaf_values = -1; + s->input_value_count = 0; + s->input_leaf_count = 0; + s->page.skipped_values = + -1; // magic number to indicate it hasn't been set for use inside UpdatePageSizes + s->page.skipped_leaf_values = 0; } } @@ -1175,7 +1264,8 @@ static __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_inpu int t) { // max nesting depth of the column - int const max_depth = s->col.max_nesting_depth; + int const max_depth = s->col.max_nesting_depth; + bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; // how many (input) values we've processed in the page so far int input_value_count = s->input_value_count; // how many rows we've processed in the page so far @@ -1235,7 +1325,7 @@ static __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_inpu uint32_t const warp_valid_mask = // for flat schemas, a simple ballot_sync gives us the correct count and bit positions // because every value in the input matches to a value in the output - max_depth == 1 + !has_repetition ? ballot(is_valid) : // for nested schemas, it's more complicated. This warp will visit 32 incoming values, @@ -1284,11 +1374,12 @@ static __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_inpu // the correct position to start reading. 
since we are about to write the validity vector here // we need to adjust our computed mask to take into account the write row bounds. int const in_write_row_bounds = - max_depth == 1 + !has_repetition ? thread_row_index >= s->first_row && thread_row_index < (s->first_row + s->num_rows) : in_row_bounds; int const first_thread_in_write_range = - max_depth == 1 ? __ffs(ballot(in_write_row_bounds)) - 1 : 0; + !has_repetition ? __ffs(ballot(in_write_row_bounds)) - 1 : 0; + // # of bits to of the validity mask to write out int const warp_valid_mask_bit_count = first_thread_in_write_range < 0 ? 0 : warp_value_count - first_thread_in_write_range; @@ -1383,8 +1474,7 @@ static __device__ void gpuUpdatePageSizes(page_state_s* s, bool bounds_set) { // max nesting depth of the column - int max_depth = s->col.max_nesting_depth; - // bool has_repetition = s->col.max_level[level_type::REPETITION] > 0 ? true : false; + int const max_depth = s->col.max_nesting_depth; // how many input level values we've processed in the page so far int input_value_count = s->input_value_count; // how many leaf values we've processed in the page so far @@ -1398,11 +1488,10 @@ static __device__ void gpuUpdatePageSizes(page_state_s* s, start_depth, end_depth, d, s, input_value_count, target_input_value_count, t); // count rows and leaf values - int is_new_row = start_depth == 0 ? 1 : 0; - uint32_t warp_row_count_mask = ballot(is_new_row); - int is_new_leaf = (d >= s->page.nesting[max_depth - 1].max_def_level) ? 1 : 0; - uint32_t warp_leaf_count_mask = ballot(is_new_leaf); - + int const is_new_row = start_depth == 0 ? 1 : 0; + uint32_t const warp_row_count_mask = ballot(is_new_row); + int const is_new_leaf = (d >= s->page.nesting[max_depth - 1].max_def_level) ? 1 : 0; + uint32_t const warp_leaf_count_mask = ballot(is_new_leaf); // is this thread within row bounds? on the first pass we don't know the bounds, so we will be // computing the full size of the column. 
on the second pass, we will know our actual row // bounds, so the computation will cap sizes properly. @@ -1416,8 +1505,8 @@ static __device__ void gpuUpdatePageSizes(page_state_s* s, ? 1 : 0; - uint32_t row_bounds_mask = ballot(in_row_bounds); - int first_thread_in_range = __ffs(row_bounds_mask) - 1; + uint32_t const row_bounds_mask = ballot(in_row_bounds); + int const first_thread_in_range = __ffs(row_bounds_mask) - 1; // if we've found the beginning of the first row, mark down the position // in the def/repetition buffer (skipped_values) and the data buffer (skipped_leaf_values) @@ -1430,13 +1519,15 @@ static __device__ void gpuUpdatePageSizes(page_state_s* s, } } - // increment counts across all nesting depths + // increment value counts across all nesting depths for (int s_idx = 0; s_idx < max_depth; s_idx++) { - // if we are within the range of nesting levels we should be adding value indices for - int in_nesting_bounds = (s_idx >= start_depth && s_idx <= end_depth && in_row_bounds) ? 1 : 0; + PageNestingInfo* pni = &s->page.nesting[s_idx]; - uint32_t count_mask = ballot(in_nesting_bounds); - if (!t) { s->page.nesting[s_idx].size += __popc(count_mask); } + // if we are within the range of nesting levels we should be adding value indices for + int const in_nesting_bounds = + (s_idx >= start_depth && s_idx <= end_depth && in_row_bounds) ? 
1 : 0; + uint32_t const count_mask = ballot(in_nesting_bounds); + if (!t) { pni->batch_size += __popc(count_mask); } } input_value_count += min(32, (target_input_value_count - input_value_count)); @@ -1452,6 +1543,21 @@ static __device__ void gpuUpdatePageSizes(page_state_s* s, } } +__device__ size_type gpuGetStringSize(page_state_s* s, int target_count, int t) +{ + auto dict_target_pos = target_count; + size_type str_len = 0; + if (s->dict_base) { + auto const [new_target_pos, len] = gpuDecodeDictionaryIndices(s, target_count, t); + dict_target_pos = new_target_pos; + str_len = len; + } else if ((s->col.data_type & 7) == BYTE_ARRAY) { + str_len = gpuInitStringDescriptors(s, target_count, t); + } + if (!t) { *(volatile int32_t*)&s->dict_pos = dict_target_pos; } + return str_len; +} + /** * @brief Kernel for computing per-page column size information for all nesting levels. * @@ -1460,17 +1566,20 @@ static __device__ void gpuUpdatePageSizes(page_state_s* s, * @param pages List of pages * @param chunks List of column chunks * @param min_row Row index to start reading at - * @param num_rows Maximum number of rows to read. Pass as INT_MAX to guarantee reading all rows. - * @param trim_pass Whether or not this is the trim pass. We first have to compute + * @param num_rows Maximum number of rows to read. Pass as INT_MAX to guarantee reading all rows + * @param is_base_pass Whether or not this is the base pass. We first have to compute * the full size information of every page before we come through in a second (trim) pass - * to determine what subset of rows in this page we should be reading. 
+ * to determine what subset of rows in this page we should be reading + * @param compute_string_sizes Whether or not we should be computing string sizes + * (PageInfo::str_bytes) as part of the pass */ __global__ void __launch_bounds__(block_size) gpuComputePageSizes(PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows, - bool trim_pass) + bool is_base_pass, + bool compute_string_sizes) { __shared__ __align__(16) page_state_s state_g; @@ -1479,32 +1588,82 @@ __global__ void __launch_bounds__(block_size) int t = threadIdx.x; PageInfo* pp = &pages[page_idx]; - if (!setupLocalPageInfo(s, pp, chunks, trim_pass ? min_row : 0, trim_pass ? num_rows : INT_MAX)) { - return; - } + if (!setupLocalPageInfo(s, pp, chunks, min_row, num_rows, false)) { return; } - // zero sizes - int d = 0; - while (d < s->page.num_nesting_levels) { - if (d + t < s->page.num_nesting_levels) { s->page.nesting[d + t].size = 0; } - d += blockDim.x; - } if (!t) { s->page.skipped_values = -1; - s->page.skipped_leaf_values = -1; + s->page.skipped_leaf_values = 0; + s->page.str_bytes = 0; s->input_row_count = 0; s->input_value_count = 0; - // if this isn't the trim pass, make sure we visit absolutely everything - if (!trim_pass) { + // in the base pass, we're computing the number of rows, make sure we visit absolutely + // everything + if (is_base_pass) { s->first_row = 0; s->num_rows = INT_MAX; s->row_index_lower_bound = -1; } } - __syncthreads(); - bool has_repetition = s->col.max_level[level_type::REPETITION] > 0; + // we only need to preprocess hierarchies with repetition in them (ie, hierarchies + // containing lists anywhere within). + bool const has_repetition = chunks[pp->chunk_idx].max_level[level_type::REPETITION] > 0; + compute_string_sizes = + compute_string_sizes && ((s->col.data_type & 7) == BYTE_ARRAY && s->dtype_len != 4); + + // various early out optimizations: + + // - if this is a flat hierarchy (no lists) and is not a string column. 
in this case we don't need + // to do + // the expensive work of traversing the level data to determine sizes. we can just compute it + // directly. + if (!has_repetition && !compute_string_sizes) { + int d = 0; + while (d < s->page.num_nesting_levels) { + auto const i = d + t; + if (i < s->page.num_nesting_levels) { + if (is_base_pass) { pp->nesting[i].size = pp->num_input_values; } + pp->nesting[i].batch_size = pp->num_input_values; + } + d += blockDim.x; + } + return; + } + + // - if this page is not at the beginning or end of the trim bounds, the batch size is + // the full page size + if (!is_base_pass && s->num_rows == s->page.num_rows) { + int d = 0; + while (d < s->page.num_nesting_levels) { + auto const i = d + t; + if (i < s->page.num_nesting_levels) { pp->nesting[i].batch_size = pp->nesting[i].size; } + d += blockDim.x; + } + return; + } + + // - if this page is completely trimmed, zero out sizes. + if (!is_base_pass && s->num_rows == 0) { + int d = 0; + while (d < s->page.num_nesting_levels) { + auto const i = d + t; + if (i < s->page.num_nesting_levels) { pp->nesting[i].batch_size = 0; } + d += blockDim.x; + } + return; + } + + // at this point we are going to be fully recomputing batch information + + // zero sizes + int d = 0; + while (d < s->page.num_nesting_levels) { + if (d + t < s->page.num_nesting_levels) { s->page.nesting[d + t].batch_size = 0; } + d += blockDim.x; + } + + __syncthreads(); // optimization : it might be useful to have a version of gpuDecodeStream that could go wider than // 1 warp. 
Currently it only uses 1 warp so that it can overlap work with the value decoding step @@ -1528,16 +1687,39 @@ __global__ void __launch_bounds__(block_size) : s->lvl_count[level_type::DEFINITION]; // process what we got back - gpuUpdatePageSizes(s, actual_input_count, t, trim_pass); + gpuUpdatePageSizes(s, actual_input_count, t, !is_base_pass); + if (compute_string_sizes) { + auto const str_len = gpuGetStringSize(s, s->input_leaf_count, t); + if (!t) { s->page.str_bytes += str_len; } + } + target_input_count = actual_input_count + batch_size; __syncwarp(); } } - // update # rows in the actual page + + // update output results: + // - real number of rows for the whole page + // - nesting sizes for the whole page + // - skipped value information for trimmed pages + // - string bytes + if (is_base_pass) { + // nesting level 0 is the root column, so the size is also the # of rows + if (!t) { pp->num_rows = s->page.nesting[0].batch_size; } + + // store off this batch size as the "full" size + int d = 0; + while (d < s->page.num_nesting_levels) { + auto const i = d + t; + if (i < s->page.num_nesting_levels) { pp->nesting[i].size = pp->nesting[i].batch_size; } + d += blockDim.x; + } + } + if (!t) { - pp->num_rows = s->page.nesting[0].size; pp->skipped_values = s->page.skipped_values; pp->skipped_leaf_values = s->page.skipped_leaf_values; + pp->str_bytes = s->page.str_bytes; } } @@ -1564,7 +1746,10 @@ __global__ void __launch_bounds__(block_size) gpuDecodePageData( int t = threadIdx.x; int out_thread0; - if (!setupLocalPageInfo(s, &pages[page_idx], chunks, min_row, num_rows)) { return; } + if (!setupLocalPageInfo(s, &pages[page_idx], chunks, min_row, num_rows, true)) { return; } + + // if we have no rows to do (eg, in a skip_rows/num_rows case) + if (s->num_rows == 0) { return; } if (s->dict_base) { out_thread0 = (s->dict_bits > 0) ? 
64 : 32; @@ -1573,6 +1758,8 @@ __global__ void __launch_bounds__(block_size) gpuDecodePageData( ((s->col.data_type & 7) == BOOLEAN || (s->col.data_type & 7) == BYTE_ARRAY) ? 64 : 32; } + bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; + // skipped_leaf_values will always be 0 for flat hierarchies. uint32_t skipped_leaf_values = s->page.skipped_leaf_values; while (!s->error && (s->input_value_count < s->num_input_values || s->src_pos < s->nz_count)) { @@ -1599,7 +1786,7 @@ __global__ void __launch_bounds__(block_size) gpuDecodePageData( // WARP1: Decode dictionary indices, booleans or string positions if (s->dict_base) { - src_target_pos = gpuDecodeDictionaryIndices(s, src_target_pos, t & 0x1f); + src_target_pos = gpuDecodeDictionaryIndices(s, src_target_pos, t & 0x1f).first; } else if ((s->col.data_type & 7) == BOOLEAN) { src_target_pos = gpuDecodeRleBooleans(s, src_target_pos, t & 0x1f); } else if ((s->col.data_type & 7) == BYTE_ARRAY) { @@ -1625,7 +1812,7 @@ __global__ void __launch_bounds__(block_size) gpuDecodePageData( // - so we will end up ignoring the first two input rows, and input rows 2..n will // get written to the output starting at position 0. // - if (s->col.max_nesting_depth == 1) { dst_pos -= s->first_row; } + if (!has_repetition) { dst_pos -= s->first_row; } // target_pos will always be properly bounded by num_rows, but dst_pos may be negative (values // before first_row) in the flat hierarchy case. 
@@ -1663,7 +1850,12 @@ __global__ void __launch_bounds__(block_size) gpuDecodePageData( } else if (dtype == INT96) { gpuOutputInt96Timestamp(s, val_src_pos, static_cast(dst)); } else if (dtype_len == 8) { - if (s->ts_scale) { + if (s->dtype_len_in == 4) { + // Reading INT32 TIME_MILLIS into 64-bit DURATION_MILLISECONDS + // TIME_MILLIS is the only duration type stored as int32: + // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#deprecated-time-convertedtype + gpuOutputFast(s, val_src_pos, static_cast(dst)); + } else if (s->ts_scale) { gpuOutputInt64Timestamp(s, val_src_pos, static_cast(dst)); } else { gpuOutputFast(s, val_src_pos, static_cast(dst)); @@ -1681,71 +1873,18 @@ __global__ void __launch_bounds__(block_size) gpuDecodePageData( } } -struct chunk_row_output_iter { - PageInfo* p; - using value_type = size_type; - using difference_type = size_type; - using pointer = size_type*; - using reference = size_type&; - using iterator_category = thrust::output_device_iterator_tag; - - __host__ __device__ chunk_row_output_iter operator+(int i) - { - return chunk_row_output_iter{p + i}; - } - - __host__ __device__ void operator++() { p++; } - - __device__ reference operator[](int i) { return p[i].chunk_row; } - __device__ reference operator*() { return p->chunk_row; } - __device__ void operator=(value_type v) { p->chunk_row = v; } -}; - -struct start_offset_output_iterator { - PageInfo* pages; - int* page_indices; - int cur_index; - int src_col_schema; - int nesting_depth; - int empty = 0; - using value_type = size_type; - using difference_type = size_type; - using pointer = size_type*; - using reference = size_type&; - using iterator_category = thrust::output_device_iterator_tag; - - __host__ __device__ start_offset_output_iterator operator+(int i) - { - return start_offset_output_iterator{ - pages, page_indices, cur_index + i, src_col_schema, nesting_depth}; - } - - __host__ __device__ void operator++() { cur_index++; } - - __device__ reference 
operator[](int i) { return dereference(cur_index + i); } - __device__ reference operator*() { return dereference(cur_index); } - - private: - __device__ reference dereference(int index) - { - PageInfo const& p = pages[page_indices[index]]; - if (p.src_col_schema != src_col_schema || p.flags & PAGEINFO_FLAGS_DICTIONARY) { return empty; } - return p.nesting[nesting_depth].page_start_value; - } -}; +} // anonymous namespace /** - * @copydoc cudf::io::parquet::gpu::PreprocessColumnData + * @copydoc cudf::io::parquet::gpu::ComputePageSizes */ -void PreprocessColumnData(hostdevice_vector& pages, - hostdevice_vector const& chunks, - std::vector& input_columns, - std::vector& output_columns, - size_t num_rows, - size_t min_row, - bool uses_custom_row_bounds, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +void ComputePageSizes(hostdevice_vector& pages, + hostdevice_vector const& chunks, + size_t min_row, + size_t num_rows, + bool compute_num_rows, + bool compute_string_sizes, + rmm::cuda_stream_view stream) { dim3 dim_block(block_size, 1); dim3 dim_grid(pages.size(), 1); // 1 threadblock per page @@ -1756,132 +1895,7 @@ void PreprocessColumnData(hostdevice_vector& pages, // If uses_custom_row_bounds is set to true, we have to do a second pass later that "trims" // the starting and ending read values to account for these bounds. gpuComputePageSizes<<>>( - pages.device_ptr(), - chunks, - // if uses_custom_row_bounds is false, include all possible rows. - uses_custom_row_bounds ? min_row : 0, - uses_custom_row_bounds ? 
num_rows : INT_MAX, - !uses_custom_row_bounds); - - // computes: - // PageInfo::chunk_row for all pages - auto key_input = thrust::make_transform_iterator( - pages.device_ptr(), [] __device__(PageInfo const& page) { return page.chunk_idx; }); - auto page_input = thrust::make_transform_iterator( - pages.device_ptr(), [] __device__(PageInfo const& page) { return page.num_rows; }); - thrust::exclusive_scan_by_key(rmm::exec_policy(stream), - key_input, - key_input + pages.size(), - page_input, - chunk_row_output_iter{pages.device_ptr()}); - - // computes: - // PageNestingInfo::size for each level of nesting, for each page, taking row bounds into account. - // PageInfo::skipped_values, which tells us where to start decoding in the input . - // It is only necessary to do this second pass if uses_custom_row_bounds is set (if the user has - // specified artifical bounds). - if (uses_custom_row_bounds) { - gpuComputePageSizes<<>>( - pages.device_ptr(), chunks, min_row, num_rows, true); - } - - // ordering of pages is by input column schema, repeated across row groups. so - // if we had 3 columns, each with 2 pages, and 1 row group, our schema values might look like - // - // 1, 1, 2, 2, 3, 3 - // - // However, if we had more than one row group, the pattern would be - // - // 1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3 - // ^ row group 0 | - // ^ row group 1 - // - // To use exclusive_scan_by_key, the ordering we actually want is - // - // 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3 - // - // We also need to preserve key-relative page ordering, so we need to use a stable sort. 
- rmm::device_uvector page_keys(pages.size(), stream); - rmm::device_uvector page_index(pages.size(), stream); - { - thrust::transform(rmm::exec_policy(stream), - pages.device_ptr(), - pages.device_ptr() + pages.size(), - page_keys.begin(), - [] __device__(PageInfo const& page) { return page.src_col_schema; }); - - thrust::sequence(rmm::exec_policy(stream), page_index.begin(), page_index.end()); - thrust::stable_sort_by_key(rmm::exec_policy(stream), - page_keys.begin(), - page_keys.end(), - page_index.begin(), - thrust::less()); - } - - // compute output column sizes by examining the pages of the -input- columns - for (size_t idx = 0; idx < input_columns.size(); idx++) { - auto const& input_col = input_columns[idx]; - auto src_col_schema = input_col.schema_idx; - size_t max_depth = input_col.nesting_depth(); - - auto* cols = &output_columns; - for (size_t l_idx = 0; l_idx < input_col.nesting_depth(); l_idx++) { - auto& out_buf = (*cols)[input_col.nesting[l_idx]]; - cols = &out_buf.children; - - // size iterator. indexes pages by sorted order - auto size_input = thrust::make_transform_iterator( - page_index.begin(), - [src_col_schema, l_idx, pages = pages.device_ptr()] __device__(int index) { - auto const& page = pages[index]; - if (page.src_col_schema != src_col_schema || page.flags & PAGEINFO_FLAGS_DICTIONARY) { - return 0; - } - return page.nesting[l_idx].size; - }); - - // compute column size. - // for struct columns, higher levels of the output columns are shared between input - // columns. so don't compute any given level more than once. - if (out_buf.size == 0) { - int size = thrust::reduce(rmm::exec_policy(stream), size_input, size_input + pages.size()); - - // Handle a specific corner case. It is possible to construct a parquet file such that - // a column within a row group contains more rows than the row group itself. 
This may be - // invalid, but we have seen instances of this in the wild, including how they were created - // using the apache parquet tools. Normally, the trim pass would handle this case quietly, - // but if we are not running the trim pass (which is most of the time) we need to cap the - // number of rows we will allocate/read from the file with the amount specified in the - // associated row group. This only applies to columns that are not children of lists as - // those may have an arbitrary number of rows in them. - if (!uses_custom_row_bounds && - !(out_buf.user_data & PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT) && - size > static_cast(num_rows)) { - size = static_cast(num_rows); - } - - // if this is a list column add 1 for non-leaf levels for the terminating offset - if (out_buf.type.id() == type_id::LIST && l_idx < max_depth) { size++; } - - // allocate - out_buf.create(size, stream, mr); - } - - // compute per-page start offset - thrust::exclusive_scan_by_key(rmm::exec_policy(stream), - page_keys.begin(), - page_keys.end(), - size_input, - start_offset_output_iterator{pages.device_ptr(), - page_index.begin(), - 0, - static_cast(src_col_schema), - static_cast(l_idx)}); - } - } - - // retrieve pages back - pages.device_to_host(stream); + pages.device_ptr(), chunks, min_row, num_rows, compute_num_rows, compute_string_sizes); } /** @@ -1893,6 +1907,8 @@ void __host__ DecodePageData(hostdevice_vector& pages, size_t min_row, rmm::cuda_stream_view stream) { + CUDF_EXPECTS(pages.size() > 0, "There is no page to decode"); + dim3 dim_block(block_size, 1); dim3 dim_grid(pages.size(), 1); // 1 threadblock per page diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index cdee066a06a..74e98de4100 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -13,7 +13,8 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include "parquet_gpu.hpp" + +#include "parquet_gpu.cuh" #include @@ -61,6 +62,12 @@ constexpr int32_t NO_TRUNC_STATS = 0; // minimum scratch space required for encoding statistics constexpr size_t MIN_STATS_SCRATCH_SIZE = sizeof(__int128_t); +// mask to determine lane id +constexpr uint32_t WARP_MASK = cudf::detail::warp_size - 1; + +// currently 64k - 1 +constexpr uint32_t MAX_GRID_Y_SIZE = (1 << 16) - 1; + struct frag_init_state_s { parquet_column_device_view col; PageFragment frag; @@ -116,82 +123,87 @@ __global__ void __launch_bounds__(block_size) using block_reduce = cub::BlockReduce; __shared__ typename block_reduce::TempStorage reduce_storage; - frag_init_state_s* const s = &state_g; - uint32_t t = threadIdx.x; - int frag_y = blockIdx.y; - auto const physical_type = col_desc[blockIdx.x].physical_type; + frag_init_state_s* const s = &state_g; + uint32_t const t = threadIdx.x; + auto const physical_type = col_desc[blockIdx.x].physical_type; + uint32_t const num_fragments_per_column = frag.size().second; - if (t == 0) s->col = col_desc[blockIdx.x]; + if (t == 0) { s->col = col_desc[blockIdx.x]; } __syncthreads(); - if (!t) { - // Find which partition this fragment came from - auto it = - thrust::upper_bound(thrust::seq, part_frag_offset.begin(), part_frag_offset.end(), frag_y); - int p = it - part_frag_offset.begin() - 1; - int part_end_row = partitions[p].start_row + partitions[p].num_rows; - s->frag.start_row = (frag_y - part_frag_offset[p]) * fragment_size + partitions[p].start_row; - - // frag.num_rows = fragment_size except for the last fragment in partition which can be smaller. - // num_rows is fixed but fragment size could be larger if the data is strings or nested. 
- s->frag.num_rows = min(fragment_size, part_end_row - s->frag.start_row); - s->frag.num_dict_vals = 0; - s->frag.fragment_data_size = 0; - s->frag.dict_data_size = 0; - - s->frag.start_value_idx = row_to_value_idx(s->frag.start_row, s->col); - size_type end_value_idx = row_to_value_idx(s->frag.start_row + s->frag.num_rows, s->col); - s->frag.num_leaf_values = end_value_idx - s->frag.start_value_idx; - - if (s->col.level_offsets != nullptr) { - // For nested schemas, the number of values in a fragment is not directly related to the - // number of encoded data elements or the number of rows. It is simply the number of - // repetition/definition values which together encode validity and nesting information. - size_type first_level_val_idx = s->col.level_offsets[s->frag.start_row]; - size_type last_level_val_idx = s->col.level_offsets[s->frag.start_row + s->frag.num_rows]; - s->frag.num_values = last_level_val_idx - first_level_val_idx; - } else { - s->frag.num_values = s->frag.num_rows; - } - } + auto const leaf_type = s->col.leaf_column->type().id(); auto const dtype_len = physical_type_len(physical_type, leaf_type); - __syncthreads(); - size_type nvals = s->frag.num_leaf_values; - size_type start_value_idx = s->frag.start_value_idx; - - for (uint32_t i = 0; i < nvals; i += block_size) { - uint32_t val_idx = start_value_idx + i + t; - uint32_t is_valid = (i + t < nvals && val_idx < s->col.leaf_column->size()) - ? 
s->col.leaf_column->is_valid(val_idx) - : 0; - uint32_t len; - if (is_valid) { - len = dtype_len; - if (physical_type == BYTE_ARRAY) { - switch (leaf_type) { - case type_id::STRING: { - auto str = s->col.leaf_column->element(val_idx); - len += str.size_bytes(); - } break; - case type_id::LIST: { - auto list_element = - get_element(*s->col.leaf_column, val_idx); - len += list_element.size_bytes(); - } break; - default: CUDF_UNREACHABLE("Unsupported data type for leaf column"); - } + for (uint32_t frag_y = blockIdx.y; frag_y < num_fragments_per_column; frag_y += gridDim.y) { + if (t == 0) { + // Find which partition this fragment came from + auto it = + thrust::upper_bound(thrust::seq, part_frag_offset.begin(), part_frag_offset.end(), frag_y); + int p = it - part_frag_offset.begin() - 1; + int part_end_row = partitions[p].start_row + partitions[p].num_rows; + s->frag.start_row = (frag_y - part_frag_offset[p]) * fragment_size + partitions[p].start_row; + + // frag.num_rows = fragment_size except for the last fragment in partition which can be + // smaller. num_rows is fixed but fragment size could be larger if the data is strings or + // nested. + s->frag.num_rows = min(fragment_size, part_end_row - s->frag.start_row); + s->frag.num_dict_vals = 0; + s->frag.fragment_data_size = 0; + s->frag.dict_data_size = 0; + + s->frag.start_value_idx = row_to_value_idx(s->frag.start_row, s->col); + size_type end_value_idx = row_to_value_idx(s->frag.start_row + s->frag.num_rows, s->col); + s->frag.num_leaf_values = end_value_idx - s->frag.start_value_idx; + + if (s->col.level_offsets != nullptr) { + // For nested schemas, the number of values in a fragment is not directly related to the + // number of encoded data elements or the number of rows. It is simply the number of + // repetition/definition values which together encode validity and nesting information. 
+ size_type first_level_val_idx = s->col.level_offsets[s->frag.start_row]; + size_type last_level_val_idx = s->col.level_offsets[s->frag.start_row + s->frag.num_rows]; + s->frag.num_values = last_level_val_idx - first_level_val_idx; + } else { + s->frag.num_values = s->frag.num_rows; } - } else { - len = 0; } + __syncthreads(); + + size_type nvals = s->frag.num_leaf_values; + size_type start_value_idx = s->frag.start_value_idx; - len = block_reduce(reduce_storage).Sum(len); - if (!t) { s->frag.fragment_data_size += len; } + for (uint32_t i = 0; i < nvals; i += block_size) { + uint32_t val_idx = start_value_idx + i + t; + uint32_t is_valid = (i + t < nvals && val_idx < s->col.leaf_column->size()) + ? s->col.leaf_column->is_valid(val_idx) + : 0; + uint32_t len; + if (is_valid) { + len = dtype_len; + if (physical_type == BYTE_ARRAY) { + switch (leaf_type) { + case type_id::STRING: { + auto str = s->col.leaf_column->element(val_idx); + len += str.size_bytes(); + } break; + case type_id::LIST: { + auto list_element = + get_element(*s->col.leaf_column, val_idx); + len += list_element.size_bytes(); + } break; + default: CUDF_UNREACHABLE("Unsupported data type for leaf column"); + } + } + } else { + len = 0; + } + + len = block_reduce(reduce_storage).Sum(len); + if (t == 0) { s->frag.fragment_data_size += len; } + __syncthreads(); + } __syncthreads(); + if (t == 0) { frag[blockIdx.x][frag_y] = s->frag; } } - __syncthreads(); - if (t == 0) frag[blockIdx.x][blockIdx.y] = s->frag; } // blockDim {128,1,1} @@ -200,21 +212,29 @@ __global__ void __launch_bounds__(128) device_2dspan fragments, device_span col_desc) { - // TODO: why not 1 block per warp? 
- __shared__ __align__(8) statistics_group group_g[4]; - - uint32_t lane_id = threadIdx.x & 0x1f; - uint32_t frag_id = blockIdx.y * 4 + (threadIdx.x >> 5); - uint32_t column_id = blockIdx.x; - auto num_fragments_per_column = fragments.size().second; - statistics_group* const g = &group_g[threadIdx.x >> 5]; - if (!lane_id && frag_id < num_fragments_per_column) { - g->col = &col_desc[column_id]; - g->start_row = fragments[column_id][frag_id].start_value_idx; - g->num_rows = fragments[column_id][frag_id].num_leaf_values; + uint32_t const lane_id = threadIdx.x & WARP_MASK; + uint32_t const column_id = blockIdx.x; + uint32_t const num_fragments_per_column = fragments.size().second; + + uint32_t frag_id = blockIdx.y * 4 + (threadIdx.x / cudf::detail::warp_size); + while (frag_id < num_fragments_per_column) { + if (lane_id == 0) { + statistics_group g; + g.col = &col_desc[column_id]; + g.start_row = fragments[column_id][frag_id].start_value_idx; + g.num_rows = fragments[column_id][frag_id].num_leaf_values; + groups[column_id][frag_id] = g; + } + frag_id += gridDim.y * 4; } - __syncthreads(); - if (frag_id < num_fragments_per_column and lane_id == 0) groups[column_id][frag_id] = *g; +} + +constexpr uint32_t max_RLE_page_size(uint8_t value_bit_width, uint32_t num_values) +{ + if (value_bit_width == 0) return 0; + + // Run length = 4, max(rle/bitpack header) = 5, add one byte per 256 values for overhead + return 4 + 5 + util::div_rounding_up_unsafe(num_values * value_bit_width, 8) + (num_values / 256); } // blockDim {128,1,1} @@ -329,7 +349,7 @@ __global__ void __launch_bounds__(128) __syncwarp(); uint32_t fragment_data_size = (ck_g.use_dictionary) - ? frag_g.num_leaf_values * 2 // Assume worst-case of 2-bytes per dictionary index + ? 
frag_g.num_leaf_values * util::div_rounding_up_unsafe(ck_g.dict_rle_bits, 8) : frag_g.fragment_data_size; // TODO (dm): this convoluted logic to limit page size needs refactoring size_t this_max_page_size = (values_in_page * 2 >= ck_g.num_values) ? 256 * 1024 @@ -343,8 +363,8 @@ __global__ void __launch_bounds__(128) (values_in_page > 0 && (page_size + fragment_data_size > this_max_page_size)) || rows_in_page >= max_page_size_rows) { if (ck_g.use_dictionary) { - page_size = - 1 + 5 + ((values_in_page * ck_g.dict_rle_bits + 7) >> 3) + (values_in_page >> 8); + // Additional byte to store entry bit width + page_size = 1 + max_RLE_page_size(ck_g.dict_rle_bits, values_in_page); } if (!t) { page_g.num_fragments = fragments_in_chunk - page_start; @@ -367,23 +387,13 @@ __global__ void __launch_bounds__(128) if (not comp_page_sizes.empty()) { page_g.compressed_data = ck_g.compressed_bfr + comp_page_offset; } - page_g.start_row = cur_row; - page_g.num_rows = rows_in_page; - page_g.num_leaf_values = leaf_values_in_page; - page_g.num_values = values_in_page; - uint32_t def_level_bits = col_g.num_def_level_bits(); - uint32_t rep_level_bits = col_g.num_rep_level_bits(); - // Run length = 4, max(rle/bitpack header) = 5, add one byte per 256 values for overhead - // TODO (dm): Improve readability of these calculations. - uint32_t def_level_size = - (def_level_bits != 0) - ? 4 + 5 + ((def_level_bits * page_g.num_values + 7) >> 3) + (page_g.num_values >> 8) - : 0; - uint32_t rep_level_size = - (rep_level_bits != 0) - ? 
4 + 5 + ((rep_level_bits * page_g.num_values + 7) >> 3) + (page_g.num_values >> 8) - : 0; - page_g.max_data_size = page_size + def_level_size + rep_level_size; + page_g.start_row = cur_row; + page_g.num_rows = rows_in_page; + page_g.num_leaf_values = leaf_values_in_page; + page_g.num_values = values_in_page; + auto const def_level_size = max_RLE_page_size(col_g.num_def_level_bits(), values_in_page); + auto const rep_level_size = max_RLE_page_size(col_g.num_rep_level_bits(), values_in_page); + page_g.max_data_size = page_size + def_level_size + rep_level_size; pagestats_g.start_chunk = ck_g.first_fragment + page_start; pagestats_g.num_chunks = page_g.num_fragments; @@ -1100,15 +1110,20 @@ __global__ void __launch_bounds__(128, 8) if (t == 0) { s->cur = dst + total_len; } if (is_valid) { switch (physical_type) { - case INT32: + case INT32: [[fallthrough]]; case FLOAT: { - int32_t v; - if (dtype_len_in == 4) - v = s->col.leaf_column->element(val_idx); - else if (dtype_len_in == 2) - v = s->col.leaf_column->element(val_idx); - else - v = s->col.leaf_column->element(val_idx); + auto const v = [dtype_len = dtype_len_in, + idx = val_idx, + col = s->col.leaf_column, + scale = s->col.ts_scale == 0 ? 
1 : s->col.ts_scale]() -> int32_t { + switch (dtype_len) { + case 8: return col->element(idx) * scale; + case 4: return col->element(idx) * scale; + case 2: return col->element(idx) * scale; + default: return col->element(idx) * scale; + } + }(); + dst[pos + 0] = v; dst[pos + 1] = v >> 8; dst[pos + 2] = v >> 16; @@ -2017,9 +2032,10 @@ void InitPageFragments(device_2dspan frag, uint32_t fragment_size, rmm::cuda_stream_view stream) { - auto num_columns = frag.size().first; - auto num_fragments_per_column = frag.size().second; - dim3 dim_grid(num_columns, num_fragments_per_column); // 1 threadblock per fragment + auto const num_columns = frag.size().first; + auto const num_fragments_per_column = frag.size().second; + auto const grid_y = std::min(static_cast(num_fragments_per_column), MAX_GRID_Y_SIZE); + dim3 const dim_grid(num_columns, grid_y); // 1 threadblock per fragment gpuInitPageFragments<512><<>>( frag, col_desc, partitions, part_frag_offset, fragment_size); } @@ -2031,8 +2047,10 @@ void InitFragmentStatistics(device_2dspan groups, { int const num_columns = col_desc.size(); int const num_fragments_per_column = fragments.size().second; - auto grid_y = util::div_rounding_up_safe(num_fragments_per_column, 128 / cudf::detail::warp_size); - dim3 dim_grid(num_columns, grid_y); // 1 warp per fragment + auto const y_dim = + util::div_rounding_up_safe(num_fragments_per_column, 128 / cudf::detail::warp_size); + auto const grid_y = std::min(static_cast(y_dim), MAX_GRID_Y_SIZE); + dim3 const dim_grid(num_columns, grid_y); // 1 warp per fragment gpuInitFragmentStats<<>>(groups, fragments, col_desc); } diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index e7856a871c1..ffb4cb60a20 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -307,10 +307,11 @@ struct gpuParseDataPageHeaderV2 { __device__ bool operator()(byte_stream_s* bs) { auto op = thrust::make_tuple(ParquetFieldInt32(1, bs->page.num_input_values), + 
ParquetFieldInt32(2, bs->page.num_nulls), ParquetFieldInt32(3, bs->page.num_rows), ParquetFieldEnum(4, bs->page.encoding), - ParquetFieldEnum(5, bs->page.definition_level_encoding), - ParquetFieldEnum(6, bs->page.repetition_level_encoding)); + ParquetFieldInt32(5, bs->page.def_lvl_bytes), + ParquetFieldInt32(6, bs->page.rep_lvl_bytes)); return parse_header(op, bs); } }; @@ -366,6 +367,7 @@ __global__ void __launch_bounds__(128) // definition levels bs->page.chunk_row = 0; bs->page.num_rows = 0; + bs->page.str_bytes = 0; } num_values = bs->ck.num_values; page_info = bs->ck.page_info; @@ -382,18 +384,30 @@ __global__ void __launch_bounds__(128) // definition levels bs->page.chunk_row += bs->page.num_rows; bs->page.num_rows = 0; + // zero out V2 info + bs->page.num_nulls = 0; + bs->page.def_lvl_bytes = 0; + bs->page.rep_lvl_bytes = 0; if (parse_page_header(bs) && bs->page.compressed_page_size >= 0) { switch (bs->page_type) { case PageType::DATA_PAGE: + index_out = num_dict_pages + data_page_count; + data_page_count++; + bs->page.flags = 0; // this computation is only valid for flat schemas. for nested schemas, // they will be recomputed in the preprocess step by examining repetition and // definition levels bs->page.num_rows = bs->page.num_input_values; + values_found += bs->page.num_input_values; + break; case PageType::DATA_PAGE_V2: index_out = num_dict_pages + data_page_count; data_page_count++; bs->page.flags = 0; values_found += bs->page.num_input_values; + // V2 only uses RLE, so it was removed from the header + bs->page.definition_level_encoding = Encoding::RLE; + bs->page.repetition_level_encoding = Encoding::RLE; break; case PageType::DICTIONARY_PAGE: index_out = dictionary_page_count; diff --git a/cpp/src/io/parquet/parquet_gpu.cuh b/cpp/src/io/parquet/parquet_gpu.cuh new file mode 100644 index 00000000000..793573b465e --- /dev/null +++ b/cpp/src/io/parquet/parquet_gpu.cuh @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "parquet_gpu.hpp" + +#include +#include + +#include + +namespace cudf::io::parquet::gpu { + +auto constexpr KEY_SENTINEL = size_type{-1}; +auto constexpr VALUE_SENTINEL = size_type{-1}; + +using map_type = cuco::static_map; + +/** + * @brief The alias of `map_type::pair_atomic_type` class. + * + * Declare this struct by trivial subclassing instead of type aliasing so we can have forward + * declaration of this struct somewhere else. + */ +struct slot_type : public map_type::pair_atomic_type { +}; + +/** + * @brief Return the byte length of parquet dtypes that are physically represented by INT32 + */ +inline uint32_t __device__ int32_logical_len(type_id id) +{ + switch (id) { + case cudf::type_id::INT8: [[fallthrough]]; + case cudf::type_id::UINT8: return 1; + case cudf::type_id::INT16: [[fallthrough]]; + case cudf::type_id::UINT16: return 2; + case cudf::type_id::DURATION_SECONDS: [[fallthrough]]; + case cudf::type_id::DURATION_MILLISECONDS: return 8; + default: return 4; + } +} + +/** + * @brief Translate the row index of a parent column_device_view into the index of the first value + * in the leaf child. + * Only works in the context of parquet writer where struct columns are previously modified s.t. + * they only have one immediate child. 
+ */ +inline size_type __device__ row_to_value_idx(size_type idx, + parquet_column_device_view const& parquet_col) +{ + // with a byte array, we can't go all the way down to the leaf node, but instead we want to leave + // the size at the parent level because we are writing out parent row byte arrays. + auto col = *parquet_col.parent_column; + while (col.type().id() == type_id::LIST or col.type().id() == type_id::STRUCT) { + if (col.type().id() == type_id::STRUCT) { + idx += col.offset(); + col = col.child(0); + } else { + auto list_col = cudf::detail::lists_column_device_view(col); + auto child = list_col.child(); + if (parquet_col.output_as_byte_array && child.type().id() == type_id::UINT8) { break; } + idx = list_col.offset_at(idx); + col = child; + } + } + return idx; +} + +} // namespace cudf::io::parquet::gpu diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 8f4cd5c6f3b..ccf4b056ae8 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -23,14 +23,10 @@ #include "io/utilities/column_buffer.hpp" #include "io/utilities/hostdevice_vector.hpp" -#include -#include -#include +#include #include #include -#include - #include #include #include @@ -39,9 +35,7 @@ #include -namespace cudf { -namespace io { -namespace parquet { +namespace cudf::io::parquet { using cudf::io::detail::string_index_pair; @@ -57,19 +51,21 @@ constexpr size_type MAX_DICT_SIZE = (1 << MAX_DICT_BITS) - 1; struct input_column_info { int schema_idx; std::string name; + bool has_repetition; // size == nesting depth. the associated real output // buffer index in the dest column for each level of nesting. 
std::vector nesting; + + input_column_info(int _schema_idx, std::string _name, bool _has_repetition) + : schema_idx(_schema_idx), name(_name), has_repetition(_has_repetition) + { + } + auto nesting_depth() const { return nesting.size(); } }; namespace gpu { -auto constexpr KEY_SENTINEL = size_type{-1}; -auto constexpr VALUE_SENTINEL = size_type{-1}; -using map_type = cuco::static_map; -using slot_type = map_type::pair_atomic_type; - /** * @brief Enums for the flags in the page header */ @@ -99,9 +95,13 @@ struct PageNestingInfo { // set at initialization int32_t max_def_level; int32_t max_rep_level; + cudf::type_id type; // type of the corresponding cudf output column + bool nullable; // set during preprocessing - int32_t size; // this page/nesting-level's size contribution to the output column + int32_t size; // this page/nesting-level's row count contribution to the output column, if fully + // decoded + int32_t batch_size; // the size of the page for this batch int32_t page_start_value; // absolute output start index in output column data // set during data decoding @@ -121,6 +121,10 @@ struct PageInfo { // decompression int32_t compressed_page_size; // compressed data size in bytes int32_t uncompressed_page_size; // uncompressed data size in bytes + // for V2 pages, the def and rep level data is not compressed, and lacks the 4-byte length + // indicator. instead the lengths for these are stored in the header. + int32_t def_lvl_bytes; // length of the definition levels (V2 header) + int32_t rep_lvl_bytes; // length of the repetition levels (V2 header) // Number of values in this data page or dictionary. // Important : the # of input values does not necessarily // correspond to the number of rows in the output. 
It just reflects the number @@ -131,6 +135,7 @@ struct PageInfo { int32_t num_input_values; int32_t chunk_row; // starting row of this page relative to the start of the chunk int32_t num_rows; // number of rows in this page + int32_t num_nulls; // number of null values (V2 header) int32_t chunk_idx; // column chunk this page belongs to int32_t src_col_schema; // schema index of this column uint8_t flags; // PAGEINFO_FLAGS_XXX @@ -150,6 +155,9 @@ struct PageInfo { int skipped_values; // # of values skipped in the actual data stream. int skipped_leaf_values; + // for string columns only, the size of all the chars in the string for + // this page. only valid/computed during the base preprocess pass + int32_t str_bytes; // nesting information (input/output) for each page int num_nesting_levels; @@ -235,6 +243,34 @@ struct ColumnChunkDesc { int32_t src_col_schema; // my schema index in the file }; +/** + * @brief Struct to store raw/intermediate file data before parsing. + */ +struct file_intermediate_data { + std::vector> raw_page_data; + rmm::device_buffer decomp_page_data; + hostdevice_vector chunks{}; + hostdevice_vector pages_info{}; + hostdevice_vector page_nesting_info{}; +}; + +/** + * @brief Struct to store intermediate page data for parsing each chunk of rows in chunked reading. + */ +struct chunk_intermediate_data { + rmm::device_uvector page_keys{0, rmm::cuda_stream_default}; + rmm::device_uvector page_index{0, rmm::cuda_stream_default}; + rmm::device_uvector str_dict_index{0, rmm::cuda_stream_default}; +}; + +/** + * @brief Struct to identify the reading row range for each chunk of rows in chunked reading.
+ */ +struct chunk_read_info { + size_t skip_rows; + size_t num_rows; +}; + /** * @brief Struct describing an encoder column */ @@ -281,51 +317,8 @@ struct PageFragment { constexpr unsigned int kDictHashBits = 16; constexpr size_t kDictScratchSize = (1 << kDictHashBits) * sizeof(uint32_t); -/** - * @brief Return the byte length of parquet dtypes that are physically represented by INT32 - */ -inline uint32_t __device__ int32_logical_len(type_id id) -{ - switch (id) { - case cudf::type_id::INT8: - case cudf::type_id::UINT8: return 1; - case cudf::type_id::INT16: - case cudf::type_id::UINT16: return 2; - default: return 4; - } -} - -/** - * @brief Translate the row index of a parent column_device_view into the index of the first value - * in the leaf child. - * Only works in the context of parquet writer where struct columns are previously modified s.t. - * they only have one immediate child. - */ -inline size_type __device__ row_to_value_idx(size_type idx, - parquet_column_device_view const& parquet_col) -{ - // with a byte array, we can't go all the way down to the leaf node, but instead we want to leave - // the size at the parent level because we are writing out parent row byte arrays. 
- auto col = *parquet_col.parent_column; - while (col.type().id() == type_id::LIST or col.type().id() == type_id::STRUCT) { - if (col.type().id() == type_id::STRUCT) { - idx += col.offset(); - col = col.child(0); - } else { - auto list_col = cudf::detail::lists_column_device_view(col); - auto child = list_col.child(); - if (parquet_col.output_as_byte_array && - (child.type().id() == type_id::INT8 || child.type().id() == type_id::UINT8)) { - break; - } - idx = list_col.offset_at(idx); - col = child; - } - } - return idx; -} - struct EncPage; +struct slot_type; /** * @brief Struct describing an encoder column chunk @@ -408,35 +401,35 @@ void BuildStringDictionaryIndex(ColumnChunkDesc* chunks, rmm::cuda_stream_view stream); /** - * @brief Preprocess column information for nested schemas. + * @brief Compute page output size information. + * + * When dealing with nested hierarchies (those that contain lists), or when doing a chunked + * read, we need to obtain more information up front than we have with just the row counts. * - * There are several pieces of information we can't compute directly from row counts in - * the parquet headers when dealing with nested schemas. - * - The total sizes of all output columns at all nesting levels - * - The starting output buffer offset for each page, for each nesting level - * For flat schemas, these values are computed during header decoding (see gpuDecodePageHeaders) + * - We need to determine the sizes of each output cudf column per page + * - We need to determine information about where to start decoding the value stream + * if we are using custom user bounds (skip_rows / num_rows) + * - We need to determine actual number of top level rows per page + * - If we are doing a chunked read, we need to determine the total string size per page * - * Note : this function is where output device memory is allocated for nested columns. 
* * @param pages All pages to be decoded * @param chunks All chunks to be decoded - * @param input_columns Input column information - * @param output_columns Output column information * @param num_rows Maximum number of rows to read * @param min_rows crop all rows below min_row - * @param uses_custom_row_bounds Whether or not num_rows and min_rows represents user-specific - * bounds - * @param stream Cuda stream + * @param compute_num_rows If set to true, the num_rows field in PageInfo will be + * computed + * @param compute_string_sizes If set to true, the str_bytes field in PageInfo will + * be computed + * @param stream CUDA stream to use, default 0 */ -void PreprocessColumnData(hostdevice_vector& pages, - hostdevice_vector const& chunks, - std::vector& input_columns, - std::vector& output_columns, - size_t num_rows, - size_t min_row, - bool uses_custom_row_bounds, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); +void ComputePageSizes(hostdevice_vector& pages, + hostdevice_vector const& chunks, + size_t num_rows, + size_t min_row, + bool compute_num_rows, + bool compute_string_sizes, + rmm::cuda_stream_view stream); /** * @brief Launches kernel for reading the column data stored in the pages @@ -619,6 +612,4 @@ void EncodeColumnIndexes(device_span chunks, rmm::cuda_stream_view stream); } // namespace gpu -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet diff --git a/cpp/src/io/parquet/reader.cpp b/cpp/src/io/parquet/reader.cpp new file mode 100644 index 00000000000..1321e8073d7 --- /dev/null +++ b/cpp/src/io/parquet/reader.cpp @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "reader_impl.hpp" + +namespace cudf::io::detail::parquet { + +reader::reader() = default; + +reader::reader(std::vector>&& sources, + parquet_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : _impl(std::make_unique(std::move(sources), options, stream, mr)) +{ +} + +reader::~reader() = default; + +table_with_metadata reader::read(parquet_reader_options const& options) +{ + // if the user has specified custom row bounds + bool const uses_custom_row_bounds = options.get_num_rows() >= 0 || options.get_skip_rows() != 0; + return _impl->read(options.get_skip_rows(), + options.get_num_rows(), + uses_custom_row_bounds, + options.get_row_groups()); +} + +chunked_reader::chunked_reader(std::size_t chunk_read_limit, + std::vector>&& sources, + parquet_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + _impl = std::make_unique(chunk_read_limit, std::move(sources), options, stream, mr); +} + +chunked_reader::~chunked_reader() = default; + +bool chunked_reader::has_next() const { return _impl->has_next(); } + +table_with_metadata chunked_reader::read_chunk() const { return _impl->read_chunk(); } + +} // namespace cudf::io::detail::parquet diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp new file mode 100644 index 00000000000..84d8cfc273f --- /dev/null +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -0,0 +1,370 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "reader_impl.hpp" + +#include + +#include + +namespace cudf::io::detail::parquet { + +void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) +{ + auto& chunks = _file_itm_data.chunks; + auto& pages = _file_itm_data.pages_info; + auto& page_nesting = _file_itm_data.page_nesting_info; + + // Should not reach here if there is no page data. + CUDF_EXPECTS(pages.size() > 0, "There is no page to decode"); + + size_t const sum_max_depths = std::accumulate( + chunks.begin(), chunks.end(), 0, [&](size_t cursum, gpu::ColumnChunkDesc const& chunk) { + return cursum + _metadata->get_output_nesting_depth(chunk.src_col_schema); + }); + + // In order to reduce the number of allocations of hostdevice_vector, we allocate a single vector + // to store all per-chunk pointers to nested data/nullmask. `chunk_offsets[i]` will store the + // offset into `chunk_nested_data`/`chunk_nested_valids` for the array of pointers for chunk `i` + auto chunk_nested_valids = hostdevice_vector(sum_max_depths, _stream); + auto chunk_nested_data = hostdevice_vector(sum_max_depths, _stream); + auto chunk_offsets = std::vector(); + + // Update chunks with pointers to column data. 
+ for (size_t c = 0, page_count = 0, chunk_off = 0; c < chunks.size(); c++) { + input_column_info const& input_col = _input_columns[chunks[c].src_col_index]; + CUDF_EXPECTS(input_col.schema_idx == chunks[c].src_col_schema, + "Column/page schema index mismatch"); + + size_t max_depth = _metadata->get_output_nesting_depth(chunks[c].src_col_schema); + chunk_offsets.push_back(chunk_off); + + // get a slice of size `nesting depth` from `chunk_nested_valids` to store an array of pointers + // to validity data + auto valids = chunk_nested_valids.host_ptr(chunk_off); + chunks[c].valid_map_base = chunk_nested_valids.device_ptr(chunk_off); + + // get a slice of size `nesting depth` from `chunk_nested_data` to store an array of pointers to + // out data + auto data = chunk_nested_data.host_ptr(chunk_off); + chunks[c].column_data_base = chunk_nested_data.device_ptr(chunk_off); + + chunk_off += max_depth; + + // fill in the arrays on the host. there are some important considerations to + // take into account here for nested columns. specifically, with structs + // there is sharing of output buffers between input columns. consider this schema + // + // required group field_id=1 name { + // required binary field_id=2 firstname (String); + // required binary field_id=3 middlename (String); + // required binary field_id=4 lastname (String); + // } + // + // there are 3 input columns of data here (firstname, middlename, lastname), but + // only 1 output column (name). The structure of the output column buffers looks like + // the schema itself + // + // struct (name) + // string (firstname) + // string (middlename) + // string (lastname) + // + // The struct column can contain validity information. the problem is, the decode + // step for the input columns will all attempt to decode this validity information + // because each one has its own copy of the repetition/definition levels.
but + // since this is all happening in parallel it would mean multiple blocks would + // be stomping all over the same memory randomly. to work around this, we set + // things up so that only 1 child of any given nesting level fills in the + // data (offsets in the case of lists) or validity information for the higher + // levels of the hierarchy that are shared. In this case, it would mean we + // would just choose firstname to be the one that decodes the validity for name. + // + // we do this by only handing out the pointers to the first child we come across. + // + auto* cols = &_output_buffers; + for (size_t idx = 0; idx < max_depth; idx++) { + auto& out_buf = (*cols)[input_col.nesting[idx]]; + cols = &out_buf.children; + + int owning_schema = out_buf.user_data & PARQUET_COLUMN_BUFFER_SCHEMA_MASK; + if (owning_schema == 0 || owning_schema == input_col.schema_idx) { + valids[idx] = out_buf.null_mask(); + data[idx] = out_buf.data(); + out_buf.user_data |= + static_cast(input_col.schema_idx) & PARQUET_COLUMN_BUFFER_SCHEMA_MASK; + } else { + valids[idx] = nullptr; + data[idx] = nullptr; + } + } + + // column_data_base will always point to leaf data, even for nested types. + page_count += chunks[c].max_num_pages; + } + + chunks.host_to_device(_stream); + chunk_nested_valids.host_to_device(_stream); + chunk_nested_data.host_to_device(_stream); + + gpu::DecodePageData(pages, chunks, num_rows, skip_rows, _stream); + + pages.device_to_host(_stream); + page_nesting.device_to_host(_stream); + _stream.synchronize(); + + // for list columns, add the final offset to every offset buffer. + // TODO : make this happen in more efficiently. Maybe use thrust::for_each + // on each buffer. + // Note : the reason we are doing this here instead of in the decode kernel is + // that it is difficult/impossible for a given page to know that it is writing the very + // last value that should then be followed by a terminator (because rows can span + // page boundaries). 
+ for (size_t idx = 0; idx < _input_columns.size(); idx++) { + input_column_info const& input_col = _input_columns[idx]; + + auto* cols = &_output_buffers; + for (size_t l_idx = 0; l_idx < input_col.nesting_depth(); l_idx++) { + auto& out_buf = (*cols)[input_col.nesting[l_idx]]; + cols = &out_buf.children; + + if (out_buf.type.id() != type_id::LIST || + (out_buf.user_data & PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED)) { + continue; + } + CUDF_EXPECTS(l_idx < input_col.nesting_depth() - 1, "Encountered a leaf list column"); + auto& child = (*cols)[input_col.nesting[l_idx + 1]]; + + // the final offset for a list at level N is the size of it's child + int offset = child.type.id() == type_id::LIST ? child.size - 1 : child.size; + cudaMemcpyAsync(static_cast(out_buf.data()) + (out_buf.size - 1), + &offset, + sizeof(offset), + cudaMemcpyHostToDevice, + _stream.value()); + out_buf.user_data |= PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED; + } + } + + // update null counts in the final column buffers + for (size_t idx = 0; idx < pages.size(); idx++) { + gpu::PageInfo* pi = &pages[idx]; + if (pi->flags & gpu::PAGEINFO_FLAGS_DICTIONARY) { continue; } + gpu::ColumnChunkDesc* col = &chunks[pi->chunk_idx]; + input_column_info const& input_col = _input_columns[col->src_col_index]; + + int index = pi->nesting - page_nesting.device_ptr(); + gpu::PageNestingInfo* pni = &page_nesting[index]; + + auto* cols = &_output_buffers; + for (size_t l_idx = 0; l_idx < input_col.nesting_depth(); l_idx++) { + auto& out_buf = (*cols)[input_col.nesting[l_idx]]; + cols = &out_buf.children; + + // if I wasn't the one who wrote out the validity bits, skip it + if (chunk_nested_valids.host_ptr(chunk_offsets[pi->chunk_idx])[l_idx] == nullptr) { + continue; + } + out_buf.null_count() += pni[l_idx].null_count; + } + } + + _stream.synchronize(); +} + +reader::impl::impl(std::vector>&& sources, + parquet_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + 
: impl(0 /*chunk_read_limit*/, + std::forward>>(sources), + options, + stream, + mr) +{ +} + +reader::impl::impl(std::size_t chunk_read_limit, + std::vector>&& sources, + parquet_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : _stream{stream}, _mr{mr}, _sources{std::move(sources)}, _chunk_read_limit{chunk_read_limit} +{ + // Open and parse the source dataset metadata + _metadata = std::make_unique(_sources); + + // Override output timestamp resolution if requested + if (options.get_timestamp_type().id() != type_id::EMPTY) { + _timestamp_type = options.get_timestamp_type(); + } + + // Strings may be returned as either string or categorical columns + _strings_to_categorical = options.is_enabled_convert_strings_to_categories(); + + // Binary columns can be read as binary or strings + _reader_column_schema = options.get_column_schema(); + + // Select only columns required by the options + std::tie(_input_columns, _output_buffers, _output_column_schemas) = + _metadata->select_columns(options.get_columns(), + options.is_enabled_use_pandas_metadata(), + _strings_to_categorical, + _timestamp_type.id()); + + // Save the states of the output buffers for reuse in `chunk_read()`. + // Don't need to do it if we read the file all at once. 
+ if (_chunk_read_limit > 0) { + for (auto const& buff : _output_buffers) { + _output_buffers_template.emplace_back(column_buffer::empty_like(buff)); + } + } +} + +void reader::impl::prepare_data(size_type skip_rows, + size_type num_rows, + bool uses_custom_row_bounds, + host_span const> row_group_indices) +{ + if (_file_preprocessed) { return; } + + const auto [skip_rows_corrected, num_rows_corrected, row_groups_info] = + _metadata->select_row_groups(row_group_indices, skip_rows, num_rows); + + if (num_rows_corrected > 0 && row_groups_info.size() != 0 && _input_columns.size() != 0) { + load_and_decompress_data(row_groups_info, num_rows_corrected); + preprocess_pages( + skip_rows_corrected, num_rows_corrected, uses_custom_row_bounds, _chunk_read_limit); + + if (_chunk_read_limit == 0) { // read the whole file at once + CUDF_EXPECTS(_chunk_read_info.size() == 1, + "Reading the whole file should yield only one chunk."); + } + } + + _file_preprocessed = true; +} + +table_with_metadata reader::impl::read_chunk_internal(bool uses_custom_row_bounds) +{ + // If `_output_metadata` has been constructed, just copy it over. + auto out_metadata = _output_metadata ? table_metadata{*_output_metadata} : table_metadata{}; + + // output cudf columns as determined by the top level schema + auto out_columns = std::vector>{}; + out_columns.reserve(_output_buffers.size()); + + if (!has_next() || _chunk_read_info.size() == 0) { + return finalize_output(out_metadata, out_columns); + } + + auto const& read_info = _chunk_read_info[_current_read_chunk++]; + + // Allocate memory buffers for the output columns. + allocate_columns(read_info.skip_rows, read_info.num_rows, uses_custom_row_bounds); + + // Parse data into the output buffers. + decode_page_data(read_info.skip_rows, read_info.num_rows); + + // Create the final output cudf columns. + for (size_t i = 0; i < _output_buffers.size(); ++i) { + auto const metadata = _reader_column_schema.has_value() + ? 
std::make_optional((*_reader_column_schema)[i]) + : std::nullopt; + // Only construct `out_metadata` if `_output_metadata` has not been cached. + if (!_output_metadata) { + column_name_info& col_name = out_metadata.schema_info.emplace_back(""); + out_columns.emplace_back(make_column(_output_buffers[i], &col_name, metadata, _stream, _mr)); + } else { + out_columns.emplace_back(make_column(_output_buffers[i], nullptr, metadata, _stream, _mr)); + } + } + + // Add empty columns if needed. + return finalize_output(out_metadata, out_columns); +} + +table_with_metadata reader::impl::finalize_output(table_metadata& out_metadata, + std::vector>& out_columns) +{ + // Create empty columns as needed (this can happen if we've ended up with no actual data to read) + for (size_t i = out_columns.size(); i < _output_buffers.size(); ++i) { + if (!_output_metadata) { + column_name_info& col_name = out_metadata.schema_info.emplace_back(""); + out_columns.emplace_back(io::detail::empty_like(_output_buffers[i], &col_name, _stream, _mr)); + } else { + out_columns.emplace_back(io::detail::empty_like(_output_buffers[i], nullptr, _stream, _mr)); + } + } + + if (!_output_metadata) { + // Return column names (must match order of returned columns) + out_metadata.column_names.resize(_output_buffers.size()); + for (size_t i = 0; i < _output_column_schemas.size(); i++) { + auto const& schema = _metadata->get_schema(_output_column_schemas[i]); + out_metadata.column_names[i] = schema.name; + } + + // Return user metadata + out_metadata.per_file_user_data = _metadata->get_key_value_metadata(); + out_metadata.user_data = {out_metadata.per_file_user_data[0].begin(), + out_metadata.per_file_user_data[0].end()}; + + // Finally, save the output table metadata into `_output_metadata` for reuse next time. + _output_metadata = std::make_unique(out_metadata); + } + + return {std::make_unique
(std::move(out_columns)), std::move(out_metadata)}; +} + +table_with_metadata reader::impl::read(size_type skip_rows, + size_type num_rows, + bool uses_custom_row_bounds, + host_span const> row_group_indices) +{ + CUDF_EXPECTS(_chunk_read_limit == 0, "Reading the whole file must not have non-zero byte_limit."); + prepare_data(skip_rows, num_rows, uses_custom_row_bounds, row_group_indices); + return read_chunk_internal(uses_custom_row_bounds); +} + +table_with_metadata reader::impl::read_chunk() +{ + // Reset the output buffers to their original states (right after reader construction). + // Don't need to do it if we read the file all at once. + if (_chunk_read_limit > 0) { + _output_buffers.resize(0); + for (auto const& buff : _output_buffers_template) { + _output_buffers.emplace_back(column_buffer::empty_like(buff)); + } + } + + prepare_data(0 /*skip_rows*/, + -1 /*num_rows, `-1` means unlimited*/, + true /*uses_custom_row_bounds*/, + {} /*row_group_indices, empty means read all row groups*/); + return read_chunk_internal(true); +} + +bool reader::impl::has_next() +{ + prepare_data(0 /*skip_rows*/, + -1 /*num_rows, `-1` means unlimited*/, + true /*uses_custom_row_bounds*/, + {} /*row_group_indices, empty means read all row groups*/); + return _current_read_chunk < _chunk_read_info.size(); +} + +} // namespace cudf::io::detail::parquet diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu deleted file mode 100644 index 59bef6f5600..00000000000 --- a/cpp/src/io/parquet/reader_impl.cu +++ /dev/null @@ -1,1823 +0,0 @@ -/* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * @file reader_impl.cu - * @brief cuDF-IO Parquet reader class implementation - */ - -#include "reader_impl.hpp" - -#include "compact_protocol_reader.hpp" - -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -namespace cudf { -namespace io { -namespace detail { -namespace parquet { -// Import functionality that's independent of legacy code -using namespace cudf::io::parquet; -using namespace cudf::io; - -namespace { - -parquet::ConvertedType logical_type_to_converted_type(parquet::LogicalType const& logical) -{ - if (logical.isset.STRING) { - return parquet::UTF8; - } else if (logical.isset.MAP) { - return parquet::MAP; - } else if (logical.isset.LIST) { - return parquet::LIST; - } else if (logical.isset.ENUM) { - return parquet::ENUM; - } else if (logical.isset.DECIMAL) { - return parquet::DECIMAL; // TODO set decimal values - } else if (logical.isset.DATE) { - return parquet::DATE; - } else if (logical.isset.TIME) { - if (logical.TIME.unit.isset.MILLIS) - return parquet::TIME_MILLIS; - else if (logical.TIME.unit.isset.MICROS) - return parquet::TIME_MICROS; - } else if (logical.isset.TIMESTAMP) { - if (logical.TIMESTAMP.unit.isset.MILLIS) - return parquet::TIMESTAMP_MILLIS; - else if (logical.TIMESTAMP.unit.isset.MICROS) - return parquet::TIMESTAMP_MICROS; - } else if (logical.isset.INTEGER) { - switch (logical.INTEGER.bitWidth) { - case 8: return 
logical.INTEGER.isSigned ? INT_8 : UINT_8; - case 16: return logical.INTEGER.isSigned ? INT_16 : UINT_16; - case 32: return logical.INTEGER.isSigned ? INT_32 : UINT_32; - case 64: return logical.INTEGER.isSigned ? INT_64 : UINT_64; - default: break; - } - } else if (logical.isset.UNKNOWN) { - return parquet::NA; - } else if (logical.isset.JSON) { - return parquet::JSON; - } else if (logical.isset.BSON) { - return parquet::BSON; - } - return parquet::UNKNOWN; -} - -/** - * @brief Function that translates Parquet datatype to cuDF type enum - */ -type_id to_type_id(SchemaElement const& schema, - bool strings_to_categorical, - type_id timestamp_type_id) -{ - parquet::Type const physical = schema.type; - parquet::LogicalType const logical_type = schema.logical_type; - parquet::ConvertedType converted_type = schema.converted_type; - int32_t decimal_scale = schema.decimal_scale; - - // Logical type used for actual data interpretation; the legacy converted type - // is superceded by 'logical' type whenever available. - auto const inferred_converted_type = logical_type_to_converted_type(logical_type); - if (inferred_converted_type != parquet::UNKNOWN) converted_type = inferred_converted_type; - if (inferred_converted_type == parquet::DECIMAL && decimal_scale == 0) - decimal_scale = schema.logical_type.DECIMAL.scale; - - switch (converted_type) { - case parquet::UINT_8: return type_id::UINT8; - case parquet::INT_8: return type_id::INT8; - case parquet::UINT_16: return type_id::UINT16; - case parquet::INT_16: return type_id::INT16; - case parquet::UINT_32: return type_id::UINT32; - case parquet::UINT_64: return type_id::UINT64; - case parquet::DATE: return type_id::TIMESTAMP_DAYS; - case parquet::TIME_MILLIS: - return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id - : type_id::DURATION_MILLISECONDS; - case parquet::TIME_MICROS: - return (timestamp_type_id != type_id::EMPTY) ? 
timestamp_type_id - : type_id::DURATION_MICROSECONDS; - case parquet::TIMESTAMP_MILLIS: - return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id - : type_id::TIMESTAMP_MILLISECONDS; - case parquet::TIMESTAMP_MICROS: - return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id - : type_id::TIMESTAMP_MICROSECONDS; - case parquet::DECIMAL: - if (physical == parquet::INT32) { return type_id::DECIMAL32; } - if (physical == parquet::INT64) { return type_id::DECIMAL64; } - if (physical == parquet::FIXED_LEN_BYTE_ARRAY) { - if (schema.type_length <= static_cast(sizeof(int32_t))) { - return type_id::DECIMAL32; - } - if (schema.type_length <= static_cast(sizeof(int64_t))) { - return type_id::DECIMAL64; - } - if (schema.type_length <= static_cast(sizeof(__int128_t))) { - return type_id::DECIMAL128; - } - } - CUDF_FAIL("Invalid representation of decimal type"); - break; - - // maps are just List>. - case parquet::MAP: - case parquet::LIST: return type_id::LIST; - case parquet::NA: return type_id::STRING; - // return type_id::EMPTY; //TODO(kn): enable after Null/Empty column support - default: break; - } - - if (inferred_converted_type == parquet::UNKNOWN and physical == parquet::INT64 and - logical_type.TIMESTAMP.unit.isset.NANOS) { - return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id - : type_id::TIMESTAMP_NANOSECONDS; - } - - // is it simply a struct? - if (schema.is_struct()) { return type_id::STRUCT; } - - // Physical storage type supported by Parquet; controls the on-disk storage - // format in combination with the encoding type. - switch (physical) { - case parquet::BOOLEAN: return type_id::BOOL8; - case parquet::INT32: return type_id::INT32; - case parquet::INT64: return type_id::INT64; - case parquet::FLOAT: return type_id::FLOAT32; - case parquet::DOUBLE: return type_id::FLOAT64; - case parquet::BYTE_ARRAY: - case parquet::FIXED_LEN_BYTE_ARRAY: - // Can be mapped to INT32 (32-bit hash) or STRING - return strings_to_categorical ? 
type_id::INT32 : type_id::STRING; - case parquet::INT96: - return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id - : type_id::TIMESTAMP_NANOSECONDS; - default: break; - } - - return type_id::EMPTY; -} - -/** - * @brief Converts cuDF type enum to column logical type - */ -data_type to_data_type(type_id t_id, SchemaElement const& schema) -{ - return t_id == type_id::DECIMAL32 || t_id == type_id::DECIMAL64 || t_id == type_id::DECIMAL128 - ? data_type{t_id, numeric::scale_type{-schema.decimal_scale}} - : data_type{t_id}; -} - -/** - * @brief Function that returns the required the number of bits to store a value - */ -template -T required_bits(uint32_t max_level) -{ - return static_cast(CompactProtocolReader::NumRequiredBits(max_level)); -} - -/** - * @brief Converts cuDF units to Parquet units. - * - * @return A tuple of Parquet type width, Parquet clock rate and Parquet decimal type. - */ -std::tuple conversion_info(type_id column_type_id, - type_id timestamp_type_id, - parquet::Type physical, - int8_t converted, - int32_t length) -{ - int32_t type_width = (physical == parquet::FIXED_LEN_BYTE_ARRAY) ? 
length : 0; - int32_t clock_rate = 0; - if (column_type_id == type_id::INT8 or column_type_id == type_id::UINT8) { - type_width = 1; // I32 -> I8 - } else if (column_type_id == type_id::INT16 or column_type_id == type_id::UINT16) { - type_width = 2; // I32 -> I16 - } else if (column_type_id == type_id::INT32) { - type_width = 4; // str -> hash32 - } else if (is_chrono(data_type{column_type_id})) { - clock_rate = to_clockrate(timestamp_type_id); - } - - int8_t converted_type = converted; - if (converted_type == parquet::DECIMAL && column_type_id != type_id::FLOAT64 && - not cudf::is_fixed_point(data_type{column_type_id})) { - converted_type = parquet::UNKNOWN; // Not converting to float64 or decimal - } - return std::make_tuple(type_width, clock_rate, converted_type); -} - -inline void decompress_check(device_span results, - rmm::cuda_stream_view stream) -{ - CUDF_EXPECTS(thrust::all_of(rmm::exec_policy(stream), - results.begin(), - results.end(), - [] __device__(auto const& res) { - return res.status == compression_status::SUCCESS; - }), - "Error during decompression"); -} -} // namespace - -std::string name_from_path(const std::vector& path_in_schema) -{ - // For the case of lists, we will see a schema that looks like: - // a.list.element.list.element - // where each (list.item) pair represents a level of nesting. According to the parquet spec, - // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md - // the initial field must be named "list" and the inner element must be named "element". - // If we are dealing with a list, we want to return the topmost name of the group ("a"). - // - // For other nested schemas, like structs we just want to return the bottom-most name. For - // example a struct with the schema - // b.employee.id, the column representing "id" should simply be named "id". - // - // In short, this means : return the highest level of the schema that does not have list - // definitions underneath it. 
- // - std::string s = (path_in_schema.size() > 0) ? path_in_schema[0] : ""; - for (size_t i = 1; i < path_in_schema.size(); i++) { - // The Parquet spec requires that the outer schema field is named "list". However it also - // provides a list of backwards compatibility cases that are applicable as well. Currently - // we are only handling the formal spec. This will get cleaned up and improved when we add - // support for structs. The correct thing to do will probably be to examine the type of - // the SchemaElement itself to concretely identify the start of a nested type of any kind rather - // than trying to derive it from the path string. - if (path_in_schema[i] == "list") { - // Again, strictly speaking, the Parquet spec says the inner field should be named - // "element", but there are some backwards compatibility issues that we have seen in the - // wild. For example, Pandas calls the field "item". We will allow any name for now. - i++; - continue; - } - // otherwise, we've got a real nested column. 
update the name - s = path_in_schema[i]; - } - return s; -} - -/** - * @brief Class for parsing dataset metadata - */ -struct metadata : public FileMetaData { - explicit metadata(datasource* source) - { - constexpr auto header_len = sizeof(file_header_s); - constexpr auto ender_len = sizeof(file_ender_s); - - const auto len = source->size(); - const auto header_buffer = source->host_read(0, header_len); - const auto header = reinterpret_cast(header_buffer->data()); - const auto ender_buffer = source->host_read(len - ender_len, ender_len); - const auto ender = reinterpret_cast(ender_buffer->data()); - CUDF_EXPECTS(len > header_len + ender_len, "Incorrect data source"); - CUDF_EXPECTS(header->magic == parquet_magic && ender->magic == parquet_magic, - "Corrupted header or footer"); - CUDF_EXPECTS(ender->footer_len != 0 && ender->footer_len <= (len - header_len - ender_len), - "Incorrect footer length"); - - const auto buffer = source->host_read(len - ender->footer_len - ender_len, ender->footer_len); - CompactProtocolReader cp(buffer->data(), ender->footer_len); - CUDF_EXPECTS(cp.read(this), "Cannot parse metadata"); - CUDF_EXPECTS(cp.InitSchema(this), "Cannot initialize schema"); - } -}; - -class aggregate_reader_metadata { - std::vector per_file_metadata; - std::vector> keyval_maps; - size_type num_rows; - size_type num_row_groups; - /** - * @brief Create a metadata object from each element in the source vector - */ - auto metadatas_from_sources(std::vector> const& sources) - { - std::vector metadatas; - std::transform( - sources.cbegin(), sources.cend(), std::back_inserter(metadatas), [](auto const& source) { - return metadata(source.get()); - }); - return metadatas; - } - - /** - * @brief Collect the keyvalue maps from each per-file metadata object into a vector of maps. 
- */ - [[nodiscard]] auto collect_keyval_metadata() - { - std::vector> kv_maps; - std::transform(per_file_metadata.cbegin(), - per_file_metadata.cend(), - std::back_inserter(kv_maps), - [](auto const& pfm) { - std::unordered_map kv_map; - std::transform(pfm.key_value_metadata.cbegin(), - pfm.key_value_metadata.cend(), - std::inserter(kv_map, kv_map.end()), - [](auto const& kv) { - return std::pair{kv.key, kv.value}; - }); - return kv_map; - }); - - return kv_maps; - } - - /** - * @brief Sums up the number of rows of each source - */ - [[nodiscard]] size_type calc_num_rows() const - { - return std::accumulate( - per_file_metadata.begin(), per_file_metadata.end(), 0, [](auto& sum, auto& pfm) { - return sum + pfm.num_rows; - }); - } - - /** - * @brief Sums up the number of row groups of each source - */ - [[nodiscard]] size_type calc_num_row_groups() const - { - return std::accumulate( - per_file_metadata.begin(), per_file_metadata.end(), 0, [](auto& sum, auto& pfm) { - return sum + pfm.row_groups.size(); - }); - } - - public: - aggregate_reader_metadata(std::vector> const& sources) - : per_file_metadata(metadatas_from_sources(sources)), - keyval_maps(collect_keyval_metadata()), - num_rows(calc_num_rows()), - num_row_groups(calc_num_row_groups()) - { - // Verify that the input files have matching numbers of columns - size_type num_cols = -1; - for (auto const& pfm : per_file_metadata) { - if (pfm.row_groups.size() != 0) { - if (num_cols == -1) - num_cols = pfm.row_groups[0].columns.size(); - else - CUDF_EXPECTS(num_cols == static_cast(pfm.row_groups[0].columns.size()), - "All sources must have the same number of columns"); - } - } - // Verify that the input files have matching schemas - for (auto const& pfm : per_file_metadata) { - CUDF_EXPECTS(per_file_metadata[0].schema == pfm.schema, - "All sources must have the same schemas"); - } - } - - [[nodiscard]] auto const& get_row_group(size_type row_group_index, size_type src_idx) const - { - CUDF_EXPECTS(src_idx >= 0 && 
src_idx < static_cast(per_file_metadata.size()), - "invalid source index"); - return per_file_metadata[src_idx].row_groups[row_group_index]; - } - - [[nodiscard]] auto const& get_column_metadata(size_type row_group_index, - size_type src_idx, - int schema_idx) const - { - auto col = std::find_if( - per_file_metadata[src_idx].row_groups[row_group_index].columns.begin(), - per_file_metadata[src_idx].row_groups[row_group_index].columns.end(), - [schema_idx](ColumnChunk const& col) { return col.schema_idx == schema_idx ? true : false; }); - CUDF_EXPECTS(col != std::end(per_file_metadata[src_idx].row_groups[row_group_index].columns), - "Found no metadata for schema index"); - return col->meta_data; - } - - [[nodiscard]] auto get_num_rows() const { return num_rows; } - - [[nodiscard]] auto get_num_row_groups() const { return num_row_groups; } - - [[nodiscard]] auto const& get_schema(int schema_idx) const - { - return per_file_metadata[0].schema[schema_idx]; - } - - [[nodiscard]] auto const& get_key_value_metadata() const { return keyval_maps; } - - /** - * @brief Gets the concrete nesting depth of output cudf columns - * - * @param schema_index Schema index of the input column - * - * @return comma-separated index column names in quotes - */ - [[nodiscard]] inline int get_output_nesting_depth(int schema_index) const - { - auto& pfm = per_file_metadata[0]; - int depth = 0; - - // walk upwards, skipping repeated fields - while (schema_index > 0) { - if (!pfm.schema[schema_index].is_stub()) { depth++; } - // schema of one-level encoding list doesn't contain nesting information, so we need to - // manually add an extra nesting level - if (pfm.schema[schema_index].is_one_level_list()) { depth++; } - schema_index = pfm.schema[schema_index].parent_idx; - } - return depth; - } - - /** - * @brief Extracts the pandas "index_columns" section - * - * PANDAS adds its own metadata to the key_value section when writing out the - * dataframe to a file to aid in exact reconstruction. 
The JSON-formatted - * metadata contains the index column(s) and PANDA-specific datatypes. - * - * @return comma-separated index column names in quotes - */ - [[nodiscard]] std::string get_pandas_index() const - { - // Assumes that all input files have the same metadata - // TODO: verify this assumption - auto it = keyval_maps[0].find("pandas"); - if (it != keyval_maps[0].end()) { - // Captures a list of quoted strings found inside square brackets after `"index_columns":` - // Inside quotes supports newlines, brackets, escaped quotes, etc. - // One-liner regex: - // "index_columns"\s*:\s*\[\s*((?:"(?:|(?:.*?(?![^\\]")).?)[^\\]?",?\s*)*)\] - // Documented below. - std::regex index_columns_expr{ - R"("index_columns"\s*:\s*\[\s*)" // match preamble, opening square bracket, whitespace - R"(()" // Open first capturing group - R"((?:")" // Open non-capturing group match opening quote - R"((?:|(?:.*?(?![^\\]")).?))" // match empty string or anything between quotes - R"([^\\]?")" // Match closing non-escaped quote - R"(,?\s*)" // Match optional comma and whitespace - R"()*)" // Close non-capturing group and repeat 0 or more times - R"())" // Close first capturing group - R"(\])" // Match closing square brackets - }; - std::smatch sm; - if (std::regex_search(it->second, sm, index_columns_expr)) { return sm[1].str(); } - } - return ""; - } - - /** - * @brief Extracts the column name(s) used for the row indexes in a dataframe - * - * @param names List of column names to load, where index column name(s) will be added - */ - [[nodiscard]] std::vector get_pandas_index_names() const - { - std::vector names; - auto str = get_pandas_index(); - if (str.length() != 0) { - std::regex index_name_expr{R"(\"((?:\\.|[^\"])*)\")"}; - std::smatch sm; - while (std::regex_search(str, sm, index_name_expr)) { - if (sm.size() == 2) { // 2 = whole match, first item - if (std::find(names.begin(), names.end(), sm[1].str()) == names.end()) { - std::regex esc_quote{R"(\\")"}; - 
names.emplace_back(std::regex_replace(sm[1].str(), esc_quote, R"(")")); - } - } - str = sm.suffix(); - } - } - return names; - } - - struct row_group_info { - size_type const index; - size_t const start_row; // TODO source index - size_type const source_index; - row_group_info(size_type index, size_t start_row, size_type source_index) - : index(index), start_row(start_row), source_index(source_index) - { - } - }; - - /** - * @brief Filters and reduces down to a selection of row groups - * - * @param row_groups Lists of row groups to read, one per source - * @param row_start Starting row of the selection - * @param row_count Total number of rows selected - * - * @return List of row group indexes and its starting row - */ - [[nodiscard]] auto select_row_groups(std::vector> const& row_groups, - size_type& row_start, - size_type& row_count) const - { - if (!row_groups.empty()) { - std::vector selection; - CUDF_EXPECTS(row_groups.size() == per_file_metadata.size(), - "Must specify row groups for each source"); - - row_count = 0; - for (size_t src_idx = 0; src_idx < row_groups.size(); ++src_idx) { - for (auto const& rowgroup_idx : row_groups[src_idx]) { - CUDF_EXPECTS( - rowgroup_idx >= 0 && - rowgroup_idx < static_cast(per_file_metadata[src_idx].row_groups.size()), - "Invalid rowgroup index"); - selection.emplace_back(rowgroup_idx, row_count, src_idx); - row_count += get_row_group(rowgroup_idx, src_idx).num_rows; - } - } - return selection; - } - - row_start = std::max(row_start, 0); - if (row_count < 0) { - row_count = static_cast( - std::min(get_num_rows(), std::numeric_limits::max())); - } - row_count = min(row_count, get_num_rows() - row_start); - CUDF_EXPECTS(row_count >= 0, "Invalid row count"); - CUDF_EXPECTS(row_start <= get_num_rows(), "Invalid row start"); - - std::vector selection; - size_type count = 0; - for (size_t src_idx = 0; src_idx < per_file_metadata.size(); ++src_idx) { - for (size_t rg_idx = 0; rg_idx < per_file_metadata[src_idx].row_groups.size(); 
++rg_idx) { - auto const chunk_start_row = count; - count += get_row_group(rg_idx, src_idx).num_rows; - if (count > row_start || count == 0) { - selection.emplace_back(rg_idx, chunk_start_row, src_idx); - } - if (count >= row_start + row_count) { break; } - } - } - - return selection; - } - - /** - * @brief Filters and reduces down to a selection of columns - * - * @param use_names List of paths of column names to select; `nullopt` if user did not select - * columns to read - * @param include_index Whether to always include the PANDAS index column(s) - * @param strings_to_categorical Type conversion parameter - * @param timestamp_type_id Type conversion parameter - * - * @return input column information, output column information, list of output column schema - * indices - */ - [[nodiscard]] auto select_columns(std::optional> const& use_names, - bool include_index, - bool strings_to_categorical, - type_id timestamp_type_id) const - { - auto find_schema_child = [&](SchemaElement const& schema_elem, std::string const& name) { - auto const& col_schema_idx = std::find_if( - schema_elem.children_idx.cbegin(), - schema_elem.children_idx.cend(), - [&](size_t col_schema_idx) { return get_schema(col_schema_idx).name == name; }); - - return (col_schema_idx != schema_elem.children_idx.end()) ? static_cast(*col_schema_idx) - : -1; - }; - - std::vector output_columns; - std::vector input_columns; - std::vector nesting; - - // Return true if column path is valid. e.g. if the path is {"struct1", "child1"}, then it is - // valid if "struct1.child1" exists in this file's schema. 
If "struct1" exists but "child1" is - // not a child of "struct1" then the function will return false for "struct1" - std::function&, bool)> - build_column = [&](column_name_info const* col_name_info, - int schema_idx, - std::vector& out_col_array, - bool has_list_parent) { - if (schema_idx < 0) { return false; } - auto const& schema_elem = get_schema(schema_idx); - - // if schema_elem is a stub then it does not exist in the column_name_info and column_buffer - // hierarchy. So continue on - if (schema_elem.is_stub()) { - // is this legit? - CUDF_EXPECTS(schema_elem.num_children == 1, "Unexpected number of children for stub"); - auto child_col_name_info = (col_name_info) ? &col_name_info->children[0] : nullptr; - return build_column( - child_col_name_info, schema_elem.children_idx[0], out_col_array, has_list_parent); - } - - // if we're at the root, this is a new output column - auto const col_type = - schema_elem.is_one_level_list() - ? type_id::LIST - : to_type_id(schema_elem, strings_to_categorical, timestamp_type_id); - auto const dtype = to_data_type(col_type, schema_elem); - - column_buffer output_col(dtype, schema_elem.repetition_type == OPTIONAL); - if (has_list_parent) { output_col.user_data |= PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT; } - // store the index of this element if inserted in out_col_array - nesting.push_back(static_cast(out_col_array.size())); - output_col.name = schema_elem.name; - - // build each child - bool path_is_valid = false; - if (col_name_info == nullptr or col_name_info->children.empty()) { - // add all children of schema_elem. 
- // At this point, we can no longer pass a col_name_info to build_column - for (int idx = 0; idx < schema_elem.num_children; idx++) { - path_is_valid |= build_column(nullptr, - schema_elem.children_idx[idx], - output_col.children, - has_list_parent || col_type == type_id::LIST); - } - } else { - for (size_t idx = 0; idx < col_name_info->children.size(); idx++) { - path_is_valid |= - build_column(&col_name_info->children[idx], - find_schema_child(schema_elem, col_name_info->children[idx].name), - output_col.children, - has_list_parent || col_type == type_id::LIST); - } - } - - // if I have no children, we're at a leaf and I'm an input column (that is, one with actual - // data stored) so add me to the list. - if (schema_elem.num_children == 0) { - input_column_info& input_col = - input_columns.emplace_back(input_column_info{schema_idx, schema_elem.name}); - - // set up child output column for one-level encoding list - if (schema_elem.is_one_level_list()) { - // determine the element data type - auto const element_type = - to_type_id(schema_elem, strings_to_categorical, timestamp_type_id); - auto const element_dtype = to_data_type(element_type, schema_elem); - - column_buffer element_col(element_dtype, schema_elem.repetition_type == OPTIONAL); - if (has_list_parent || col_type == type_id::LIST) { - element_col.user_data |= PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT; - } - // store the index of this element - nesting.push_back(static_cast(output_col.children.size())); - // TODO: not sure if we should assign a name or leave it blank - element_col.name = "element"; - - output_col.children.push_back(std::move(element_col)); - } - - std::copy(nesting.cbegin(), nesting.cend(), std::back_inserter(input_col.nesting)); - - // pop off the extra nesting element. 
- if (schema_elem.is_one_level_list()) { nesting.pop_back(); } - - path_is_valid = true; // If we're able to reach leaf then path is valid - } - - if (path_is_valid) { out_col_array.push_back(std::move(output_col)); } - - nesting.pop_back(); - return path_is_valid; - }; - - std::vector output_column_schemas; - - // - // there is not necessarily a 1:1 mapping between input columns and output columns. - // For example, parquet does not explicitly store a ColumnChunkDesc for struct columns. - // The "structiness" is simply implied by the schema. For example, this schema: - // required group field_id=1 name { - // required binary field_id=2 firstname (String); - // required binary field_id=3 middlename (String); - // required binary field_id=4 lastname (String); - // } - // will only contain 3 internal columns of data (firstname, middlename, lastname). But of - // course "name" is ultimately the struct column we want to return. - // - // "firstname", "middlename" and "lastname" represent the input columns in the file that we - // process to produce the final cudf "name" column. - // - // A user can ask for a single field out of the struct e.g. firstname. 
- // In this case they'll pass a fully qualified name to the schema element like - // ["name", "firstname"] - // - auto const& root = get_schema(0); - if (not use_names.has_value()) { - for (auto const& schema_idx : root.children_idx) { - build_column(nullptr, schema_idx, output_columns, false); - output_column_schemas.push_back(schema_idx); - } - } else { - struct path_info { - std::string full_path; - int schema_idx; - }; - - // Convert schema into a vector of every possible path - std::vector all_paths; - std::function add_path = [&](std::string path_till_now, - int schema_idx) { - auto const& schema_elem = get_schema(schema_idx); - std::string curr_path = path_till_now + schema_elem.name; - all_paths.push_back({curr_path, schema_idx}); - for (auto const& child_idx : schema_elem.children_idx) { - add_path(curr_path + ".", child_idx); - } - }; - for (auto const& child_idx : get_schema(0).children_idx) { - add_path("", child_idx); - } - - // Find which of the selected paths are valid and get their schema index - std::vector valid_selected_paths; - for (auto const& selected_path : *use_names) { - auto found_path = - std::find_if(all_paths.begin(), all_paths.end(), [&](path_info& valid_path) { - return valid_path.full_path == selected_path; - }); - if (found_path != all_paths.end()) { - valid_selected_paths.push_back({selected_path, found_path->schema_idx}); - } - } - - // Now construct paths as vector of strings for further consumption - std::vector> use_names3; - std::transform(valid_selected_paths.begin(), - valid_selected_paths.end(), - std::back_inserter(use_names3), - [&](path_info const& valid_path) { - auto schema_idx = valid_path.schema_idx; - std::vector result_path; - do { - SchemaElement const& elem = get_schema(schema_idx); - result_path.push_back(elem.name); - schema_idx = elem.parent_idx; - } while (schema_idx > 0); - return std::vector(result_path.rbegin(), result_path.rend()); - }); - - std::vector selected_columns; - if (include_index) { - 
std::vector index_names = get_pandas_index_names(); - std::transform(index_names.cbegin(), - index_names.cend(), - std::back_inserter(selected_columns), - [](std::string const& name) { return column_name_info(name); }); - } - // Merge the vector use_names into a set of hierarchical column_name_info objects - /* This is because if we have columns like this: - * col1 - * / \ - * s3 f4 - * / \ - * f5 f6 - * - * there may be common paths in use_names like: - * {"col1", "s3", "f5"}, {"col1", "f4"} - * which means we want the output to contain - * col1 - * / \ - * s3 f4 - * / - * f5 - * - * rather than - * col1 col1 - * | | - * s3 f4 - * | - * f5 - */ - for (auto const& path : use_names3) { - auto array_to_find_in = &selected_columns; - for (size_t depth = 0; depth < path.size(); ++depth) { - // Check if the path exists in our selected_columns and if not, add it. - auto const& name_to_find = path[depth]; - auto found_col = std::find_if( - array_to_find_in->begin(), - array_to_find_in->end(), - [&name_to_find](column_name_info const& col) { return col.name == name_to_find; }); - if (found_col == array_to_find_in->end()) { - auto& col = array_to_find_in->emplace_back(name_to_find); - array_to_find_in = &col.children; - } else { - // Path exists. go down further. - array_to_find_in = &found_col->children; - } - } - } - for (auto& col : selected_columns) { - auto const& top_level_col_schema_idx = find_schema_child(root, col.name); - bool valid_column = build_column(&col, top_level_col_schema_idx, output_columns, false); - if (valid_column) output_column_schemas.push_back(top_level_col_schema_idx); - } - } - - return std::make_tuple( - std::move(input_columns), std::move(output_columns), std::move(output_column_schemas)); - } -}; - -/** - * @brief Generate depth remappings for repetition and definition levels. 
- * - * When dealing with columns that contain lists, we must examine incoming - * repetition and definition level pairs to determine what range of output nesting - * is indicated when adding new values. This function generates the mappings of - * the R/D levels to those start/end bounds - * - * @param remap Maps column schema index to the R/D remapping vectors for that column - * @param src_col_schema The column schema to generate the new mapping for - * @param md File metadata information - */ -void generate_depth_remappings(std::map, std::vector>>& remap, - int src_col_schema, - aggregate_reader_metadata const& md) -{ - // already generated for this level - if (remap.find(src_col_schema) != remap.end()) { return; } - auto schema = md.get_schema(src_col_schema); - int max_depth = md.get_output_nesting_depth(src_col_schema); - - CUDF_EXPECTS(remap.find(src_col_schema) == remap.end(), - "Attempting to remap a schema more than once"); - auto inserted = - remap.insert(std::pair, std::vector>>{src_col_schema, {}}); - auto& depth_remap = inserted.first->second; - - std::vector& rep_depth_remap = (depth_remap.first); - rep_depth_remap.resize(schema.max_repetition_level + 1); - std::vector& def_depth_remap = (depth_remap.second); - def_depth_remap.resize(schema.max_definition_level + 1); - - // the key: - // for incoming level values R/D - // add values starting at the shallowest nesting level X has repetition level R - // until you reach the deepest nesting level Y that corresponds to the repetition level R1 - // held by the nesting level that has definition level D - // - // Example: a 3 level struct with a list at the bottom - // - // R / D Depth - // level0 0 / 1 0 - // level1 0 / 2 1 - // level2 0 / 3 2 - // list 0 / 3 3 - // element 1 / 4 4 - // - // incoming R/D : 0, 0 -> add values from depth 0 to 3 (def level 0 always maps to depth 0) - // incoming R/D : 0, 1 -> add values from depth 0 to 3 - // incoming R/D : 0, 2 -> add values from depth 0 to 3 - // incoming 
R/D : 1, 4 -> add values from depth 4 to 4 - // - // Note : the -validity- of values is simply checked by comparing the incoming D value against the - // D value of the given nesting level (incoming D >= the D for the nesting level == valid, - // otherwise NULL). The tricky part is determining what nesting levels to add values at. - // - // For schemas with no repetition level (no lists), X is always 0 and Y is always max nesting - // depth. - // - - // compute "X" from above - for (int s_idx = schema.max_repetition_level; s_idx >= 0; s_idx--) { - auto find_shallowest = [&](int r) { - int shallowest = -1; - int cur_depth = max_depth - 1; - int schema_idx = src_col_schema; - while (schema_idx > 0) { - auto cur_schema = md.get_schema(schema_idx); - if (cur_schema.max_repetition_level == r) { - // if this is a repeated field, map it one level deeper - shallowest = cur_schema.is_stub() ? cur_depth + 1 : cur_depth; - } - // if it's one-level encoding list - else if (cur_schema.is_one_level_list()) { - shallowest = cur_depth - 1; - } - if (!cur_schema.is_stub()) { cur_depth--; } - schema_idx = cur_schema.parent_idx; - } - return shallowest; - }; - rep_depth_remap[s_idx] = find_shallowest(s_idx); - } - - // compute "Y" from above - for (int s_idx = schema.max_definition_level; s_idx >= 0; s_idx--) { - auto find_deepest = [&](int d) { - SchemaElement prev_schema; - int schema_idx = src_col_schema; - int r1 = 0; - while (schema_idx > 0) { - SchemaElement cur_schema = md.get_schema(schema_idx); - if (cur_schema.max_definition_level == d) { - // if this is a repeated field, map it one level deeper - r1 = cur_schema.is_stub() ? prev_schema.max_repetition_level - : cur_schema.max_repetition_level; - break; - } - prev_schema = cur_schema; - schema_idx = cur_schema.parent_idx; - } - - // we now know R1 from above. 
return the deepest nesting level that has the - // same repetition level - schema_idx = src_col_schema; - int depth = max_depth - 1; - while (schema_idx > 0) { - SchemaElement cur_schema = md.get_schema(schema_idx); - if (cur_schema.max_repetition_level == r1) { - // if this is a repeated field, map it one level deeper - depth = cur_schema.is_stub() ? depth + 1 : depth; - break; - } - if (!cur_schema.is_stub()) { depth--; } - prev_schema = cur_schema; - schema_idx = cur_schema.parent_idx; - } - return depth; - }; - def_depth_remap[s_idx] = find_deepest(s_idx); - } -} - -/** - * @copydoc cudf::io::detail::parquet::read_column_chunks - */ -std::future reader::impl::read_column_chunks( - std::vector>& page_data, - hostdevice_vector& chunks, // TODO const? - size_t begin_chunk, - size_t end_chunk, - const std::vector& column_chunk_offsets, - std::vector const& chunk_source_map) -{ - // Transfer chunk data, coalescing adjacent chunks - std::vector> read_tasks; - for (size_t chunk = begin_chunk; chunk < end_chunk;) { - const size_t io_offset = column_chunk_offsets[chunk]; - size_t io_size = chunks[chunk].compressed_size; - size_t next_chunk = chunk + 1; - const bool is_compressed = (chunks[chunk].codec != parquet::Compression::UNCOMPRESSED); - while (next_chunk < end_chunk) { - const size_t next_offset = column_chunk_offsets[next_chunk]; - const bool is_next_compressed = - (chunks[next_chunk].codec != parquet::Compression::UNCOMPRESSED); - if (next_offset != io_offset + io_size || is_next_compressed != is_compressed) { - // Can't merge if not contiguous or mixing compressed and uncompressed - // Not coalescing uncompressed with compressed chunks is so that compressed buffers can be - // freed earlier (immediately after decompression stage) to limit peak memory requirements - break; - } - io_size += chunks[next_chunk].compressed_size; - next_chunk++; - } - if (io_size != 0) { - auto& source = _sources[chunk_source_map[chunk]]; - if 
(source->is_device_read_preferred(io_size)) { - auto buffer = rmm::device_buffer(io_size, _stream); - auto fut_read_size = source->device_read_async( - io_offset, io_size, static_cast(buffer.data()), _stream); - read_tasks.emplace_back(std::move(fut_read_size)); - page_data[chunk] = datasource::buffer::create(std::move(buffer)); - } else { - auto const buffer = source->host_read(io_offset, io_size); - page_data[chunk] = - datasource::buffer::create(rmm::device_buffer(buffer->data(), buffer->size(), _stream)); - } - auto d_compdata = page_data[chunk]->data(); - do { - chunks[chunk].compressed_data = d_compdata; - d_compdata += chunks[chunk].compressed_size; - } while (++chunk != next_chunk); - } else { - chunk = next_chunk; - } - } - auto sync_fn = [](decltype(read_tasks) read_tasks) { - for (auto& task : read_tasks) { - task.wait(); - } - }; - return std::async(std::launch::deferred, sync_fn, std::move(read_tasks)); -} - -/** - * @copydoc cudf::io::detail::parquet::count_page_headers - */ -size_t reader::impl::count_page_headers(hostdevice_vector& chunks) -{ - size_t total_pages = 0; - - chunks.host_to_device(_stream); - gpu::DecodePageHeaders(chunks.device_ptr(), chunks.size(), _stream); - chunks.device_to_host(_stream, true); - - for (size_t c = 0; c < chunks.size(); c++) { - total_pages += chunks[c].num_data_pages + chunks[c].num_dict_pages; - } - - return total_pages; -} - -/** - * @copydoc cudf::io::detail::parquet::decode_page_headers - */ -void reader::impl::decode_page_headers(hostdevice_vector& chunks, - hostdevice_vector& pages) -{ - // IMPORTANT : if you change how pages are stored within a chunk (dist pages, then data pages), - // please update preprocess_nested_columns to reflect this. 
- for (size_t c = 0, page_count = 0; c < chunks.size(); c++) { - chunks[c].max_num_pages = chunks[c].num_data_pages + chunks[c].num_dict_pages; - chunks[c].page_info = pages.device_ptr(page_count); - page_count += chunks[c].max_num_pages; - } - - chunks.host_to_device(_stream); - gpu::DecodePageHeaders(chunks.device_ptr(), chunks.size(), _stream); - pages.device_to_host(_stream, true); -} - -/** - * @copydoc cudf::io::detail::parquet::decompress_page_data - */ -rmm::device_buffer reader::impl::decompress_page_data( - hostdevice_vector& chunks, hostdevice_vector& pages) -{ - auto for_each_codec_page = [&](parquet::Compression codec, const std::function& f) { - for (size_t c = 0, page_count = 0; c < chunks.size(); c++) { - const auto page_stride = chunks[c].max_num_pages; - if (chunks[c].codec == codec) { - for (int k = 0; k < page_stride; k++) { - f(page_count + k); - } - } - page_count += page_stride; - } - }; - - // Brotli scratch memory for decompressing - rmm::device_buffer debrotli_scratch; - - // Count the exact number of compressed pages - size_t num_comp_pages = 0; - size_t total_decomp_size = 0; - - struct codec_stats { - parquet::Compression compression_type = UNCOMPRESSED; - size_t num_pages = 0; - int32_t max_decompressed_size = 0; - size_t total_decomp_size = 0; - }; - - std::array codecs{codec_stats{parquet::GZIP}, - codec_stats{parquet::SNAPPY}, - codec_stats{parquet::BROTLI}, - codec_stats{parquet::ZSTD}}; - - auto is_codec_supported = [&codecs](int8_t codec) { - if (codec == parquet::UNCOMPRESSED) return true; - return std::find_if(codecs.begin(), codecs.end(), [codec](auto& cstats) { - return codec == cstats.compression_type; - }) != codecs.end(); - }; - CUDF_EXPECTS(std::all_of(chunks.begin(), - chunks.end(), - [&is_codec_supported](auto const& chunk) { - return is_codec_supported(chunk.codec); - }), - "Unsupported compression type"); - - for (auto& codec : codecs) { - for_each_codec_page(codec.compression_type, [&](size_t page) { - auto 
page_uncomp_size = pages[page].uncompressed_page_size; - total_decomp_size += page_uncomp_size; - codec.total_decomp_size += page_uncomp_size; - codec.max_decompressed_size = std::max(codec.max_decompressed_size, page_uncomp_size); - codec.num_pages++; - num_comp_pages++; - }); - if (codec.compression_type == parquet::BROTLI && codec.num_pages > 0) { - debrotli_scratch.resize(get_gpu_debrotli_scratch_size(codec.num_pages), _stream); - } - } - - // Dispatch batches of pages to decompress for each codec - rmm::device_buffer decomp_pages(total_decomp_size, _stream); - - std::vector> comp_in; - comp_in.reserve(num_comp_pages); - std::vector> comp_out; - comp_out.reserve(num_comp_pages); - - rmm::device_uvector comp_res(num_comp_pages, _stream); - thrust::fill(rmm::exec_policy(_stream), - comp_res.begin(), - comp_res.end(), - compression_result{0, compression_status::FAILURE}); - - size_t decomp_offset = 0; - int32_t start_pos = 0; - for (const auto& codec : codecs) { - if (codec.num_pages == 0) { continue; } - - for_each_codec_page(codec.compression_type, [&](size_t page) { - auto dst_base = static_cast(decomp_pages.data()); - comp_in.emplace_back(pages[page].page_data, - static_cast(pages[page].compressed_page_size)); - comp_out.emplace_back(dst_base + decomp_offset, - static_cast(pages[page].uncompressed_page_size)); - - pages[page].page_data = static_cast(comp_out.back().data()); - decomp_offset += comp_out.back().size(); - }); - - host_span const> comp_in_view{comp_in.data() + start_pos, - codec.num_pages}; - auto const d_comp_in = cudf::detail::make_device_uvector_async(comp_in_view, _stream); - host_span const> comp_out_view(comp_out.data() + start_pos, - codec.num_pages); - auto const d_comp_out = cudf::detail::make_device_uvector_async(comp_out_view, _stream); - device_span d_comp_res_view(comp_res.data() + start_pos, codec.num_pages); - - switch (codec.compression_type) { - case parquet::GZIP: - gpuinflate(d_comp_in, d_comp_out, d_comp_res_view, 
gzip_header_included::YES, _stream); - break; - case parquet::SNAPPY: - if (nvcomp_integration::is_stable_enabled()) { - nvcomp::batched_decompress(nvcomp::compression_type::SNAPPY, - d_comp_in, - d_comp_out, - d_comp_res_view, - codec.max_decompressed_size, - codec.total_decomp_size, - _stream); - } else { - gpu_unsnap(d_comp_in, d_comp_out, d_comp_res_view, _stream); - } - break; - case parquet::ZSTD: - nvcomp::batched_decompress(nvcomp::compression_type::ZSTD, - d_comp_in, - d_comp_out, - d_comp_res_view, - codec.max_decompressed_size, - codec.total_decomp_size, - _stream); - break; - case parquet::BROTLI: - gpu_debrotli(d_comp_in, - d_comp_out, - d_comp_res_view, - debrotli_scratch.data(), - debrotli_scratch.size(), - _stream); - break; - default: CUDF_FAIL("Unexpected decompression dispatch"); break; - } - start_pos += codec.num_pages; - } - - decompress_check(comp_res, _stream); - - // Update the page information in device memory with the updated value of - // page_data; it now points to the uncompressed data buffer - pages.host_to_device(_stream); - - return decomp_pages; -} - -/** - * @copydoc cudf::io::detail::parquet::allocate_nesting_info - */ -void reader::impl::allocate_nesting_info(hostdevice_vector const& chunks, - hostdevice_vector& pages, - hostdevice_vector& page_nesting_info) -{ - // compute total # of page_nesting infos needed and allocate space. 
doing this in one - // buffer to keep it to a single gpu allocation - size_t const total_page_nesting_infos = std::accumulate( - chunks.host_ptr(), chunks.host_ptr() + chunks.size(), 0, [&](int total, auto& chunk) { - // the schema of the input column - auto const& schema = _metadata->get_schema(chunk.src_col_schema); - auto const per_page_nesting_info_size = max( - schema.max_definition_level + 1, _metadata->get_output_nesting_depth(chunk.src_col_schema)); - return total + (per_page_nesting_info_size * chunk.num_data_pages); - }); - - page_nesting_info = hostdevice_vector{total_page_nesting_infos, _stream}; - - // retrieve from the gpu so we can update - pages.device_to_host(_stream, true); - - // update pointers in the PageInfos - int target_page_index = 0; - int src_info_index = 0; - for (size_t idx = 0; idx < chunks.size(); idx++) { - int src_col_schema = chunks[idx].src_col_schema; - auto& schema = _metadata->get_schema(src_col_schema); - auto const per_page_nesting_info_size = std::max( - schema.max_definition_level + 1, _metadata->get_output_nesting_depth(src_col_schema)); - - // skip my dict pages - target_page_index += chunks[idx].num_dict_pages; - for (int p_idx = 0; p_idx < chunks[idx].num_data_pages; p_idx++) { - pages[target_page_index + p_idx].nesting = page_nesting_info.device_ptr() + src_info_index; - pages[target_page_index + p_idx].num_nesting_levels = per_page_nesting_info_size; - - src_info_index += per_page_nesting_info_size; - } - target_page_index += chunks[idx].num_data_pages; - } - - // copy back to the gpu - pages.host_to_device(_stream); - - // fill in - int nesting_info_index = 0; - std::map, std::vector>> depth_remapping; - for (size_t idx = 0; idx < chunks.size(); idx++) { - int src_col_schema = chunks[idx].src_col_schema; - - // schema of the input column - auto& schema = _metadata->get_schema(src_col_schema); - // real depth of the output cudf column hierarchy (1 == no nesting, 2 == 1 level, etc) - int max_depth = 
_metadata->get_output_nesting_depth(src_col_schema); - - // # of nesting infos stored per page for this column - auto const per_page_nesting_info_size = std::max(schema.max_definition_level + 1, max_depth); - - // if this column has lists, generate depth remapping - std::map, std::vector>> depth_remapping; - if (schema.max_repetition_level > 0) { - generate_depth_remappings(depth_remapping, src_col_schema, *_metadata); - } - - // fill in host-side nesting info - int schema_idx = src_col_schema; - auto cur_schema = _metadata->get_schema(schema_idx); - int cur_depth = max_depth - 1; - while (schema_idx > 0) { - // stub columns (basically the inner field of a list scheme element) are not real columns. - // we can ignore them for the purposes of output nesting info - if (!cur_schema.is_stub()) { - // initialize each page within the chunk - for (int p_idx = 0; p_idx < chunks[idx].num_data_pages; p_idx++) { - gpu::PageNestingInfo* pni = - &page_nesting_info[nesting_info_index + (p_idx * per_page_nesting_info_size)]; - - // if we have lists, set our start and end depth remappings - if (schema.max_repetition_level > 0) { - auto remap = depth_remapping.find(src_col_schema); - CUDF_EXPECTS(remap != depth_remapping.end(), - "Could not find depth remapping for schema"); - std::vector const& rep_depth_remap = (remap->second.first); - std::vector const& def_depth_remap = (remap->second.second); - - for (size_t m = 0; m < rep_depth_remap.size(); m++) { - pni[m].start_depth = rep_depth_remap[m]; - } - for (size_t m = 0; m < def_depth_remap.size(); m++) { - pni[m].end_depth = def_depth_remap[m]; - } - } - - // values indexed by output column index - pni[cur_depth].max_def_level = cur_schema.max_definition_level; - pni[cur_depth].max_rep_level = cur_schema.max_repetition_level; - pni[cur_depth].size = 0; - } - - // move up the hierarchy - cur_depth--; - } - - // next schema - schema_idx = cur_schema.parent_idx; - cur_schema = _metadata->get_schema(schema_idx); - } - - 
nesting_info_index += (per_page_nesting_info_size * chunks[idx].num_data_pages); - } - - // copy nesting info to the device - page_nesting_info.host_to_device(_stream); -} - -/** - * @copydoc cudf::io::detail::parquet::preprocess_columns - */ -void reader::impl::preprocess_columns(hostdevice_vector& chunks, - hostdevice_vector& pages, - size_t min_row, - size_t total_rows, - bool uses_custom_row_bounds, - bool has_lists) -{ - // TODO : we should be selectively preprocessing only columns that have - // lists in them instead of doing them all if even one contains lists. - - // if there are no lists, simply allocate every allocate every output - // column to be of size num_rows - if (!has_lists) { - std::function&)> create_columns = - [&](std::vector& cols) { - for (size_t idx = 0; idx < cols.size(); idx++) { - auto& col = cols[idx]; - col.create(total_rows, _stream, _mr); - create_columns(col.children); - } - }; - create_columns(_output_columns); - } else { - // preprocess per-nesting level sizes by page - gpu::PreprocessColumnData(pages, - chunks, - _input_columns, - _output_columns, - total_rows, - min_row, - uses_custom_row_bounds, - _stream, - _mr); - _stream.synchronize(); - } -} - -/** - * @copydoc cudf::io::detail::parquet::decode_page_data - */ -void reader::impl::decode_page_data(hostdevice_vector& chunks, - hostdevice_vector& pages, - hostdevice_vector& page_nesting, - size_t min_row, - size_t total_rows) -{ - auto is_dict_chunk = [](const gpu::ColumnChunkDesc& chunk) { - return (chunk.data_type & 0x7) == BYTE_ARRAY && chunk.num_dict_pages > 0; - }; - - // Count the number of string dictionary entries - // NOTE: Assumes first page in the chunk is always the dictionary page - size_t total_str_dict_indexes = 0; - for (size_t c = 0, page_count = 0; c < chunks.size(); c++) { - if (is_dict_chunk(chunks[c])) { total_str_dict_indexes += pages[page_count].num_input_values; } - page_count += chunks[c].max_num_pages; - } - - // Build index for string dictionaries 
since they can't be indexed - // directly due to variable-sized elements - auto str_dict_index = cudf::detail::make_zeroed_device_uvector_async( - total_str_dict_indexes, _stream); - - // TODO (dm): hd_vec should have begin and end iterator members - size_t sum_max_depths = - std::accumulate(chunks.host_ptr(), - chunks.host_ptr(chunks.size()), - 0, - [&](size_t cursum, gpu::ColumnChunkDesc const& chunk) { - return cursum + _metadata->get_output_nesting_depth(chunk.src_col_schema); - }); - - // In order to reduce the number of allocations of hostdevice_vector, we allocate a single vector - // to store all per-chunk pointers to nested data/nullmask. `chunk_offsets[i]` will store the - // offset into `chunk_nested_data`/`chunk_nested_valids` for the array of pointers for chunk `i` - auto chunk_nested_valids = hostdevice_vector(sum_max_depths, _stream); - auto chunk_nested_data = hostdevice_vector(sum_max_depths, _stream); - auto chunk_offsets = std::vector(); - - // Update chunks with pointers to column data. 
- for (size_t c = 0, page_count = 0, str_ofs = 0, chunk_off = 0; c < chunks.size(); c++) { - input_column_info const& input_col = _input_columns[chunks[c].src_col_index]; - CUDF_EXPECTS(input_col.schema_idx == chunks[c].src_col_schema, - "Column/page schema index mismatch"); - - if (is_dict_chunk(chunks[c])) { - chunks[c].str_dict_index = str_dict_index.data() + str_ofs; - str_ofs += pages[page_count].num_input_values; - } - - size_t max_depth = _metadata->get_output_nesting_depth(chunks[c].src_col_schema); - chunk_offsets.push_back(chunk_off); - - // get a slice of size `nesting depth` from `chunk_nested_valids` to store an array of pointers - // to validity data - auto valids = chunk_nested_valids.host_ptr(chunk_off); - chunks[c].valid_map_base = chunk_nested_valids.device_ptr(chunk_off); - - // get a slice of size `nesting depth` from `chunk_nested_data` to store an array of pointers to - // out data - auto data = chunk_nested_data.host_ptr(chunk_off); - chunks[c].column_data_base = chunk_nested_data.device_ptr(chunk_off); - - chunk_off += max_depth; - - // fill in the arrays on the host. there are some important considerations to - // take into account here for nested columns. specifically, with structs - // there is sharing of output buffers between input columns. consider this schema - // - // required group field_id=1 name { - // required binary field_id=2 firstname (String); - // required binary field_id=3 middlename (String); - // required binary field_id=4 lastname (String); - // } - // - // there are 3 input columns of data here (firstname, middlename, lastname), but - // only 1 output column (name). The structure of the output column buffers looks like - // the schema itself - // - // struct (name) - // string (firstname) - // string (middlename) - // string (lastname) - // - // The struct column can contain validity information. 
the problem is, the decode - // step for the input columns will all attempt to decode this validity information - // because each one has it's own copy of the repetition/definition levels. but - // since this is all happening in parallel it would mean multiple blocks would - // be stomping all over the same memory randomly. to work around this, we set - // things up so that only 1 child of any given nesting level fills in the - // data (offsets in the case of lists) or validity information for the higher - // levels of the hierarchy that are shared. In this case, it would mean we - // would just choose firstname to be the one that decodes the validity for name. - // - // we do this by only handing out the pointers to the first child we come across. - // - auto* cols = &_output_columns; - for (size_t idx = 0; idx < max_depth; idx++) { - auto& out_buf = (*cols)[input_col.nesting[idx]]; - cols = &out_buf.children; - - int owning_schema = out_buf.user_data & PARQUET_COLUMN_BUFFER_SCHEMA_MASK; - if (owning_schema == 0 || owning_schema == input_col.schema_idx) { - valids[idx] = out_buf.null_mask(); - data[idx] = out_buf.data(); - out_buf.user_data |= - static_cast(input_col.schema_idx) & PARQUET_COLUMN_BUFFER_SCHEMA_MASK; - } else { - valids[idx] = nullptr; - data[idx] = nullptr; - } - } - - // column_data_base will always point to leaf data, even for nested types. - page_count += chunks[c].max_num_pages; - } - - chunks.host_to_device(_stream); - chunk_nested_valids.host_to_device(_stream); - chunk_nested_data.host_to_device(_stream); - - if (total_str_dict_indexes > 0) { - gpu::BuildStringDictionaryIndex(chunks.device_ptr(), chunks.size(), _stream); - } - - gpu::DecodePageData(pages, chunks, total_rows, min_row, _stream); - pages.device_to_host(_stream); - page_nesting.device_to_host(_stream); - _stream.synchronize(); - - // for list columns, add the final offset to every offset buffer. - // TODO : make this happen in more efficiently. 
Maybe use thrust::for_each - // on each buffer. Or potentially do it in PreprocessColumnData - // Note : the reason we are doing this here instead of in the decode kernel is - // that it is difficult/impossible for a given page to know that it is writing the very - // last value that should then be followed by a terminator (because rows can span - // page boundaries). - for (size_t idx = 0; idx < _input_columns.size(); idx++) { - input_column_info const& input_col = _input_columns[idx]; - - auto* cols = &_output_columns; - for (size_t l_idx = 0; l_idx < input_col.nesting_depth(); l_idx++) { - auto& out_buf = (*cols)[input_col.nesting[l_idx]]; - cols = &out_buf.children; - - if (out_buf.type.id() != type_id::LIST || - (out_buf.user_data & PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED)) { - continue; - } - CUDF_EXPECTS(l_idx < input_col.nesting_depth() - 1, "Encountered a leaf list column"); - auto& child = (*cols)[input_col.nesting[l_idx + 1]]; - - // the final offset for a list at level N is the size of it's child - int offset = child.type.id() == type_id::LIST ? 
child.size - 1 : child.size; - cudaMemcpyAsync(static_cast(out_buf.data()) + (out_buf.size - 1), - &offset, - sizeof(offset), - cudaMemcpyHostToDevice, - _stream.value()); - out_buf.user_data |= PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED; - } - } - - // update null counts in the final column buffers - for (size_t idx = 0; idx < pages.size(); idx++) { - gpu::PageInfo* pi = &pages[idx]; - if (pi->flags & gpu::PAGEINFO_FLAGS_DICTIONARY) { continue; } - gpu::ColumnChunkDesc* col = &chunks[pi->chunk_idx]; - input_column_info const& input_col = _input_columns[col->src_col_index]; - - int index = pi->nesting - page_nesting.device_ptr(); - gpu::PageNestingInfo* pni = &page_nesting[index]; - - auto* cols = &_output_columns; - for (size_t l_idx = 0; l_idx < input_col.nesting_depth(); l_idx++) { - auto& out_buf = (*cols)[input_col.nesting[l_idx]]; - cols = &out_buf.children; - - // if I wasn't the one who wrote out the validity bits, skip it - if (chunk_nested_valids.host_ptr(chunk_offsets[pi->chunk_idx])[l_idx] == nullptr) { - continue; - } - out_buf.null_count() += pni[l_idx].null_count; - } - } - - _stream.synchronize(); -} - -reader::impl::impl(std::vector>&& sources, - parquet_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - : _stream(stream), _mr(mr), _sources(std::move(sources)) -{ - // Open and parse the source dataset metadata - _metadata = std::make_unique(_sources); - - // Override output timestamp resolution if requested - if (options.get_timestamp_type().id() != type_id::EMPTY) { - _timestamp_type = options.get_timestamp_type(); - } - - // Strings may be returned as either string or categorical columns - _strings_to_categorical = options.is_enabled_convert_strings_to_categories(); - - // Binary columns can be read as binary or strings - _reader_column_schema = options.get_column_schema(); - - // Select only columns required by the options - std::tie(_input_columns, _output_columns, _output_column_schemas) = - 
_metadata->select_columns(options.get_columns(), - options.is_enabled_use_pandas_metadata(), - _strings_to_categorical, - _timestamp_type.id()); -} - -table_with_metadata reader::impl::read(size_type skip_rows, - size_type num_rows, - bool uses_custom_row_bounds, - std::vector> const& row_group_list) -{ - // Select only row groups required - const auto selected_row_groups = - _metadata->select_row_groups(row_group_list, skip_rows, num_rows); - - table_metadata out_metadata; - - // output cudf columns as determined by the top level schema - std::vector> out_columns; - out_columns.reserve(_output_columns.size()); - - if (selected_row_groups.size() != 0 && _input_columns.size() != 0) { - // Descriptors for all the chunks that make up the selected columns - const auto num_input_columns = _input_columns.size(); - const auto num_chunks = selected_row_groups.size() * num_input_columns; - hostdevice_vector chunks(0, num_chunks, _stream); - - // Association between each column chunk and its source - std::vector chunk_source_map(num_chunks); - - // Tracker for eventually deallocating compressed and uncompressed data - std::vector> page_data(num_chunks); - - // Keep track of column chunk file offsets - std::vector column_chunk_offsets(num_chunks); - - // if there are lists present, we need to preprocess - bool has_lists = false; - - // Initialize column chunk information - size_t total_decompressed_size = 0; - auto remaining_rows = num_rows; - std::vector> read_rowgroup_tasks; - for (const auto& rg : selected_row_groups) { - const auto& row_group = _metadata->get_row_group(rg.index, rg.source_index); - auto const row_group_start = rg.start_row; - auto const row_group_source = rg.source_index; - auto const row_group_rows = std::min(remaining_rows, row_group.num_rows); - auto const io_chunk_idx = chunks.size(); - - // generate ColumnChunkDesc objects for everything to be decoded (all input columns) - for (size_t i = 0; i < num_input_columns; ++i) { - auto col = 
_input_columns[i]; - // look up metadata - auto& col_meta = _metadata->get_column_metadata(rg.index, rg.source_index, col.schema_idx); - auto& schema = _metadata->get_schema(col.schema_idx); - - // this column contains repetition levels and will require a preprocess - if (schema.max_repetition_level > 0) { has_lists = true; } - - auto [type_width, clock_rate, converted_type] = - conversion_info(to_type_id(schema, _strings_to_categorical, _timestamp_type.id()), - _timestamp_type.id(), - schema.type, - schema.converted_type, - schema.type_length); - - column_chunk_offsets[chunks.size()] = - (col_meta.dictionary_page_offset != 0) - ? std::min(col_meta.data_page_offset, col_meta.dictionary_page_offset) - : col_meta.data_page_offset; - - chunks.push_back(gpu::ColumnChunkDesc(col_meta.total_compressed_size, - nullptr, - col_meta.num_values, - schema.type, - type_width, - row_group_start, - row_group_rows, - schema.max_definition_level, - schema.max_repetition_level, - _metadata->get_output_nesting_depth(col.schema_idx), - required_bits(schema.max_definition_level), - required_bits(schema.max_repetition_level), - col_meta.codec, - converted_type, - schema.logical_type, - schema.decimal_scale, - clock_rate, - i, - col.schema_idx)); - - // Map each column chunk to its column index and its source index - chunk_source_map[chunks.size() - 1] = row_group_source; - - if (col_meta.codec != Compression::UNCOMPRESSED) { - total_decompressed_size += col_meta.total_uncompressed_size; - } - } - // Read compressed chunk data to device memory - read_rowgroup_tasks.push_back(read_column_chunks( - page_data, chunks, io_chunk_idx, chunks.size(), column_chunk_offsets, chunk_source_map)); - - remaining_rows -= row_group.num_rows; - } - for (auto& task : read_rowgroup_tasks) { - task.wait(); - } - assert(remaining_rows <= 0); - - // Process dataset chunk pages into output columns - const auto total_pages = count_page_headers(chunks); - if (total_pages > 0) { - hostdevice_vector 
pages(total_pages, total_pages, _stream); - rmm::device_buffer decomp_page_data; - - // decoding of column/page information - decode_page_headers(chunks, pages); - if (total_decompressed_size > 0) { - decomp_page_data = decompress_page_data(chunks, pages); - // Free compressed data - for (size_t c = 0; c < chunks.size(); c++) { - if (chunks[c].codec != parquet::Compression::UNCOMPRESSED) { page_data[c].reset(); } - } - } - - // build output column info - // walk the schema, building out_buffers that mirror what our final cudf columns will look - // like. important : there is not necessarily a 1:1 mapping between input columns and output - // columns. For example, parquet does not explicitly store a ColumnChunkDesc for struct - // columns. The "structiness" is simply implied by the schema. For example, this schema: - // required group field_id=1 name { - // required binary field_id=2 firstname (String); - // required binary field_id=3 middlename (String); - // required binary field_id=4 lastname (String); - // } - // will only contain 3 columns of data (firstname, middlename, lastname). But of course - // "name" is a struct column that we want to return, so we have to make sure that we - // create it ourselves. - // std::vector output_info = build_output_column_info(); - - // nesting information (sizes, etc) stored -per page- - // note : even for flat schemas, we allocate 1 level of "nesting" info - hostdevice_vector page_nesting_info; - allocate_nesting_info(chunks, pages, page_nesting_info); - - // - compute column sizes and allocate output buffers. - // important: - // for nested schemas, we have to do some further preprocessing to determine: - // - real column output sizes per level of nesting (in a flat schema, there's only 1 level - // of - // nesting and it's size is the row count) - // - // - for nested schemas, output buffer offset values per-page, per nesting-level for the - // purposes of decoding. 
- preprocess_columns(chunks, pages, skip_rows, num_rows, uses_custom_row_bounds, has_lists); - - // decoding of column data itself - decode_page_data(chunks, pages, page_nesting_info, skip_rows, num_rows); - - // create the final output cudf columns - for (size_t i = 0; i < _output_columns.size(); ++i) { - column_name_info& col_name = out_metadata.schema_info.emplace_back(""); - auto const metadata = - _reader_column_schema.has_value() - ? std::make_optional((*_reader_column_schema)[i]) - : std::nullopt; - out_columns.emplace_back( - make_column(_output_columns[i], &col_name, metadata, _stream, _mr)); - } - } - } - - // Create empty columns as needed (this can happen if we've ended up with no actual data to read) - for (size_t i = out_columns.size(); i < _output_columns.size(); ++i) { - column_name_info& col_name = out_metadata.schema_info.emplace_back(""); - out_columns.emplace_back(io::detail::empty_like(_output_columns[i], &col_name, _stream, _mr)); - } - - // Return column names (must match order of returned columns) - out_metadata.column_names.resize(_output_columns.size()); - for (size_t i = 0; i < _output_column_schemas.size(); i++) { - auto const& schema = _metadata->get_schema(_output_column_schemas[i]); - out_metadata.column_names[i] = schema.name; - } - - // Return user metadata - out_metadata.per_file_user_data = _metadata->get_key_value_metadata(); - out_metadata.user_data = {out_metadata.per_file_user_data[0].begin(), - out_metadata.per_file_user_data[0].end()}; - - return {std::make_unique
(std::move(out_columns)), std::move(out_metadata)}; -} - -// Forward to implementation -reader::reader(std::vector>&& sources, - parquet_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - : _impl(std::make_unique(std::move(sources), options, stream, mr)) -{ -} - -// Destructor within this translation unit -reader::~reader() = default; - -// Forward to implementation -table_with_metadata reader::read(parquet_reader_options const& options) -{ - // if the user has specified custom row bounds - bool const uses_custom_row_bounds = options.get_num_rows() >= 0 || options.get_skip_rows() != 0; - return _impl->read(options.get_skip_rows(), - options.get_num_rows(), - uses_custom_row_bounds, - options.get_row_groups()); -} - -} // namespace parquet -} // namespace detail -} // namespace io -} // namespace cudf diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index e1f275bb8e8..6d42e9fab84 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -21,33 +21,23 @@ #pragma once -#include "parquet.hpp" #include "parquet_gpu.hpp" +#include "reader_impl_helpers.hpp" #include -#include #include #include #include #include +#include #include -#include -#include +#include #include -namespace cudf { -namespace io { -namespace detail { -namespace parquet { -using namespace cudf::io::parquet; -using namespace cudf::io; - -// Forward declarations -class aggregate_reader_metadata; - +namespace cudf::io::detail::parquet { /** * @brief Implementation for Parquet reader */ @@ -56,6 +46,9 @@ class reader::impl { /** * @brief Constructor from an array of dataset sources with reader options. * + * By using this constructor, each call to `read()` or `read_chunk()` will perform reading the + * entire given file. 
+ * * @param sources Dataset sources * @param options Settings for controlling reading behavior * @param stream CUDA stream used for device memory operations and kernel launches @@ -71,8 +64,8 @@ class reader::impl { * * @param skip_rows Number of rows to skip from the start * @param num_rows Number of rows to read - * @param uses_custom_row_bounds Whether or not num_rows and min_rows represents user-specific - * bounds + * @param uses_custom_row_bounds Whether or not num_rows and skip_rows represents user-specific + * bounds * @param row_group_indices Lists of row groups to read, one per source * * @return The set of columns along with metadata @@ -80,113 +73,142 @@ class reader::impl { table_with_metadata read(size_type skip_rows, size_type num_rows, bool uses_custom_row_bounds, - std::vector> const& row_group_indices); + host_span const> row_group_indices); - private: /** - * @brief Reads compressed page data to device memory + * @brief Constructor from a chunk read limit and an array of dataset sources with reader options. + * + * By using this constructor, the reader will support iterative (chunked) reading through + * `has_next() ` and `read_chunk()`. For example: + * ``` + * do { + * auto const chunk = reader.read_chunk(); + * // Process chunk + * } while (reader.has_next()); + * + * ``` * - * @param page_data Buffers to hold compressed page data for each chunk - * @param chunks List of column chunk descriptors - * @param begin_chunk Index of first column chunk to read - * @param end_chunk Index after the last column chunk to read - * @param column_chunk_offsets File offset for all chunks + * Reading the whole given file at once through `read()` function is still supported if + * `chunk_read_limit == 0` (i.e., no reading limit). + * In such case, `read_chunk()` will also return rows of the entire file. 
* + * @param chunk_read_limit Limit on total number of bytes to be returned per read, + * or `0` if there is no limit + * @param sources Dataset sources + * @param options Settings for controlling reading behavior + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource to use for device memory allocation + */ + explicit impl(std::size_t chunk_read_limit, + std::vector>&& sources, + parquet_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + + /** + * @copydoc cudf::io::chunked_parquet_reader::has_next */ - std::future read_column_chunks(std::vector>& page_data, - hostdevice_vector& chunks, - size_t begin_chunk, - size_t end_chunk, - const std::vector& column_chunk_offsets, - std::vector const& chunk_source_map); + bool has_next(); /** - * @brief Returns the number of total pages from the given column chunks - * - * @param chunks List of column chunk descriptors - * - * @return The total number of pages + * @copydoc cudf::io::chunked_parquet_reader::read_chunk */ - size_t count_page_headers(hostdevice_vector& chunks); + table_with_metadata read_chunk(); + private: /** - * @brief Returns the page information from the given column chunks. + * @brief Perform the necessary data preprocessing for parsing file later on. 
* - * @param chunks List of column chunk descriptors - * @param pages List of page information + * @param skip_rows Number of rows to skip from the start + * @param num_rows Number of rows to read, or `-1` to read all rows + * @param uses_custom_row_bounds Whether or not num_rows and skip_rows represents user-specific + * bounds + * @param row_group_indices Lists of row groups to read (one per source), or empty if read all */ - void decode_page_headers(hostdevice_vector& chunks, - hostdevice_vector& pages); + void prepare_data(size_type skip_rows, + size_type num_rows, + bool uses_custom_row_bounds, + host_span const> row_group_indices); /** - * @brief Decompresses the page data, at page granularity. + * @brief Load and decompress the input file(s) into memory. + */ + void load_and_decompress_data(std::vector const& row_groups_info, + size_type num_rows); + + /** + * @brief Perform some preprocessing for page data and also compute the split locations + * {skip_rows, num_rows} for chunked reading. + * + * There are several pieces of information we can't compute directly from row counts in + * the parquet headers when dealing with nested schemas: + * - The total sizes of all output columns at all nesting levels + * - The starting output buffer offset for each page, for each nesting level * - * @param chunks List of column chunk descriptors - * @param pages List of page information + * For flat schemas, these values are computed during header decoding (see gpuDecodePageHeaders). 
* - * @return Device buffer to decompressed page data + * @param skip_rows Crop all rows below skip_rows + * @param num_rows Maximum number of rows to read + * @param uses_custom_row_bounds Whether or not num_rows and skip_rows represents user-specific + * bounds + * @param chunk_read_limit Limit on total number of bytes to be returned per read, + * or `0` if there is no limit */ - rmm::device_buffer decompress_page_data(hostdevice_vector& chunks, - hostdevice_vector& pages); + void preprocess_pages(size_t skip_rows, + size_t num_rows, + bool uses_custom_row_bounds, + size_t chunk_read_limit); /** - * @brief Allocate nesting information storage for all pages and set pointers - * to it. + * @brief Allocate nesting information storage for all pages and set pointers to it. * * One large contiguous buffer of PageNestingInfo structs is allocated and * distributed among the PageInfo structs. * * Note that this gets called even in the flat schema case so that we have a * consistent place to store common information such as value counts, etc. - * - * @param chunks List of column chunk descriptors - * @param pages List of page information - * @param page_nesting_info The allocated nesting info structs. */ - void allocate_nesting_info(hostdevice_vector const& chunks, - hostdevice_vector& pages, - hostdevice_vector& page_nesting_info); + void allocate_nesting_info(); /** - * @brief Preprocess column information for nested schemas. + * @brief Read a chunk of data and return an output table. * - * There are several pieces of information we can't compute directly from row counts in - * the parquet headers when dealing with nested schemas. - * - The total sizes of all output columns at all nesting levels - * - The starting output buffer offset for each page, for each nesting level + * This function is called internally and expects all preprocessing steps have already been done. 
* - * For flat schemas, these values are computed during header decoding (see gpuDecodePageHeaders) + * @param uses_custom_row_bounds Whether or not num_rows and skip_rows represents user-specific + * bounds + * @return The output table along with columns' metadata + */ + table_with_metadata read_chunk_internal(bool uses_custom_row_bounds); + + /** + * @brief Finalize the output table by adding empty columns for the non-selected columns in + * schema. + * + * @param out_metadata The output table metadata + * @param out_columns The columns for building the output table + * @return The output table along with columns' metadata + */ + table_with_metadata finalize_output(table_metadata& out_metadata, + std::vector>& out_columns); + + /** + * @brief Allocate data bufers for the output columns. * - * @param chunks All chunks to be decoded - * @param pages All pages to be decoded - * @param min_rows crop all rows below min_row - * @param total_rows Maximum number of rows to read - * @param uses_custom_row_bounds Whether or not num_rows and min_rows represents user-specific - * bounds - * @param has_lists Whether or not this data contains lists and requires - * a preprocess. + * @param skip_rows Crop all rows below skip_rows + * @param num_rows Maximum number of rows to read + * @param uses_custom_row_bounds Whether or not num_rows and skip_rows represents user-specific + * bounds */ - void preprocess_columns(hostdevice_vector& chunks, - hostdevice_vector& pages, - size_t min_row, - size_t total_rows, - bool uses_custom_row_bounds, - bool has_lists); + void allocate_columns(size_t skip_rows, size_t num_rows, bool uses_custom_row_bounds); /** * @brief Converts the page data and outputs to columns. 
* - * @param chunks List of column chunk descriptors - * @param pages List of page information - * @param page_nesting Page nesting array - * @param min_row Minimum number of rows from start - * @param total_rows Number of rows to output + * @param skip_rows Minimum number of rows from start + * @param num_rows Number of rows to output */ - void decode_page_data(hostdevice_vector& chunks, - hostdevice_vector& pages, - hostdevice_vector& page_nesting, - size_t min_row, - size_t total_rows); + void decode_page_data(size_t skip_rows, size_t num_rows); private: rmm::cuda_stream_view _stream; @@ -197,17 +219,30 @@ class reader::impl { // input columns to be processed std::vector _input_columns; - // output columns to be generated - std::vector _output_columns; - // _output_columns associated schema indices + + // Buffers for generating output columns + std::vector _output_buffers; + + // Buffers copied from `_output_buffers` after construction for reuse + std::vector _output_buffers_template; + + // _output_buffers associated schema indices std::vector _output_column_schemas; + // _output_buffers associated metadata + std::unique_ptr _output_metadata; + bool _strings_to_categorical = false; std::optional> _reader_column_schema; data_type _timestamp_type{type_id::EMPTY}; + + // Variables used for chunked reading: + cudf::io::parquet::gpu::file_intermediate_data _file_itm_data; + cudf::io::parquet::gpu::chunk_intermediate_data _chunk_itm_data; + std::vector _chunk_read_info; + std::size_t _chunk_read_limit{0}; + std::size_t _current_read_chunk{0}; + bool _file_preprocessed{false}; }; -} // namespace parquet -} // namespace detail -} // namespace io -} // namespace cudf +} // namespace cudf::io::detail::parquet diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp new file mode 100644 index 00000000000..7090df2cae0 --- /dev/null +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -0,0 +1,629 @@ +/* + * Copyright (c) 2022, 
NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "reader_impl_helpers.hpp" + +#include +#include + +namespace cudf::io::detail::parquet { + +namespace { + +ConvertedType logical_type_to_converted_type(LogicalType const& logical) +{ + if (logical.isset.STRING) { + return parquet::UTF8; + } else if (logical.isset.MAP) { + return parquet::MAP; + } else if (logical.isset.LIST) { + return parquet::LIST; + } else if (logical.isset.ENUM) { + return parquet::ENUM; + } else if (logical.isset.DECIMAL) { + return parquet::DECIMAL; // TODO set decimal values + } else if (logical.isset.DATE) { + return parquet::DATE; + } else if (logical.isset.TIME) { + if (logical.TIME.unit.isset.MILLIS) + return parquet::TIME_MILLIS; + else if (logical.TIME.unit.isset.MICROS) + return parquet::TIME_MICROS; + } else if (logical.isset.TIMESTAMP) { + if (logical.TIMESTAMP.unit.isset.MILLIS) + return parquet::TIMESTAMP_MILLIS; + else if (logical.TIMESTAMP.unit.isset.MICROS) + return parquet::TIMESTAMP_MICROS; + } else if (logical.isset.INTEGER) { + switch (logical.INTEGER.bitWidth) { + case 8: return logical.INTEGER.isSigned ? INT_8 : UINT_8; + case 16: return logical.INTEGER.isSigned ? INT_16 : UINT_16; + case 32: return logical.INTEGER.isSigned ? INT_32 : UINT_32; + case 64: return logical.INTEGER.isSigned ? 
INT_64 : UINT_64; + default: break; + } + } else if (logical.isset.UNKNOWN) { + return parquet::NA; + } else if (logical.isset.JSON) { + return parquet::JSON; + } else if (logical.isset.BSON) { + return parquet::BSON; + } + return parquet::UNKNOWN; +} + +} // namespace + +/** + * @brief Function that translates Parquet datatype to cuDF type enum + */ +type_id to_type_id(SchemaElement const& schema, + bool strings_to_categorical, + type_id timestamp_type_id) +{ + parquet::Type const physical = schema.type; + parquet::LogicalType const logical_type = schema.logical_type; + parquet::ConvertedType converted_type = schema.converted_type; + int32_t decimal_scale = schema.decimal_scale; + + // Logical type used for actual data interpretation; the legacy converted type + // is superceded by 'logical' type whenever available. + auto const inferred_converted_type = logical_type_to_converted_type(logical_type); + if (inferred_converted_type != parquet::UNKNOWN) converted_type = inferred_converted_type; + if (inferred_converted_type == parquet::DECIMAL && decimal_scale == 0) + decimal_scale = schema.logical_type.DECIMAL.scale; + + switch (converted_type) { + case parquet::UINT_8: return type_id::UINT8; + case parquet::INT_8: return type_id::INT8; + case parquet::UINT_16: return type_id::UINT16; + case parquet::INT_16: return type_id::INT16; + case parquet::UINT_32: return type_id::UINT32; + case parquet::UINT_64: return type_id::UINT64; + case parquet::DATE: return type_id::TIMESTAMP_DAYS; + case parquet::TIME_MILLIS: return type_id::DURATION_MILLISECONDS; + case parquet::TIME_MICROS: return type_id::DURATION_MICROSECONDS; + case parquet::TIMESTAMP_MILLIS: + return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id + : type_id::TIMESTAMP_MILLISECONDS; + case parquet::TIMESTAMP_MICROS: + return (timestamp_type_id != type_id::EMPTY) ? 
timestamp_type_id + : type_id::TIMESTAMP_MICROSECONDS; + case parquet::DECIMAL: + if (physical == parquet::INT32) { return type_id::DECIMAL32; } + if (physical == parquet::INT64) { return type_id::DECIMAL64; } + if (physical == parquet::FIXED_LEN_BYTE_ARRAY) { + if (schema.type_length <= static_cast(sizeof(int32_t))) { + return type_id::DECIMAL32; + } + if (schema.type_length <= static_cast(sizeof(int64_t))) { + return type_id::DECIMAL64; + } + if (schema.type_length <= static_cast(sizeof(__int128_t))) { + return type_id::DECIMAL128; + } + } + CUDF_FAIL("Invalid representation of decimal type"); + break; + + // maps are just List>. + case parquet::MAP: + case parquet::LIST: return type_id::LIST; + case parquet::NA: return type_id::STRING; + // return type_id::EMPTY; //TODO(kn): enable after Null/Empty column support + default: break; + } + + if (inferred_converted_type == parquet::UNKNOWN and physical == parquet::INT64 and + logical_type.TIMESTAMP.unit.isset.NANOS) { + return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id + : type_id::TIMESTAMP_NANOSECONDS; + } + + if (inferred_converted_type == parquet::UNKNOWN and physical == parquet::INT64 and + logical_type.TIME.unit.isset.NANOS) { + return type_id::DURATION_NANOSECONDS; + } + + // is it simply a struct? + if (schema.is_struct()) { return type_id::STRUCT; } + + // Physical storage type supported by Parquet; controls the on-disk storage + // format in combination with the encoding type. + switch (physical) { + case parquet::BOOLEAN: return type_id::BOOL8; + case parquet::INT32: return type_id::INT32; + case parquet::INT64: return type_id::INT64; + case parquet::FLOAT: return type_id::FLOAT32; + case parquet::DOUBLE: return type_id::FLOAT64; + case parquet::BYTE_ARRAY: + case parquet::FIXED_LEN_BYTE_ARRAY: + // Can be mapped to INT32 (32-bit hash) or STRING + return strings_to_categorical ? type_id::INT32 : type_id::STRING; + case parquet::INT96: + return (timestamp_type_id != type_id::EMPTY) ? 
timestamp_type_id + : type_id::TIMESTAMP_NANOSECONDS; + default: break; + } + + return type_id::EMPTY; +} + +metadata::metadata(datasource* source) +{ + constexpr auto header_len = sizeof(file_header_s); + constexpr auto ender_len = sizeof(file_ender_s); + + const auto len = source->size(); + const auto header_buffer = source->host_read(0, header_len); + const auto header = reinterpret_cast(header_buffer->data()); + const auto ender_buffer = source->host_read(len - ender_len, ender_len); + const auto ender = reinterpret_cast(ender_buffer->data()); + CUDF_EXPECTS(len > header_len + ender_len, "Incorrect data source"); + CUDF_EXPECTS(header->magic == parquet_magic && ender->magic == parquet_magic, + "Corrupted header or footer"); + CUDF_EXPECTS(ender->footer_len != 0 && ender->footer_len <= (len - header_len - ender_len), + "Incorrect footer length"); + + const auto buffer = source->host_read(len - ender->footer_len - ender_len, ender->footer_len); + CompactProtocolReader cp(buffer->data(), ender->footer_len); + CUDF_EXPECTS(cp.read(this), "Cannot parse metadata"); + CUDF_EXPECTS(cp.InitSchema(this), "Cannot initialize schema"); +} + +std::vector aggregate_reader_metadata::metadatas_from_sources( + std::vector> const& sources) +{ + std::vector metadatas; + std::transform( + sources.cbegin(), sources.cend(), std::back_inserter(metadatas), [](auto const& source) { + return metadata(source.get()); + }); + return metadatas; +} + +std::vector> +aggregate_reader_metadata::collect_keyval_metadata() const +{ + std::vector> kv_maps; + std::transform(per_file_metadata.cbegin(), + per_file_metadata.cend(), + std::back_inserter(kv_maps), + [](auto const& pfm) { + std::unordered_map kv_map; + std::transform(pfm.key_value_metadata.cbegin(), + pfm.key_value_metadata.cend(), + std::inserter(kv_map, kv_map.end()), + [](auto const& kv) { + return std::pair{kv.key, kv.value}; + }); + return kv_map; + }); + + return kv_maps; +} + +size_type aggregate_reader_metadata::calc_num_rows() 
const +{ + return std::accumulate( + per_file_metadata.begin(), per_file_metadata.end(), 0, [](auto& sum, auto& pfm) { + return sum + pfm.num_rows; + }); +} + +size_type aggregate_reader_metadata::calc_num_row_groups() const +{ + return std::accumulate( + per_file_metadata.begin(), per_file_metadata.end(), 0, [](auto& sum, auto& pfm) { + return sum + pfm.row_groups.size(); + }); +} + +aggregate_reader_metadata::aggregate_reader_metadata( + std::vector> const& sources) + : per_file_metadata(metadatas_from_sources(sources)), + keyval_maps(collect_keyval_metadata()), + num_rows(calc_num_rows()), + num_row_groups(calc_num_row_groups()) +{ + if (per_file_metadata.size() > 0) { + auto const& first_meta = per_file_metadata.front(); + auto const num_cols = + first_meta.row_groups.size() > 0 ? first_meta.row_groups.front().columns.size() : 0; + auto const& schema = first_meta.schema; + + // Verify that the input files have matching numbers of columns and schema. + for (auto const& pfm : per_file_metadata) { + if (pfm.row_groups.size() > 0) { + CUDF_EXPECTS(num_cols == pfm.row_groups.front().columns.size(), + "All sources must have the same number of columns"); + } + CUDF_EXPECTS(schema == pfm.schema, "All sources must have the same schema"); + } + } +} + +RowGroup const& aggregate_reader_metadata::get_row_group(size_type row_group_index, + size_type src_idx) const +{ + CUDF_EXPECTS(src_idx >= 0 && src_idx < static_cast(per_file_metadata.size()), + "invalid source index"); + return per_file_metadata[src_idx].row_groups[row_group_index]; +} + +ColumnChunkMetaData const& aggregate_reader_metadata::get_column_metadata(size_type row_group_index, + size_type src_idx, + int schema_idx) const +{ + auto col = std::find_if( + per_file_metadata[src_idx].row_groups[row_group_index].columns.begin(), + per_file_metadata[src_idx].row_groups[row_group_index].columns.end(), + [schema_idx](ColumnChunk const& col) { return col.schema_idx == schema_idx ? 
true : false; }); + CUDF_EXPECTS(col != std::end(per_file_metadata[src_idx].row_groups[row_group_index].columns), + "Found no metadata for schema index"); + return col->meta_data; +} + +std::string aggregate_reader_metadata::get_pandas_index() const +{ + // Assumes that all input files have the same metadata + // TODO: verify this assumption + auto it = keyval_maps[0].find("pandas"); + if (it != keyval_maps[0].end()) { + // Captures a list of quoted strings found inside square brackets after `"index_columns":` + // Inside quotes supports newlines, brackets, escaped quotes, etc. + // One-liner regex: + // "index_columns"\s*:\s*\[\s*((?:"(?:|(?:.*?(?![^\\]")).?)[^\\]?",?\s*)*)\] + // Documented below. + std::regex index_columns_expr{ + R"("index_columns"\s*:\s*\[\s*)" // match preamble, opening square bracket, whitespace + R"(()" // Open first capturing group + R"((?:")" // Open non-capturing group match opening quote + R"((?:|(?:.*?(?![^\\]")).?))" // match empty string or anything between quotes + R"([^\\]?")" // Match closing non-escaped quote + R"(,?\s*)" // Match optional comma and whitespace + R"()*)" // Close non-capturing group and repeat 0 or more times + R"())" // Close first capturing group + R"(\])" // Match closing square brackets + }; + std::smatch sm; + if (std::regex_search(it->second, sm, index_columns_expr)) { return sm[1].str(); } + } + return ""; +} + +std::vector aggregate_reader_metadata::get_pandas_index_names() const +{ + std::vector names; + auto str = get_pandas_index(); + if (str.length() != 0) { + std::regex index_name_expr{R"(\"((?:\\.|[^\"])*)\")"}; + std::smatch sm; + while (std::regex_search(str, sm, index_name_expr)) { + if (sm.size() == 2) { // 2 = whole match, first item + if (std::find(names.begin(), names.end(), sm[1].str()) == names.end()) { + std::regex esc_quote{R"(\\")"}; + names.emplace_back(std::regex_replace(sm[1].str(), esc_quote, R"(")")); + } + } + str = sm.suffix(); + } + } + return names; +} + +std::tuple> 
+aggregate_reader_metadata::select_row_groups( + host_span const> row_group_indices, + size_type row_start, + size_type row_count) const +{ + std::vector selection; + + if (!row_group_indices.empty()) { + CUDF_EXPECTS(row_group_indices.size() == per_file_metadata.size(), + "Must specify row groups for each source"); + + row_count = 0; + for (size_t src_idx = 0; src_idx < row_group_indices.size(); ++src_idx) { + for (auto const& rowgroup_idx : row_group_indices[src_idx]) { + CUDF_EXPECTS( + rowgroup_idx >= 0 && + rowgroup_idx < static_cast(per_file_metadata[src_idx].row_groups.size()), + "Invalid rowgroup index"); + selection.emplace_back(rowgroup_idx, row_count, src_idx); + row_count += get_row_group(rowgroup_idx, src_idx).num_rows; + } + } + + return {row_start, row_count, std::move(selection)}; + } + + row_start = std::max(row_start, 0); + if (row_count < 0) { + row_count = std::min(get_num_rows(), std::numeric_limits::max()); + } + row_count = std::min(row_count, get_num_rows() - row_start); + CUDF_EXPECTS(row_count >= 0, "Invalid row count"); + CUDF_EXPECTS(row_start <= get_num_rows(), "Invalid row start"); + + size_type count = 0; + for (size_t src_idx = 0; src_idx < per_file_metadata.size(); ++src_idx) { + for (size_t rg_idx = 0; rg_idx < per_file_metadata[src_idx].row_groups.size(); ++rg_idx) { + auto const chunk_start_row = count; + count += get_row_group(rg_idx, src_idx).num_rows; + if (count > row_start || count == 0) { + selection.emplace_back(rg_idx, chunk_start_row, src_idx); + } + if (count >= row_start + row_count) { break; } + } + } + + return {row_start, row_count, std::move(selection)}; +} + +std::tuple, std::vector, std::vector> +aggregate_reader_metadata::select_columns(std::optional> const& use_names, + bool include_index, + bool strings_to_categorical, + type_id timestamp_type_id) const +{ + auto find_schema_child = [&](SchemaElement const& schema_elem, std::string const& name) { + auto const& col_schema_idx = + 
std::find_if(schema_elem.children_idx.cbegin(), + schema_elem.children_idx.cend(), + [&](size_t col_schema_idx) { return get_schema(col_schema_idx).name == name; }); + + return (col_schema_idx != schema_elem.children_idx.end()) + ? static_cast(*col_schema_idx) + : -1; + }; + + std::vector output_columns; + std::vector input_columns; + std::vector nesting; + + // Return true if column path is valid. e.g. if the path is {"struct1", "child1"}, then it is + // valid if "struct1.child1" exists in this file's schema. If "struct1" exists but "child1" is + // not a child of "struct1" then the function will return false for "struct1" + std::function&, bool)> + build_column = [&](column_name_info const* col_name_info, + int schema_idx, + std::vector& out_col_array, + bool has_list_parent) { + if (schema_idx < 0) { return false; } + auto const& schema_elem = get_schema(schema_idx); + + // if schema_elem is a stub then it does not exist in the column_name_info and column_buffer + // hierarchy. So continue on + if (schema_elem.is_stub()) { + // is this legit? + CUDF_EXPECTS(schema_elem.num_children == 1, "Unexpected number of children for stub"); + auto child_col_name_info = (col_name_info) ? &col_name_info->children[0] : nullptr; + return build_column( + child_col_name_info, schema_elem.children_idx[0], out_col_array, has_list_parent); + } + + // if we're at the root, this is a new output column + auto const col_type = schema_elem.is_one_level_list() + ? 
type_id::LIST + : to_type_id(schema_elem, strings_to_categorical, timestamp_type_id); + auto const dtype = to_data_type(col_type, schema_elem); + + column_buffer output_col(dtype, schema_elem.repetition_type == OPTIONAL); + if (has_list_parent) { output_col.user_data |= PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT; } + // store the index of this element if inserted in out_col_array + nesting.push_back(static_cast(out_col_array.size())); + output_col.name = schema_elem.name; + + // build each child + bool path_is_valid = false; + if (col_name_info == nullptr or col_name_info->children.empty()) { + // add all children of schema_elem. + // At this point, we can no longer pass a col_name_info to build_column + for (int idx = 0; idx < schema_elem.num_children; idx++) { + path_is_valid |= build_column(nullptr, + schema_elem.children_idx[idx], + output_col.children, + has_list_parent || col_type == type_id::LIST); + } + } else { + for (size_t idx = 0; idx < col_name_info->children.size(); idx++) { + path_is_valid |= + build_column(&col_name_info->children[idx], + find_schema_child(schema_elem, col_name_info->children[idx].name), + output_col.children, + has_list_parent || col_type == type_id::LIST); + } + } + + // if I have no children, we're at a leaf and I'm an input column (that is, one with actual + // data stored) so add me to the list. 
+ if (schema_elem.num_children == 0) { + input_column_info& input_col = input_columns.emplace_back( + input_column_info{schema_idx, schema_elem.name, schema_elem.max_repetition_level > 0}); + + // set up child output column for one-level encoding list + if (schema_elem.is_one_level_list()) { + // determine the element data type + auto const element_type = + to_type_id(schema_elem, strings_to_categorical, timestamp_type_id); + auto const element_dtype = to_data_type(element_type, schema_elem); + + column_buffer element_col(element_dtype, schema_elem.repetition_type == OPTIONAL); + if (has_list_parent || col_type == type_id::LIST) { + element_col.user_data |= PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT; + } + // store the index of this element + nesting.push_back(static_cast(output_col.children.size())); + // TODO: not sure if we should assign a name or leave it blank + element_col.name = "element"; + + output_col.children.push_back(std::move(element_col)); + } + + std::copy(nesting.cbegin(), nesting.cend(), std::back_inserter(input_col.nesting)); + + // pop off the extra nesting element. + if (schema_elem.is_one_level_list()) { nesting.pop_back(); } + + path_is_valid = true; // If we're able to reach leaf then path is valid + } + + if (path_is_valid) { out_col_array.push_back(std::move(output_col)); } + + nesting.pop_back(); + return path_is_valid; + }; + + std::vector output_column_schemas; + + // + // there is not necessarily a 1:1 mapping between input columns and output columns. + // For example, parquet does not explicitly store a ColumnChunkDesc for struct columns. + // The "structiness" is simply implied by the schema. For example, this schema: + // required group field_id=1 name { + // required binary field_id=2 firstname (String); + // required binary field_id=3 middlename (String); + // required binary field_id=4 lastname (String); + // } + // will only contain 3 internal columns of data (firstname, middlename, lastname). 
But of + // course "name" is ultimately the struct column we want to return. + // + // "firstname", "middlename" and "lastname" represent the input columns in the file that we + // process to produce the final cudf "name" column. + // + // A user can ask for a single field out of the struct e.g. firstname. + // In this case they'll pass a fully qualified name to the schema element like + // ["name", "firstname"] + // + auto const& root = get_schema(0); + if (not use_names.has_value()) { + for (auto const& schema_idx : root.children_idx) { + build_column(nullptr, schema_idx, output_columns, false); + output_column_schemas.push_back(schema_idx); + } + } else { + struct path_info { + std::string full_path; + int schema_idx; + }; + + // Convert schema into a vector of every possible path + std::vector all_paths; + std::function add_path = [&](std::string path_till_now, + int schema_idx) { + auto const& schema_elem = get_schema(schema_idx); + std::string curr_path = path_till_now + schema_elem.name; + all_paths.push_back({curr_path, schema_idx}); + for (auto const& child_idx : schema_elem.children_idx) { + add_path(curr_path + ".", child_idx); + } + }; + for (auto const& child_idx : get_schema(0).children_idx) { + add_path("", child_idx); + } + + // Find which of the selected paths are valid and get their schema index + std::vector valid_selected_paths; + for (auto const& selected_path : *use_names) { + auto found_path = + std::find_if(all_paths.begin(), all_paths.end(), [&](path_info& valid_path) { + return valid_path.full_path == selected_path; + }); + if (found_path != all_paths.end()) { + valid_selected_paths.push_back({selected_path, found_path->schema_idx}); + } + } + + // Now construct paths as vector of strings for further consumption + std::vector> use_names3; + std::transform(valid_selected_paths.begin(), + valid_selected_paths.end(), + std::back_inserter(use_names3), + [&](path_info const& valid_path) { + auto schema_idx = valid_path.schema_idx; + std::vector 
result_path; + do { + SchemaElement const& elem = get_schema(schema_idx); + result_path.push_back(elem.name); + schema_idx = elem.parent_idx; + } while (schema_idx > 0); + return std::vector(result_path.rbegin(), result_path.rend()); + }); + + std::vector selected_columns; + if (include_index) { + std::vector index_names = get_pandas_index_names(); + std::transform(index_names.cbegin(), + index_names.cend(), + std::back_inserter(selected_columns), + [](std::string const& name) { return column_name_info(name); }); + } + // Merge the vector use_names into a set of hierarchical column_name_info objects + /* This is because if we have columns like this: + * col1 + * / \ + * s3 f4 + * / \ + * f5 f6 + * + * there may be common paths in use_names like: + * {"col1", "s3", "f5"}, {"col1", "f4"} + * which means we want the output to contain + * col1 + * / \ + * s3 f4 + * / + * f5 + * + * rather than + * col1 col1 + * | | + * s3 f4 + * | + * f5 + */ + for (auto const& path : use_names3) { + auto array_to_find_in = &selected_columns; + for (size_t depth = 0; depth < path.size(); ++depth) { + // Check if the path exists in our selected_columns and if not, add it. + auto const& name_to_find = path[depth]; + auto found_col = std::find_if( + array_to_find_in->begin(), + array_to_find_in->end(), + [&name_to_find](column_name_info const& col) { return col.name == name_to_find; }); + if (found_col == array_to_find_in->end()) { + auto& col = array_to_find_in->emplace_back(name_to_find); + array_to_find_in = &col.children; + } else { + // Path exists. go down further. 
+ array_to_find_in = &found_col->children; + } + } + } + for (auto& col : selected_columns) { + auto const& top_level_col_schema_idx = find_schema_child(root, col.name); + bool valid_column = build_column(&col, top_level_col_schema_idx, output_columns, false); + if (valid_column) output_column_schemas.push_back(top_level_col_schema_idx); + } + } + + return std::make_tuple( + std::move(input_columns), std::move(output_columns), std::move(output_column_schemas)); +} + +} // namespace cudf::io::detail::parquet diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp new file mode 100644 index 00000000000..6fa86a77e46 --- /dev/null +++ b/cpp/src/io/parquet/reader_impl_helpers.hpp @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "compact_protocol_reader.hpp" +#include "parquet_gpu.hpp" + +#include +#include +#include + +#include +#include + +namespace cudf::io::detail::parquet { + +using namespace cudf::io::parquet; + +/** + * @brief Function that translates Parquet datatype to cuDF type enum + */ +[[nodiscard]] type_id to_type_id(SchemaElement const& schema, + bool strings_to_categorical, + type_id timestamp_type_id); + +/** + * @brief Converts cuDF type enum to column logical type + */ +[[nodiscard]] inline data_type to_data_type(type_id t_id, SchemaElement const& schema) +{ + return t_id == type_id::DECIMAL32 || t_id == type_id::DECIMAL64 || t_id == type_id::DECIMAL128 + ? data_type{t_id, numeric::scale_type{-schema.decimal_scale}} + : data_type{t_id}; +} + +/** + * @brief The row_group_info class + */ +struct row_group_info { + size_type const index; + size_t const start_row; // TODO source index + size_type const source_index; + row_group_info(size_type index, size_t start_row, size_type source_index) + : index(index), start_row(start_row), source_index(source_index) + { + } +}; + +/** + * @brief Class for parsing dataset metadata + */ +struct metadata : public FileMetaData { + explicit metadata(datasource* source); +}; + +class aggregate_reader_metadata { + std::vector per_file_metadata; + std::vector> keyval_maps; + size_type num_rows; + size_type num_row_groups; + + /** + * @brief Create a metadata object from each element in the source vector + */ + static std::vector metadatas_from_sources( + std::vector> const& sources); + + /** + * @brief Collect the keyvalue maps from each per-file metadata object into a vector of maps. 
+ */ + [[nodiscard]] std::vector> collect_keyval_metadata() + const; + + /** + * @brief Sums up the number of rows of each source + */ + [[nodiscard]] size_type calc_num_rows() const; + + /** + * @brief Sums up the number of row groups of each source + */ + [[nodiscard]] size_type calc_num_row_groups() const; + + public: + aggregate_reader_metadata(std::vector> const& sources); + + [[nodiscard]] RowGroup const& get_row_group(size_type row_group_index, size_type src_idx) const; + + [[nodiscard]] ColumnChunkMetaData const& get_column_metadata(size_type row_group_index, + size_type src_idx, + int schema_idx) const; + + [[nodiscard]] auto get_num_rows() const { return num_rows; } + + [[nodiscard]] auto get_num_row_groups() const { return num_row_groups; } + + [[nodiscard]] auto const& get_schema(int schema_idx) const + { + return per_file_metadata[0].schema[schema_idx]; + } + + [[nodiscard]] auto const& get_key_value_metadata() const { return keyval_maps; } + + /** + * @brief Gets the concrete nesting depth of output cudf columns + * + * @param schema_index Schema index of the input column + * + * @return comma-separated index column names in quotes + */ + [[nodiscard]] inline int get_output_nesting_depth(int schema_index) const + { + auto& pfm = per_file_metadata[0]; + int depth = 0; + + // walk upwards, skipping repeated fields + while (schema_index > 0) { + if (!pfm.schema[schema_index].is_stub()) { depth++; } + // schema of one-level encoding list doesn't contain nesting information, so we need to + // manually add an extra nesting level + if (pfm.schema[schema_index].is_one_level_list()) { depth++; } + schema_index = pfm.schema[schema_index].parent_idx; + } + return depth; + } + + /** + * @brief Extracts the pandas "index_columns" section + * + * PANDAS adds its own metadata to the key_value section when writing out the + * dataframe to a file to aid in exact reconstruction. The JSON-formatted + * metadata contains the index column(s) and PANDA-specific datatypes. 
+ * + * @return comma-separated index column names in quotes + */ + [[nodiscard]] std::string get_pandas_index() const; + + /** + * @brief Extracts the column name(s) used for the row indexes in a dataframe + * + * @param names List of column names to load, where index column name(s) will be added + */ + [[nodiscard]] std::vector get_pandas_index_names() const; + + /** + * @brief Filters and reduces down to a selection of row groups + * + * The input `row_start` and `row_count` parameters will be recomputed and output as the valid + * values based on the input row group list. + * + * @param row_group_indices Lists of row groups to read, one per source + * @param row_start Starting row of the selection + * @param row_count Total number of rows selected + * + * @return A tuple of corrected row_start, row_count and list of row group indexes and its + * starting row + */ + [[nodiscard]] std::tuple> select_row_groups( + host_span const> row_group_indices, + size_type row_start, + size_type row_count) const; + + /** + * @brief Filters and reduces down to a selection of columns + * + * @param use_names List of paths of column names to select; `nullopt` if user did not select + * columns to read + * @param include_index Whether to always include the PANDAS index column(s) + * @param strings_to_categorical Type conversion parameter + * @param timestamp_type_id Type conversion parameter + * + * @return input column information, output column information, list of output column schema + * indices + */ + [[nodiscard]] std:: + tuple, std::vector, std::vector> + select_columns(std::optional> const& use_names, + bool include_index, + bool strings_to_categorical, + type_id timestamp_type_id) const; +}; + +} // namespace cudf::io::detail::parquet diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu new file mode 100644 index 00000000000..38fce7d3263 --- /dev/null +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -0,0 +1,1527 @@ 
+/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "reader_impl.hpp" + +#include +#include +#include + +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cudf::io::detail::parquet { +namespace { + +/** + * @brief Generate depth remappings for repetition and definition levels. + * + * When dealing with columns that contain lists, we must examine incoming + * repetition and definition level pairs to determine what range of output nesting + * is indicated when adding new values. 
This function generates the mappings of + * the R/D levels to those start/end bounds + * + * @param remap Maps column schema index to the R/D remapping vectors for that column + * @param src_col_schema The column schema to generate the new mapping for + * @param md File metadata information + */ +void generate_depth_remappings(std::map, std::vector>>& remap, + int src_col_schema, + aggregate_reader_metadata const& md) +{ + // already generated for this level + if (remap.find(src_col_schema) != remap.end()) { return; } + auto schema = md.get_schema(src_col_schema); + int max_depth = md.get_output_nesting_depth(src_col_schema); + + CUDF_EXPECTS(remap.find(src_col_schema) == remap.end(), + "Attempting to remap a schema more than once"); + auto inserted = + remap.insert(std::pair, std::vector>>{src_col_schema, {}}); + auto& depth_remap = inserted.first->second; + + std::vector& rep_depth_remap = (depth_remap.first); + rep_depth_remap.resize(schema.max_repetition_level + 1); + std::vector& def_depth_remap = (depth_remap.second); + def_depth_remap.resize(schema.max_definition_level + 1); + + // the key: + // for incoming level values R/D + // add values starting at the shallowest nesting level X has repetition level R + // until you reach the deepest nesting level Y that corresponds to the repetition level R1 + // held by the nesting level that has definition level D + // + // Example: a 3 level struct with a list at the bottom + // + // R / D Depth + // level0 0 / 1 0 + // level1 0 / 2 1 + // level2 0 / 3 2 + // list 0 / 3 3 + // element 1 / 4 4 + // + // incoming R/D : 0, 0 -> add values from depth 0 to 3 (def level 0 always maps to depth 0) + // incoming R/D : 0, 1 -> add values from depth 0 to 3 + // incoming R/D : 0, 2 -> add values from depth 0 to 3 + // incoming R/D : 1, 4 -> add values from depth 4 to 4 + // + // Note : the -validity- of values is simply checked by comparing the incoming D value against the + // D value of the given nesting level (incoming D >= 
the D for the nesting level == valid, + // otherwise NULL). The tricky part is determining what nesting levels to add values at. + // + // For schemas with no repetition level (no lists), X is always 0 and Y is always max nesting + // depth. + // + + // compute "X" from above + for (int s_idx = schema.max_repetition_level; s_idx >= 0; s_idx--) { + auto find_shallowest = [&](int r) { + int shallowest = -1; + int cur_depth = max_depth - 1; + int schema_idx = src_col_schema; + while (schema_idx > 0) { + auto cur_schema = md.get_schema(schema_idx); + if (cur_schema.max_repetition_level == r) { + // if this is a repeated field, map it one level deeper + shallowest = cur_schema.is_stub() ? cur_depth + 1 : cur_depth; + } + // if it's one-level encoding list + else if (cur_schema.is_one_level_list()) { + shallowest = cur_depth - 1; + } + if (!cur_schema.is_stub()) { cur_depth--; } + schema_idx = cur_schema.parent_idx; + } + return shallowest; + }; + rep_depth_remap[s_idx] = find_shallowest(s_idx); + } + + // compute "Y" from above + for (int s_idx = schema.max_definition_level; s_idx >= 0; s_idx--) { + auto find_deepest = [&](int d) { + SchemaElement prev_schema; + int schema_idx = src_col_schema; + int r1 = 0; + while (schema_idx > 0) { + SchemaElement cur_schema = md.get_schema(schema_idx); + if (cur_schema.max_definition_level == d) { + // if this is a repeated field, map it one level deeper + r1 = cur_schema.is_stub() ? prev_schema.max_repetition_level + : cur_schema.max_repetition_level; + break; + } + prev_schema = cur_schema; + schema_idx = cur_schema.parent_idx; + } + + // we now know R1 from above. return the deepest nesting level that has the + // same repetition level + schema_idx = src_col_schema; + int depth = max_depth - 1; + while (schema_idx > 0) { + SchemaElement cur_schema = md.get_schema(schema_idx); + if (cur_schema.max_repetition_level == r1) { + // if this is a repeated field, map it one level deeper + depth = cur_schema.is_stub() ? 
depth + 1 : depth; + break; + } + if (!cur_schema.is_stub()) { depth--; } + prev_schema = cur_schema; + schema_idx = cur_schema.parent_idx; + } + return depth; + }; + def_depth_remap[s_idx] = find_deepest(s_idx); + } +} + +/** + * @brief Return the required number of bits to store a value. + */ +template +[[nodiscard]] T required_bits(uint32_t max_level) +{ + return static_cast(CompactProtocolReader::NumRequiredBits(max_level)); +} + +/** + * @brief Converts cuDF units to Parquet units. + * + * @return A tuple of Parquet type width, Parquet clock rate and Parquet decimal type. + */ +[[nodiscard]] std::tuple conversion_info(type_id column_type_id, + type_id timestamp_type_id, + parquet::Type physical, + int8_t converted, + int32_t length) +{ + int32_t type_width = (physical == parquet::FIXED_LEN_BYTE_ARRAY) ? length : 0; + int32_t clock_rate = 0; + if (column_type_id == type_id::INT8 or column_type_id == type_id::UINT8) { + type_width = 1; // I32 -> I8 + } else if (column_type_id == type_id::INT16 or column_type_id == type_id::UINT16) { + type_width = 2; // I32 -> I16 + } else if (column_type_id == type_id::INT32) { + type_width = 4; // str -> hash32 + } else if (is_chrono(data_type{column_type_id})) { + clock_rate = to_clockrate(timestamp_type_id); + } + + int8_t converted_type = converted; + if (converted_type == parquet::DECIMAL && column_type_id != type_id::FLOAT64 && + not cudf::is_fixed_point(data_type{column_type_id})) { + converted_type = parquet::UNKNOWN; // Not converting to float64 or decimal + } + return std::make_tuple(type_width, clock_rate, converted_type); +} + +/** + * @brief Reads compressed page data to device memory. 
+ * + * @param sources Dataset sources + * @param page_data Buffers to hold compressed page data for each chunk + * @param chunks List of column chunk descriptors + * @param begin_chunk Index of first column chunk to read + * @param end_chunk Index after the last column chunk to read + * @param column_chunk_offsets File offset for all chunks + * @param chunk_source_map Association between each column chunk and its source + * @param stream CUDA stream used for device memory operations and kernel launches + * + * @return A future object for reading synchronization + */ +[[nodiscard]] std::future read_column_chunks_async( + std::vector> const& sources, + std::vector>& page_data, + hostdevice_vector& chunks, + size_t begin_chunk, + size_t end_chunk, + const std::vector& column_chunk_offsets, + std::vector const& chunk_source_map, + rmm::cuda_stream_view stream) +{ + // Transfer chunk data, coalescing adjacent chunks + std::vector> read_tasks; + for (size_t chunk = begin_chunk; chunk < end_chunk;) { + const size_t io_offset = column_chunk_offsets[chunk]; + size_t io_size = chunks[chunk].compressed_size; + size_t next_chunk = chunk + 1; + const bool is_compressed = (chunks[chunk].codec != parquet::Compression::UNCOMPRESSED); + while (next_chunk < end_chunk) { + const size_t next_offset = column_chunk_offsets[next_chunk]; + const bool is_next_compressed = + (chunks[next_chunk].codec != parquet::Compression::UNCOMPRESSED); + if (next_offset != io_offset + io_size || is_next_compressed != is_compressed) { + // Can't merge if not contiguous or mixing compressed and uncompressed + // Not coalescing uncompressed with compressed chunks is so that compressed buffers can be + // freed earlier (immediately after decompression stage) to limit peak memory requirements + break; + } + io_size += chunks[next_chunk].compressed_size; + next_chunk++; + } + if (io_size != 0) { + auto& source = sources[chunk_source_map[chunk]]; + if (source->is_device_read_preferred(io_size)) { + auto 
buffer = rmm::device_buffer(io_size, stream); + auto fut_read_size = source->device_read_async( + io_offset, io_size, static_cast(buffer.data()), stream); + read_tasks.emplace_back(std::move(fut_read_size)); + page_data[chunk] = datasource::buffer::create(std::move(buffer)); + } else { + auto const buffer = source->host_read(io_offset, io_size); + page_data[chunk] = + datasource::buffer::create(rmm::device_buffer(buffer->data(), buffer->size(), stream)); + } + auto d_compdata = page_data[chunk]->data(); + do { + chunks[chunk].compressed_data = d_compdata; + d_compdata += chunks[chunk].compressed_size; + } while (++chunk != next_chunk); + } else { + chunk = next_chunk; + } + } + auto sync_fn = [](decltype(read_tasks) read_tasks) { + for (auto& task : read_tasks) { + task.wait(); + } + }; + return std::async(std::launch::deferred, sync_fn, std::move(read_tasks)); +} + +/** + * @brief Return the number of total pages from the given column chunks. + * + * @param chunks List of column chunk descriptors + * @param stream CUDA stream used for device memory operations and kernel launches + * + * @return The total number of pages + */ +[[nodiscard]] size_t count_page_headers(hostdevice_vector& chunks, + rmm::cuda_stream_view stream) +{ + size_t total_pages = 0; + + chunks.host_to_device(stream); + gpu::DecodePageHeaders(chunks.device_ptr(), chunks.size(), stream); + chunks.device_to_host(stream, true); + + for (size_t c = 0; c < chunks.size(); c++) { + total_pages += chunks[c].num_data_pages + chunks[c].num_dict_pages; + } + + return total_pages; +} + +/** + * @brief Decode the page information from the given column chunks. 
+ * + * @param chunks List of column chunk descriptors + * @param pages List of page information + * @param stream CUDA stream used for device memory operations and kernel launches + */ +void decode_page_headers(hostdevice_vector& chunks, + hostdevice_vector& pages, + rmm::cuda_stream_view stream) +{ + // IMPORTANT : if you change how pages are stored within a chunk (dist pages, then data pages), + // please update preprocess_nested_columns to reflect this. + for (size_t c = 0, page_count = 0; c < chunks.size(); c++) { + chunks[c].max_num_pages = chunks[c].num_data_pages + chunks[c].num_dict_pages; + chunks[c].page_info = pages.device_ptr(page_count); + page_count += chunks[c].max_num_pages; + } + + chunks.host_to_device(stream); + gpu::DecodePageHeaders(chunks.device_ptr(), chunks.size(), stream); + pages.device_to_host(stream, true); +} + +/** + * @brief Decompresses the page data, at page granularity. + * + * @param chunks List of column chunk descriptors + * @param pages List of page information + * @param stream CUDA stream used for device memory operations and kernel launches + * + * @return Device buffer to decompressed page data + */ +[[nodiscard]] rmm::device_buffer decompress_page_data( + hostdevice_vector& chunks, + hostdevice_vector& pages, + rmm::cuda_stream_view stream) +{ + auto for_each_codec_page = [&](parquet::Compression codec, const std::function& f) { + for (size_t c = 0, page_count = 0; c < chunks.size(); c++) { + const auto page_stride = chunks[c].max_num_pages; + if (chunks[c].codec == codec) { + for (int k = 0; k < page_stride; k++) { + f(page_count + k); + } + } + page_count += page_stride; + } + }; + + // Brotli scratch memory for decompressing + rmm::device_buffer debrotli_scratch; + + // Count the exact number of compressed pages + size_t num_comp_pages = 0; + size_t total_decomp_size = 0; + + struct codec_stats { + parquet::Compression compression_type = UNCOMPRESSED; + size_t num_pages = 0; + int32_t max_decompressed_size = 0; + 
size_t total_decomp_size = 0; + }; + + std::array codecs{codec_stats{parquet::GZIP}, + codec_stats{parquet::SNAPPY}, + codec_stats{parquet::BROTLI}, + codec_stats{parquet::ZSTD}}; + + auto is_codec_supported = [&codecs](int8_t codec) { + if (codec == parquet::UNCOMPRESSED) return true; + return std::find_if(codecs.begin(), codecs.end(), [codec](auto& cstats) { + return codec == cstats.compression_type; + }) != codecs.end(); + }; + CUDF_EXPECTS(std::all_of(chunks.begin(), + chunks.end(), + [&is_codec_supported](auto const& chunk) { + return is_codec_supported(chunk.codec); + }), + "Unsupported compression type"); + + for (auto& codec : codecs) { + for_each_codec_page(codec.compression_type, [&](size_t page) { + auto page_uncomp_size = pages[page].uncompressed_page_size; + total_decomp_size += page_uncomp_size; + codec.total_decomp_size += page_uncomp_size; + codec.max_decompressed_size = std::max(codec.max_decompressed_size, page_uncomp_size); + codec.num_pages++; + num_comp_pages++; + }); + if (codec.compression_type == parquet::BROTLI && codec.num_pages > 0) { + debrotli_scratch.resize(get_gpu_debrotli_scratch_size(codec.num_pages), stream); + } + } + + // Dispatch batches of pages to decompress for each codec + rmm::device_buffer decomp_pages(total_decomp_size, stream); + + std::vector> comp_in; + comp_in.reserve(num_comp_pages); + std::vector> comp_out; + comp_out.reserve(num_comp_pages); + + // vectors to save v2 def and rep level data, if any + std::vector> copy_in; + copy_in.reserve(num_comp_pages); + std::vector> copy_out; + copy_out.reserve(num_comp_pages); + + rmm::device_uvector comp_res(num_comp_pages, stream); + thrust::fill(rmm::exec_policy(stream), + comp_res.begin(), + comp_res.end(), + compression_result{0, compression_status::FAILURE}); + + size_t decomp_offset = 0; + int32_t start_pos = 0; + for (const auto& codec : codecs) { + if (codec.num_pages == 0) { continue; } + + for_each_codec_page(codec.compression_type, [&](size_t page_idx) { + auto 
const dst_base = static_cast(decomp_pages.data()) + decomp_offset; + auto& page = pages[page_idx]; + // offset will only be non-zero for V2 pages + auto const offset = page.def_lvl_bytes + page.rep_lvl_bytes; + // for V2 need to copy def and rep level info into place, and then offset the + // input and output buffers. otherwise we'd have to keep both the compressed + // and decompressed data. + if (offset != 0) { + copy_in.emplace_back(page.page_data, offset); + copy_out.emplace_back(dst_base, offset); + } + comp_in.emplace_back(page.page_data + offset, + static_cast(page.compressed_page_size - offset)); + comp_out.emplace_back(dst_base + offset, + static_cast(page.uncompressed_page_size - offset)); + page.page_data = dst_base; + decomp_offset += page.uncompressed_page_size; + }); + + host_span const> comp_in_view{comp_in.data() + start_pos, + codec.num_pages}; + auto const d_comp_in = cudf::detail::make_device_uvector_async(comp_in_view, stream); + host_span const> comp_out_view(comp_out.data() + start_pos, + codec.num_pages); + auto const d_comp_out = cudf::detail::make_device_uvector_async(comp_out_view, stream); + device_span d_comp_res_view(comp_res.data() + start_pos, codec.num_pages); + + switch (codec.compression_type) { + case parquet::GZIP: + gpuinflate(d_comp_in, d_comp_out, d_comp_res_view, gzip_header_included::YES, stream); + break; + case parquet::SNAPPY: + if (nvcomp_integration::is_stable_enabled()) { + nvcomp::batched_decompress(nvcomp::compression_type::SNAPPY, + d_comp_in, + d_comp_out, + d_comp_res_view, + codec.max_decompressed_size, + codec.total_decomp_size, + stream); + } else { + gpu_unsnap(d_comp_in, d_comp_out, d_comp_res_view, stream); + } + break; + case parquet::ZSTD: + nvcomp::batched_decompress(nvcomp::compression_type::ZSTD, + d_comp_in, + d_comp_out, + d_comp_res_view, + codec.max_decompressed_size, + codec.total_decomp_size, + stream); + break; + case parquet::BROTLI: + gpu_debrotli(d_comp_in, + d_comp_out, + d_comp_res_view, + 
debrotli_scratch.data(), + debrotli_scratch.size(), + stream); + break; + default: CUDF_FAIL("Unexpected decompression dispatch"); break; + } + start_pos += codec.num_pages; + } + + CUDF_EXPECTS(thrust::all_of(rmm::exec_policy(stream), + comp_res.begin(), + comp_res.end(), + [] __device__(auto const& res) { + return res.status == compression_status::SUCCESS; + }), + "Error during decompression"); + + // now copy the uncompressed V2 def and rep level data + if (not copy_in.empty()) { + auto const d_copy_in = cudf::detail::make_device_uvector_async(copy_in, stream); + auto const d_copy_out = cudf::detail::make_device_uvector_async(copy_out, stream); + + gpu_copy_uncompressed_blocks(d_copy_in, d_copy_out, stream); + stream.synchronize(); + } + + // Update the page information in device memory with the updated value of + // page_data; it now points to the uncompressed data buffer + pages.host_to_device(stream); + + return decomp_pages; +} + +} // namespace + +void reader::impl::allocate_nesting_info() +{ + auto const& chunks = _file_itm_data.chunks; + auto& pages = _file_itm_data.pages_info; + auto& page_nesting_info = _file_itm_data.page_nesting_info; + + // compute total # of page_nesting infos needed and allocate space. 
doing this in one + // buffer to keep it to a single gpu allocation + size_t const total_page_nesting_infos = std::accumulate( + chunks.host_ptr(), chunks.host_ptr() + chunks.size(), 0, [&](int total, auto& chunk) { + // the schema of the input column + auto const& schema = _metadata->get_schema(chunk.src_col_schema); + auto const per_page_nesting_info_size = max( + schema.max_definition_level + 1, _metadata->get_output_nesting_depth(chunk.src_col_schema)); + return total + (per_page_nesting_info_size * chunk.num_data_pages); + }); + + page_nesting_info = hostdevice_vector{total_page_nesting_infos, _stream}; + + // retrieve from the gpu so we can update + pages.device_to_host(_stream, true); + + // update pointers in the PageInfos + int target_page_index = 0; + int src_info_index = 0; + for (size_t idx = 0; idx < chunks.size(); idx++) { + int src_col_schema = chunks[idx].src_col_schema; + auto& schema = _metadata->get_schema(src_col_schema); + auto const per_page_nesting_info_size = std::max( + schema.max_definition_level + 1, _metadata->get_output_nesting_depth(src_col_schema)); + + // skip my dict pages + target_page_index += chunks[idx].num_dict_pages; + for (int p_idx = 0; p_idx < chunks[idx].num_data_pages; p_idx++) { + pages[target_page_index + p_idx].nesting = page_nesting_info.device_ptr() + src_info_index; + pages[target_page_index + p_idx].num_nesting_levels = per_page_nesting_info_size; + + src_info_index += per_page_nesting_info_size; + } + target_page_index += chunks[idx].num_data_pages; + } + + // copy back to the gpu + pages.host_to_device(_stream); + + // fill in + int nesting_info_index = 0; + std::map, std::vector>> depth_remapping; + for (size_t idx = 0; idx < chunks.size(); idx++) { + int src_col_schema = chunks[idx].src_col_schema; + + // schema of the input column + auto& schema = _metadata->get_schema(src_col_schema); + // real depth of the output cudf column hierarchy (1 == no nesting, 2 == 1 level, etc) + int max_depth = 
_metadata->get_output_nesting_depth(src_col_schema); + + // # of nesting infos stored per page for this column + auto const per_page_nesting_info_size = std::max(schema.max_definition_level + 1, max_depth); + + // if this column has lists, generate depth remapping + std::map, std::vector>> depth_remapping; + if (schema.max_repetition_level > 0) { + generate_depth_remappings(depth_remapping, src_col_schema, *_metadata); + } + + // fill in host-side nesting info + int schema_idx = src_col_schema; + auto cur_schema = _metadata->get_schema(schema_idx); + int cur_depth = max_depth - 1; + while (schema_idx > 0) { + // stub columns (basically the inner field of a list scheme element) are not real columns. + // we can ignore them for the purposes of output nesting info + if (!cur_schema.is_stub()) { + // initialize each page within the chunk + for (int p_idx = 0; p_idx < chunks[idx].num_data_pages; p_idx++) { + gpu::PageNestingInfo* pni = + &page_nesting_info[nesting_info_index + (p_idx * per_page_nesting_info_size)]; + + // if we have lists, set our start and end depth remappings + if (schema.max_repetition_level > 0) { + auto remap = depth_remapping.find(src_col_schema); + CUDF_EXPECTS(remap != depth_remapping.end(), + "Could not find depth remapping for schema"); + std::vector const& rep_depth_remap = (remap->second.first); + std::vector const& def_depth_remap = (remap->second.second); + + for (size_t m = 0; m < rep_depth_remap.size(); m++) { + pni[m].start_depth = rep_depth_remap[m]; + } + for (size_t m = 0; m < def_depth_remap.size(); m++) { + pni[m].end_depth = def_depth_remap[m]; + } + } + + // values indexed by output column index + pni[cur_depth].max_def_level = cur_schema.max_definition_level; + pni[cur_depth].max_rep_level = cur_schema.max_repetition_level; + pni[cur_depth].size = 0; + pni[cur_depth].type = + to_type_id(cur_schema, _strings_to_categorical, _timestamp_type.id()); + pni[cur_depth].nullable = cur_schema.repetition_type == OPTIONAL; + } + + // move 
up the hierarchy + cur_depth--; + } + + // next schema + schema_idx = cur_schema.parent_idx; + cur_schema = _metadata->get_schema(schema_idx); + } + + nesting_info_index += (per_page_nesting_info_size * chunks[idx].num_data_pages); + } + + // copy nesting info to the device + page_nesting_info.host_to_device(_stream); +} + +void reader::impl::load_and_decompress_data(std::vector const& row_groups_info, + size_type num_rows) +{ + // This function should never be called if `num_rows == 0`. + CUDF_EXPECTS(num_rows > 0, "Number of reading rows must not be zero."); + + auto& raw_page_data = _file_itm_data.raw_page_data; + auto& decomp_page_data = _file_itm_data.decomp_page_data; + auto& chunks = _file_itm_data.chunks; + auto& pages_info = _file_itm_data.pages_info; + + // Descriptors for all the chunks that make up the selected columns + const auto num_input_columns = _input_columns.size(); + const auto num_chunks = row_groups_info.size() * num_input_columns; + chunks = hostdevice_vector(0, num_chunks, _stream); + + // Association between each column chunk and its source + std::vector chunk_source_map(num_chunks); + + // Tracker for eventually deallocating compressed and uncompressed data + raw_page_data = std::vector>(num_chunks); + + // Keep track of column chunk file offsets + std::vector column_chunk_offsets(num_chunks); + + // Initialize column chunk information + size_t total_decompressed_size = 0; + auto remaining_rows = num_rows; + std::vector> read_rowgroup_tasks; + for (const auto& rg : row_groups_info) { + const auto& row_group = _metadata->get_row_group(rg.index, rg.source_index); + auto const row_group_start = rg.start_row; + auto const row_group_source = rg.source_index; + auto const row_group_rows = std::min(remaining_rows, row_group.num_rows); + auto const io_chunk_idx = chunks.size(); + + // generate ColumnChunkDesc objects for everything to be decoded (all input columns) + for (size_t i = 0; i < num_input_columns; ++i) { + auto col = _input_columns[i]; 
+ // look up metadata + auto& col_meta = _metadata->get_column_metadata(rg.index, rg.source_index, col.schema_idx); + auto& schema = _metadata->get_schema(col.schema_idx); + + auto [type_width, clock_rate, converted_type] = + conversion_info(to_type_id(schema, _strings_to_categorical, _timestamp_type.id()), + _timestamp_type.id(), + schema.type, + schema.converted_type, + schema.type_length); + + column_chunk_offsets[chunks.size()] = + (col_meta.dictionary_page_offset != 0) + ? std::min(col_meta.data_page_offset, col_meta.dictionary_page_offset) + : col_meta.data_page_offset; + + chunks.push_back(gpu::ColumnChunkDesc(col_meta.total_compressed_size, + nullptr, + col_meta.num_values, + schema.type, + type_width, + row_group_start, + row_group_rows, + schema.max_definition_level, + schema.max_repetition_level, + _metadata->get_output_nesting_depth(col.schema_idx), + required_bits(schema.max_definition_level), + required_bits(schema.max_repetition_level), + col_meta.codec, + converted_type, + schema.logical_type, + schema.decimal_scale, + clock_rate, + i, + col.schema_idx)); + + // Map each column chunk to its column index and its source index + chunk_source_map[chunks.size() - 1] = row_group_source; + + if (col_meta.codec != Compression::UNCOMPRESSED) { + total_decompressed_size += col_meta.total_uncompressed_size; + } + } + // Read compressed chunk data to device memory + read_rowgroup_tasks.push_back(read_column_chunks_async(_sources, + raw_page_data, + chunks, + io_chunk_idx, + chunks.size(), + column_chunk_offsets, + chunk_source_map, + _stream)); + + remaining_rows -= row_group.num_rows; + } + for (auto& task : read_rowgroup_tasks) { + task.wait(); + } + + CUDF_EXPECTS(remaining_rows <= 0, "All rows data must be read."); + + // Process dataset chunk pages into output columns + auto const total_pages = count_page_headers(chunks, _stream); + pages_info = hostdevice_vector(total_pages, total_pages, _stream); + + if (total_pages > 0) { + // decoding of column/page 
information + decode_page_headers(chunks, pages_info, _stream); + if (total_decompressed_size > 0) { + decomp_page_data = decompress_page_data(chunks, pages_info, _stream); + // Free compressed data + for (size_t c = 0; c < chunks.size(); c++) { + if (chunks[c].codec != parquet::Compression::UNCOMPRESSED) { + raw_page_data[c].reset(); + // TODO: Check if this is called + } + } + } + + // build output column info + // walk the schema, building out_buffers that mirror what our final cudf columns will look + // like. important : there is not necessarily a 1:1 mapping between input columns and output + // columns. For example, parquet does not explicitly store a ColumnChunkDesc for struct + // columns. The "structiness" is simply implied by the schema. For example, this schema: + // required group field_id=1 name { + // required binary field_id=2 firstname (String); + // required binary field_id=3 middlename (String); + // required binary field_id=4 lastname (String); + // } + // will only contain 3 columns of data (firstname, middlename, lastname). But of course + // "name" is a struct column that we want to return, so we have to make sure that we + // create it ourselves. 
+ // std::vector output_info = build_output_column_info(); + + // nesting information (sizes, etc) stored -per page- + // note : even for flat schemas, we allocate 1 level of "nesting" info + allocate_nesting_info(); + } +} + +namespace { + +struct cumulative_row_info { + size_t row_count; // cumulative row count + size_t size_bytes; // cumulative size in bytes + int key; // schema index +}; + +#if defined(PREPROCESS_DEBUG) +void print_pages(hostdevice_vector& pages, rmm::cuda_stream_view _stream) +{ + pages.device_to_host(_stream, true); + for (size_t idx = 0; idx < pages.size(); idx++) { + auto const& p = pages[idx]; + // skip dictionary pages + if (p.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) { continue; } + printf( + "P(%lu, s:%d): chunk_row(%d), num_rows(%d), skipped_values(%d), skipped_leaf_values(%d)\n", + idx, + p.src_col_schema, + p.chunk_row, + p.num_rows, + p.skipped_values, + p.skipped_leaf_values); + } +} + +void print_cumulative_page_info(hostdevice_vector& pages, + rmm::device_uvector const& page_index, + rmm::device_uvector const& c_info, + rmm::cuda_stream_view stream) +{ + pages.device_to_host(stream, true); + + printf("------------\nCumulative sizes by page\n"); + + std::vector schemas(pages.size()); + std::vector h_page_index(pages.size()); + cudaMemcpy( + h_page_index.data(), page_index.data(), sizeof(int) * pages.size(), cudaMemcpyDeviceToHost); + std::vector h_cinfo(pages.size()); + cudaMemcpy(h_cinfo.data(), + c_info.data(), + sizeof(cumulative_row_info) * pages.size(), + cudaMemcpyDeviceToHost); + auto schema_iter = cudf::detail::make_counting_transform_iterator( + 0, [&](size_type i) { return pages[h_page_index[i]].src_col_schema; }); + thrust::copy(thrust::seq, schema_iter, schema_iter + pages.size(), schemas.begin()); + auto last = thrust::unique(thrust::seq, schemas.begin(), schemas.end()); + schemas.resize(last - schemas.begin()); + printf("Num schemas: %lu\n", schemas.size()); + + for (size_t idx = 0; idx < schemas.size(); idx++) { + 
printf("Schema %d\n", schemas[idx]); + for (size_t pidx = 0; pidx < pages.size(); pidx++) { + auto const& page = pages[h_page_index[pidx]]; + if (page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY || page.src_col_schema != schemas[idx]) { + continue; + } + printf("\tP: {%lu, %lu}\n", h_cinfo[pidx].row_count, h_cinfo[pidx].size_bytes); + } + } +} + +void print_cumulative_row_info( + host_span sizes, + std::string const& label, + std::optional> splits = std::nullopt) +{ + if (splits.has_value()) { + printf("------------\nSplits\n"); + for (size_t idx = 0; idx < splits->size(); idx++) { + printf("{%lu, %lu}\n", splits.value()[idx].skip_rows, splits.value()[idx].num_rows); + } + } + + printf("------------\nCumulative sizes %s\n", label.c_str()); + for (size_t idx = 0; idx < sizes.size(); idx++) { + printf("{%lu, %lu, %d}", sizes[idx].row_count, sizes[idx].size_bytes, sizes[idx].key); + if (splits.has_value()) { + // if we have a split at this row count and this is the last instance of this row count + auto start = thrust::make_transform_iterator( + splits->begin(), [](gpu::chunk_read_info const& i) { return i.skip_rows; }); + auto end = start + splits->size(); + auto split = std::find(start, end, sizes[idx].row_count); + auto const split_index = [&]() -> int { + if (split != end && + ((idx == sizes.size() - 1) || (sizes[idx + 1].row_count > sizes[idx].row_count))) { + return static_cast(std::distance(start, split)); + } + return idx == 0 ? 0 : -1; + }(); + if (split_index >= 0) { + printf(" <-- split {%lu, %lu}", + splits.value()[split_index].skip_rows, + splits.value()[split_index].num_rows); + } + } + printf("\n"); + } +} +#endif // PREPROCESS_DEBUG + +/** + * @brief Functor which reduces two cumulative_row_info structs of the same key. 
+ */ +struct cumulative_row_sum { + cumulative_row_info operator() + __device__(cumulative_row_info const& a, cumulative_row_info const& b) const + { + return cumulative_row_info{a.row_count + b.row_count, a.size_bytes + b.size_bytes, a.key}; + } +}; + +/** + * @brief Functor which computes the total data size for a given type of cudf column. + * + * In the case of strings, the return size does not include the chars themselves. That + * information is tracked separately (see PageInfo::str_bytes). + */ +struct row_size_functor { + __device__ size_t validity_size(size_t num_rows, bool nullable) + { + return nullable ? (cudf::util::div_rounding_up_safe(num_rows, size_t{32}) * 4) : 0; + } + + template + __device__ size_t operator()(size_t num_rows, bool nullable) + { + auto const element_size = sizeof(device_storage_type_t); + return (element_size * num_rows) + validity_size(num_rows, nullable); + } +}; + +template <> +__device__ size_t row_size_functor::operator()(size_t num_rows, bool nullable) +{ + auto const offset_size = sizeof(offset_type); + // NOTE: Adding the + 1 offset here isn't strictly correct. There will only be 1 extra offset + // for the entire column, whereas this is adding an extra offset per page. So we will get a + // small over-estimate of the real size of the order : # of pages * 4 bytes. It seems better + // to overestimate size somewhat than to underestimate it and potentially generate chunks + // that are too large. + return (offset_size * (num_rows + 1)) + validity_size(num_rows, nullable); +} + +template <> +__device__ size_t row_size_functor::operator()(size_t num_rows, bool nullable) +{ + return validity_size(num_rows, nullable); +} + +template <> +__device__ size_t row_size_functor::operator()(size_t num_rows, bool nullable) +{ + // only returns the size of offsets and validity. the size of the actual string chars + // is tracked separately. 
+ auto const offset_size = sizeof(offset_type); + // see note about offsets in the list_view template. + return (offset_size * (num_rows + 1)) + validity_size(num_rows, nullable); +} + +/** + * @brief Functor which computes the total output cudf data size for all of + * the data in this page. + * + * Sums across all nesting levels. + */ +struct get_cumulative_row_info { + gpu::PageInfo const* const pages; + + __device__ cumulative_row_info operator()(size_type index) + { + auto const& page = pages[index]; + if (page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) { + return cumulative_row_info{0, 0, page.src_col_schema}; + } + + // total nested size, not counting string data + auto iter = + cudf::detail::make_counting_transform_iterator(0, [page, index] __device__(size_type i) { + auto const& pni = page.nesting[i]; + return cudf::type_dispatcher( + data_type{pni.type}, row_size_functor{}, pni.size, pni.nullable); + }); + + size_t const row_count = static_cast(page.nesting[0].size); + return {row_count, + thrust::reduce(thrust::seq, iter, iter + page.num_nesting_levels) + page.str_bytes, + page.src_col_schema}; + } +}; + +/** + * @brief Functor which computes the effective size of all input columns by page. + * + * For a given row, we want to find the cost of all pages for all columns involved + * in loading up to that row. The complication here is that not all pages are the + * same size between columns. Example: + * + * page row counts + * Column A: 0 <----> 100 <----> 200 + * Column B: 0 <---------------> 200 <--------> 400 + | + * if we decide to split at row 100, we don't really know the actual amount of bytes in column B + * at that point. So we have to proceed as if we are taking the bytes from all 200 rows of that + * page. Essentially, a conservative over-estimate of the real size. 
+ */ +struct row_total_size { + cumulative_row_info const* c_info; + size_type const* key_offsets; + size_t num_keys; + + __device__ cumulative_row_info operator()(cumulative_row_info const& i) + { + // sum sizes for each input column at this row + size_t sum = 0; + for (int idx = 0; idx < num_keys; idx++) { + auto const start = key_offsets[idx]; + auto const end = key_offsets[idx + 1]; + auto iter = cudf::detail::make_counting_transform_iterator( + 0, [&] __device__(size_type i) { return c_info[i].row_count; }); + auto const page_index = + thrust::lower_bound(thrust::seq, iter + start, iter + end, i.row_count) - iter; + sum += c_info[page_index].size_bytes; + } + return {i.row_count, sum, i.key}; + } +}; + +/** + * @brief Given a vector of cumulative {row_count, byte_size} pairs and a chunk read + * limit, determine the set of splits. + * + * @param sizes Vector of cumulative {row_count, byte_size} pairs + * @param num_rows Total number of rows to read + * @param chunk_read_limit Limit on total number of bytes to be returned per read, for all columns + */ +std::vector find_splits(std::vector const& sizes, + size_t num_rows, + size_t chunk_read_limit) +{ + // now we have an array of {row_count, real output bytes}. just walk through it and generate + // splits. + // TODO: come up with a clever way to do this entirely in parallel. For now, as long as batch + // sizes are reasonably large, this shouldn't iterate too many times + std::vector splits; + { + size_t cur_pos = 0; + size_t cur_cumulative_size = 0; + size_t cur_row_count = 0; + auto start = thrust::make_transform_iterator(sizes.begin(), [&](cumulative_row_info const& i) { + return i.size_bytes - cur_cumulative_size; + }); + auto end = start + sizes.size(); + while (cur_row_count < num_rows) { + int64_t split_pos = + thrust::lower_bound(thrust::seq, start + cur_pos, end, chunk_read_limit) - start; + + // if we're past the end, or if the returned bucket is > than the chunk_read_limit, move back + // one. 
+ if (static_cast(split_pos) >= sizes.size() || + (sizes[split_pos].size_bytes - cur_cumulative_size > chunk_read_limit)) { + split_pos--; + } + + // best-try. if we can't find something that'll fit, we have to go bigger. we're doing this in + // a loop because all of the cumulative sizes for all the pages are sorted into one big list. + // so if we had two columns, both of which had an entry {1000, 10000}, that entry would be in + // the list twice. so we have to iterate until we skip past all of them. The idea is that we + // either do this, or we have to call unique() on the input first. + while (split_pos < (static_cast(sizes.size()) - 1) && + (split_pos < 0 || sizes[split_pos].row_count == cur_row_count)) { + split_pos++; + } + + auto const start_row = cur_row_count; + cur_row_count = sizes[split_pos].row_count; + splits.push_back(gpu::chunk_read_info{start_row, cur_row_count - start_row}); + cur_pos = split_pos; + cur_cumulative_size = sizes[split_pos].size_bytes; + } + } + // print_cumulative_row_info(sizes, "adjusted", splits); + + return splits; +} + +/** + * @brief Given a set of pages that have had their sizes computed by nesting level and + * a limit on total read size, generate a set of {skip_rows, num_rows} pairs representing + * a set of reads that will generate output columns of total size <= `chunk_read_limit` bytes. 
+ * + * @param pages All pages in the file + * @param id Additional intermediate information required to process the pages + * @param num_rows Total number of rows to read + * @param chunk_read_limit Limit on total number of bytes to be returned per read, for all columns + * @param stream CUDA stream to use, default 0 + */ +std::vector compute_splits(hostdevice_vector& pages, + gpu::chunk_intermediate_data const& id, + size_t num_rows, + size_t chunk_read_limit, + rmm::cuda_stream_view stream) +{ + auto const& page_keys = id.page_keys; + auto const& page_index = id.page_index; + + // generate cumulative row counts and sizes + rmm::device_uvector c_info(page_keys.size(), stream); + // convert PageInfo to cumulative_row_info + auto page_input = thrust::make_transform_iterator(page_index.begin(), + get_cumulative_row_info{pages.device_ptr()}); + thrust::inclusive_scan_by_key(rmm::exec_policy(stream), + page_keys.begin(), + page_keys.end(), + page_input, + c_info.begin(), + thrust::equal_to{}, + cumulative_row_sum{}); + // print_cumulative_page_info(pages, page_index, c_info, stream); + + // sort by row count + rmm::device_uvector c_info_sorted{c_info, stream}; + thrust::sort(rmm::exec_policy(stream), + c_info_sorted.begin(), + c_info_sorted.end(), + [] __device__(cumulative_row_info const& a, cumulative_row_info const& b) { + return a.row_count < b.row_count; + }); + + std::vector h_c_info_sorted(c_info_sorted.size()); + cudaMemcpy(h_c_info_sorted.data(), + c_info_sorted.data(), + sizeof(cumulative_row_info) * c_info_sorted.size(), + cudaMemcpyDeviceToHost); + // print_cumulative_row_info(h_c_info_sorted, "raw"); + + // generate key offsets (offsets to the start of each partition of keys). 
worst case is 1 page per + // key + rmm::device_uvector key_offsets(page_keys.size() + 1, stream); + auto const key_offsets_end = thrust::reduce_by_key(rmm::exec_policy(stream), + page_keys.begin(), + page_keys.end(), + thrust::make_constant_iterator(1), + thrust::make_discard_iterator(), + key_offsets.begin()) + .second; + size_t const num_unique_keys = key_offsets_end - key_offsets.begin(); + thrust::exclusive_scan( + rmm::exec_policy(stream), key_offsets.begin(), key_offsets.end(), key_offsets.begin()); + + // adjust the cumulative info such that for each row count, the size includes any pages that span + // that row count. this is so that if we have this case: + // page row counts + // Column A: 0 <----> 100 <----> 200 + // Column B: 0 <---------------> 200 <--------> 400 + // | + // if we decide to split at row 100, we don't really know the actual amount of bytes in column B + // at that point. So we have to proceed as if we are taking the bytes from all 200 rows of that + // page. 
+ // + rmm::device_uvector aggregated_info(c_info.size(), stream); + thrust::transform(rmm::exec_policy(stream), + c_info_sorted.begin(), + c_info_sorted.end(), + aggregated_info.begin(), + row_total_size{c_info.data(), key_offsets.data(), num_unique_keys}); + + // bring back to the cpu + std::vector h_aggregated_info(aggregated_info.size()); + cudaMemcpyAsync(h_aggregated_info.data(), + aggregated_info.data(), + sizeof(cumulative_row_info) * c_info.size(), + cudaMemcpyDeviceToHost, + stream); + stream.synchronize(); + + return find_splits(h_aggregated_info, num_rows, chunk_read_limit); +} + +struct get_page_chunk_idx { + __device__ size_type operator()(gpu::PageInfo const& page) { return page.chunk_idx; } +}; + +struct get_page_num_rows { + __device__ size_type operator()(gpu::PageInfo const& page) { return page.num_rows; } +}; + +struct get_page_schema { + __device__ size_type operator()(gpu::PageInfo const& page) { return page.src_col_schema; } +}; + +/** + * @brief Returns the size field of a PageInfo struct for a given depth, keyed by schema. + */ +struct get_page_nesting_size { + size_type const src_col_schema; + size_type const depth; + gpu::PageInfo const* const pages; + + __device__ size_type operator()(int index) const + { + auto const& page = pages[index]; + if (page.src_col_schema != src_col_schema || page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) { + return 0; + } + return page.nesting[depth].batch_size; + } +}; + +/** + * @brief Writes to the chunk_row field of the PageInfo struct. 
+ */ +struct chunk_row_output_iter { + gpu::PageInfo* p; + using value_type = size_type; + using difference_type = size_type; + using pointer = size_type*; + using reference = size_type&; + using iterator_category = thrust::output_device_iterator_tag; + + __host__ __device__ chunk_row_output_iter operator+(int i) + { + return chunk_row_output_iter{p + i}; + } + + __host__ __device__ void operator++() { p++; } + + __device__ reference operator[](int i) { return p[i].chunk_row; } + __device__ reference operator*() { return p->chunk_row; } +}; + +/** + * @brief Writes to the page_start_value field of the PageNestingInfo struct, keyed by schema. + */ +struct start_offset_output_iterator { + gpu::PageInfo* pages; + int const* page_indices; + int cur_index; + int src_col_schema; + int nesting_depth; + int empty = 0; + using value_type = size_type; + using difference_type = size_type; + using pointer = size_type*; + using reference = size_type&; + using iterator_category = thrust::output_device_iterator_tag; + + constexpr void operator=(start_offset_output_iterator const& other) + { + pages = other.pages; + page_indices = other.page_indices; + cur_index = other.cur_index; + src_col_schema = other.src_col_schema; + nesting_depth = other.nesting_depth; + } + + constexpr start_offset_output_iterator operator+(int i) + { + return start_offset_output_iterator{ + pages, page_indices, cur_index + i, src_col_schema, nesting_depth}; + } + + constexpr void operator++() { cur_index++; } + + __device__ reference operator[](int i) { return dereference(cur_index + i); } + __device__ reference operator*() { return dereference(cur_index); } + + private: + __device__ reference dereference(int index) + { + gpu::PageInfo const& p = pages[page_indices[index]]; + if (p.src_col_schema != src_col_schema || p.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) { + return empty; + } + return p.nesting[nesting_depth].page_start_value; + } +}; + +} // anonymous namespace + +void 
reader::impl::preprocess_pages(size_t skip_rows, + size_t num_rows, + bool uses_custom_row_bounds, + size_t chunk_read_limit) +{ + auto& chunks = _file_itm_data.chunks; + auto& pages = _file_itm_data.pages_info; + + // iterate over all input columns and determine if they contain lists so we can further + // preprocess them. + bool has_lists = false; + for (size_t idx = 0; idx < _input_columns.size(); idx++) { + auto const& input_col = _input_columns[idx]; + size_t const max_depth = input_col.nesting_depth(); + + auto* cols = &_output_buffers; + for (size_t l_idx = 0; l_idx < max_depth; l_idx++) { + auto& out_buf = (*cols)[input_col.nesting[l_idx]]; + cols = &out_buf.children; + + // if this has a list parent, we have to get column sizes from the + // data computed during gpu::ComputePageSizes + if (out_buf.user_data & PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT) { + has_lists = true; + break; + } + } + if (has_lists) { break; } + } + + // generate string dict indices if necessary + { + auto is_dict_chunk = [](const gpu::ColumnChunkDesc& chunk) { + return (chunk.data_type & 0x7) == BYTE_ARRAY && chunk.num_dict_pages > 0; + }; + + // Count the number of string dictionary entries + // NOTE: Assumes first page in the chunk is always the dictionary page + size_t total_str_dict_indexes = 0; + for (size_t c = 0, page_count = 0; c < chunks.size(); c++) { + if (is_dict_chunk(chunks[c])) { + total_str_dict_indexes += pages[page_count].num_input_values; + } + page_count += chunks[c].max_num_pages; + } + + // Build index for string dictionaries since they can't be indexed + // directly due to variable-sized elements + _chunk_itm_data.str_dict_index = + cudf::detail::make_zeroed_device_uvector_async(total_str_dict_indexes, + _stream); + + // Update chunks with pointers to string dict indices + for (size_t c = 0, page_count = 0, str_ofs = 0; c < chunks.size(); c++) { + input_column_info const& input_col = _input_columns[chunks[c].src_col_index]; + 
CUDF_EXPECTS(input_col.schema_idx == chunks[c].src_col_schema, + "Column/page schema index mismatch"); + if (is_dict_chunk(chunks[c])) { + chunks[c].str_dict_index = _chunk_itm_data.str_dict_index.data() + str_ofs; + str_ofs += pages[page_count].num_input_values; + } + + // column_data_base will always point to leaf data, even for nested types. + page_count += chunks[c].max_num_pages; + } + + if (total_str_dict_indexes > 0) { + chunks.host_to_device(_stream); + gpu::BuildStringDictionaryIndex(chunks.device_ptr(), chunks.size(), _stream); + } + } + + // intermediate data we will need for further chunked reads + if (has_lists || chunk_read_limit > 0) { + // computes: + // PageNestingInfo::num_rows for each page. the true number of rows (taking repetition into + // account), not just the number of values. PageNestingInfo::size for each level of nesting, for + // each page. + // + // we will be applying a later "trim" pass if skip_rows/num_rows is being used, which can happen + // if: + // - user has passed custom row bounds + // - we will be doing a chunked read + gpu::ComputePageSizes(pages, + chunks, + 0, // 0-max size_t. process all possible rows + std::numeric_limits::max(), + true, // compute num_rows + chunk_read_limit > 0, // compute string sizes + _stream); + + // computes: + // PageInfo::chunk_row (the absolute start row index) for all pages + // Note: this is doing some redundant work for pages in flat hierarchies. chunk_row has already + // been computed during header decoding. the overall amount of work here is very small though. + auto key_input = thrust::make_transform_iterator(pages.device_ptr(), get_page_chunk_idx{}); + auto page_input = thrust::make_transform_iterator(pages.device_ptr(), get_page_num_rows{}); + thrust::exclusive_scan_by_key(rmm::exec_policy(_stream), + key_input, + key_input + pages.size(), + page_input, + chunk_row_output_iter{pages.device_ptr()}); + + // compute page ordering. 
+ // + // ordering of pages is by input column schema, repeated across row groups. so + // if we had 3 columns, each with 2 pages, and 1 row group, our schema values might look like + // + // 1, 1, 2, 2, 3, 3 + // + // However, if we had more than one row group, the pattern would be + // + // 1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3 + // ^ row group 0 | + // ^ row group 1 + // + // To use exclusive_scan_by_key, the ordering we actually want is + // + // 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3 + // + // We also need to preserve key-relative page ordering, so we need to use a stable sort. + _chunk_itm_data.page_keys = rmm::device_uvector(pages.size(), _stream); + _chunk_itm_data.page_index = rmm::device_uvector(pages.size(), _stream); + auto& page_keys = _chunk_itm_data.page_keys; + auto& page_index = _chunk_itm_data.page_index; + { + thrust::transform(rmm::exec_policy(_stream), + pages.device_ptr(), + pages.device_ptr() + pages.size(), + page_keys.begin(), + get_page_schema{}); + + thrust::sequence(rmm::exec_policy(_stream), page_index.begin(), page_index.end()); + thrust::stable_sort_by_key(rmm::exec_policy(_stream), + page_keys.begin(), + page_keys.end(), + page_index.begin(), + thrust::less()); + } + + // retrieve pages back + pages.device_to_host(_stream, true); + +#if defined(PREPROCESS_DEBUG) + print_pages(pages, _stream); +#endif + } + + // compute splits if necessary. otherwise retun a single split representing + // the whole file. + _chunk_read_info = chunk_read_limit > 0 + ? compute_splits(pages, _chunk_itm_data, num_rows, chunk_read_limit, _stream) + : std::vector{{skip_rows, num_rows}}; +} + +void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses_custom_row_bounds) +{ + auto const& chunks = _file_itm_data.chunks; + auto& pages = _file_itm_data.pages_info; + + // Should not reach here if there is no page data. 
+ CUDF_EXPECTS(pages.size() > 0, "There is no page to parse"); + + // computes: + // PageNestingInfo::batch_size for each level of nesting, for each page, taking row bounds into + // account. PageInfo::skipped_values, which tells us where to start decoding in the input to + // respect the user bounds. It is only necessary to do this second pass if uses_custom_row_bounds + // is set (if the user has specified artifical bounds). + if (uses_custom_row_bounds) { + gpu::ComputePageSizes(pages, + chunks, + skip_rows, + num_rows, + false, // num_rows is already computed + false, // no need to compute string sizes + _stream); +#if defined(PREPROCESS_DEBUG) + print_pages(pages, _stream); +#endif + } + + // iterate over all input columns and allocate any associated output + // buffers if they are not part of a list hierarchy. mark down + // if we have any list columns that need further processing. + bool has_lists = false; + for (size_t idx = 0; idx < _input_columns.size(); idx++) { + auto const& input_col = _input_columns[idx]; + size_t const max_depth = input_col.nesting_depth(); + + auto* cols = &_output_buffers; + for (size_t l_idx = 0; l_idx < max_depth; l_idx++) { + auto& out_buf = (*cols)[input_col.nesting[l_idx]]; + cols = &out_buf.children; + + // if this has a list parent, we have to get column sizes from the + // data computed during gpu::ComputePageSizes + if (out_buf.user_data & PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT) { + has_lists = true; + } + // if we haven't already processed this column because it is part of a struct hierarchy + else if (out_buf.size == 0) { + // add 1 for the offset if this is a list column + out_buf.create( + out_buf.type.id() == type_id::LIST && l_idx < max_depth ? 
num_rows + 1 : num_rows, + _stream, + _mr); + } + } + } + + // compute output column sizes by examining the pages of the -input- columns + if (has_lists) { + auto& page_keys = _chunk_itm_data.page_keys; + auto& page_index = _chunk_itm_data.page_index; + for (size_t idx = 0; idx < _input_columns.size(); idx++) { + auto const& input_col = _input_columns[idx]; + auto src_col_schema = input_col.schema_idx; + size_t max_depth = input_col.nesting_depth(); + + auto* cols = &_output_buffers; + for (size_t l_idx = 0; l_idx < input_col.nesting_depth(); l_idx++) { + auto& out_buf = (*cols)[input_col.nesting[l_idx]]; + cols = &out_buf.children; + + // size iterator. indexes pages by sorted order + auto size_input = thrust::make_transform_iterator( + page_index.begin(), + get_page_nesting_size{src_col_schema, static_cast(l_idx), pages.device_ptr()}); + + // if this buffer is part of a list hierarchy, we need to determine it's + // final size and allocate it here. + // + // for struct columns, higher levels of the output columns are shared between input + // columns. so don't compute any given level more than once. 
+ if ((out_buf.user_data & PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT) && out_buf.size == 0) { + int size = + thrust::reduce(rmm::exec_policy(_stream), size_input, size_input + pages.size()); + + // if this is a list column add 1 for non-leaf levels for the terminating offset + if (out_buf.type.id() == type_id::LIST && l_idx < max_depth) { size++; } + + // allocate + out_buf.create(size, _stream, _mr); + } + + // for nested hierarchies, compute per-page start offset + if (input_col.has_repetition) { + thrust::exclusive_scan_by_key( + rmm::exec_policy(_stream), + page_keys.begin(), + page_keys.end(), + size_input, + start_offset_output_iterator{pages.device_ptr(), + page_index.begin(), + 0, + static_cast(src_col_schema), + static_cast(l_idx)}); + } + } + } + } +} + +} // namespace cudf::io::detail::parquet diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 9514b053451..26b3f97616f 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -19,6 +19,7 @@ * @brief cuDF-IO parquet writer class implementation */ +#include "parquet_gpu.cuh" #include "writer_impl.hpp" #include "compact_protocol_reader.hpp" @@ -374,44 +375,53 @@ struct leaf_schema_fn { template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT32; - col_schema.converted_type = ConvertedType::TIME_MILLIS; - col_schema.stats_dtype = statistics_dtype::dtype_int64; + col_schema.type = Type::INT32; + col_schema.converted_type = ConvertedType::TIME_MILLIS; + col_schema.stats_dtype = statistics_dtype::dtype_int32; + col_schema.ts_scale = 24 * 60 * 60 * 1000; + col_schema.logical_type.isset.TIME = true; + col_schema.logical_type.TIME.unit.isset.MILLIS = true; } template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT64; - col_schema.converted_type = ConvertedType::TIME_MILLIS; - col_schema.stats_dtype = statistics_dtype::dtype_int64; - col_schema.ts_scale = 1000; + col_schema.type = Type::INT32; + 
col_schema.converted_type = ConvertedType::TIME_MILLIS; + col_schema.stats_dtype = statistics_dtype::dtype_int32; + col_schema.ts_scale = 1000; + col_schema.logical_type.isset.TIME = true; + col_schema.logical_type.TIME.unit.isset.MILLIS = true; } template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT64; - col_schema.converted_type = ConvertedType::TIME_MILLIS; - col_schema.stats_dtype = statistics_dtype::dtype_int64; + col_schema.type = Type::INT32; + col_schema.converted_type = ConvertedType::TIME_MILLIS; + col_schema.stats_dtype = statistics_dtype::dtype_int32; + col_schema.logical_type.isset.TIME = true; + col_schema.logical_type.TIME.unit.isset.MILLIS = true; } template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT64; - col_schema.converted_type = ConvertedType::TIME_MICROS; - col_schema.stats_dtype = statistics_dtype::dtype_int64; + col_schema.type = Type::INT64; + col_schema.converted_type = ConvertedType::TIME_MICROS; + col_schema.stats_dtype = statistics_dtype::dtype_int64; + col_schema.logical_type.isset.TIME = true; + col_schema.logical_type.TIME.unit.isset.MICROS = true; } // unsupported outside cudf for parquet 1.0. 
template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT64; - col_schema.converted_type = ConvertedType::TIME_MICROS; - col_schema.stats_dtype = statistics_dtype::dtype_int64; - col_schema.ts_scale = -1000; // negative value indicates division by absolute value + col_schema.type = Type::INT64; + col_schema.stats_dtype = statistics_dtype::dtype_int64; + col_schema.logical_type.isset.TIME = true; + col_schema.logical_type.TIME.unit.isset.NANOS = true; } template @@ -511,7 +521,7 @@ std::vector construct_schema_tree( if (col->type().id() != type_id::LIST) { return false; } auto const child_col_type = col->children[lists_column_view::child_column_index]->type().id(); - return child_col_type == type_id::INT8 or child_col_type == type_id::UINT8; + return child_col_type == type_id::UINT8; }; // There is a special case for a list column with one byte column child. This column can @@ -917,7 +927,7 @@ auto to_nvcomp_compression_type(Compression codec) auto page_alignment(Compression codec) { if (codec == Compression::UNCOMPRESSED or - not nvcomp::is_compression_enabled(to_nvcomp_compression_type(codec))) { + nvcomp::is_compression_disabled(to_nvcomp_compression_type(codec))) { return 1u; } @@ -1162,19 +1172,22 @@ void writer::impl::encode_pages(hostdevice_2dvector& chunks gpu::EncodePages(batch_pages, comp_in, comp_out, comp_res, stream); switch (compression_) { case parquet::Compression::SNAPPY: - if (nvcomp::is_compression_enabled(nvcomp::compression_type::SNAPPY)) { + if (nvcomp::is_compression_disabled(nvcomp::compression_type::SNAPPY)) { + gpu_snap(comp_in, comp_out, comp_res, stream); + } else { nvcomp::batched_compress( nvcomp::compression_type::SNAPPY, comp_in, comp_out, comp_res, stream); - } else { - gpu_snap(comp_in, comp_out, comp_res, stream); } break; - case parquet::Compression::ZSTD: - if (nvcomp::is_compression_enabled(nvcomp::compression_type::ZSTD)) { - nvcomp::batched_compress( - nvcomp::compression_type::ZSTD, comp_in, comp_out, 
comp_res, stream); + case parquet::Compression::ZSTD: { + if (auto const reason = nvcomp::is_compression_disabled(nvcomp::compression_type::ZSTD); + reason) { + CUDF_FAIL("Compression error: " + reason.value()); } + nvcomp::batched_compress(nvcomp::compression_type::ZSTD, comp_in, comp_out, comp_res, stream); + break; + } case parquet::Compression::UNCOMPRESSED: break; default: CUDF_FAIL("invalid compression type"); } @@ -1236,9 +1249,9 @@ size_t max_page_bytes(Compression compression, size_t max_page_size_bytes) if (compression == parquet::Compression::UNCOMPRESSED) { return max_page_size_bytes; } auto const ncomp_type = to_nvcomp_compression_type(compression); - auto const nvcomp_limit = nvcomp::is_compression_enabled(ncomp_type) - ? nvcomp::compress_max_allowed_chunk_size(ncomp_type) - : std::nullopt; + auto const nvcomp_limit = nvcomp::is_compression_disabled(ncomp_type) + ? std::nullopt + : nvcomp::compress_max_allowed_chunk_size(ncomp_type); return std::min(nvcomp_limit.value_or(max_page_size_bytes), max_page_size_bytes); } diff --git a/cpp/src/io/statistics/statistics_type_identification.cuh b/cpp/src/io/statistics/statistics_type_identification.cuh index 10a7518aefa..9fc30c625aa 100644 --- a/cpp/src/io/statistics/statistics_type_identification.cuh +++ b/cpp/src/io/statistics/statistics_type_identification.cuh @@ -74,8 +74,7 @@ struct conversion_map { template <> struct conversion_map { using types = std::tuple, - std::pair, - std::pair>; + std::pair>; }; /** diff --git a/cpp/src/io/text/bgzip_data_chunk_source.cu b/cpp/src/io/text/bgzip_data_chunk_source.cu index 7715c2ca7e1..3fa68cd8b0f 100644 --- a/cpp/src/io/text/bgzip_data_chunk_source.cu +++ b/cpp/src/io/text/bgzip_data_chunk_source.cu @@ -19,7 +19,9 @@ #include "io/utilities/config_utils.hpp" #include +#include #include +#include #include #include @@ -29,14 +31,12 @@ #include #include -#include #include #include #include namespace cudf::io::text { - namespace { /** @@ -64,71 +64,8 @@ struct 
bgzip_nvcomp_transform_functor { class bgzip_data_chunk_reader : public data_chunk_reader { private: - template - static IntType read_int(char* data) - { - IntType result{}; - // we assume little-endian - std::memcpy(&result, &data[0], sizeof(result)); - return result; - } - - struct bgzip_header { - int block_size; - int extra_length; - [[nodiscard]] int data_size() const { return block_size - extra_length - 20; } - }; - - bgzip_header read_header() - { - std::array buffer{}; - _data_stream->read(buffer.data(), sizeof(buffer)); - std::array const expected_header{{31, 139, 8, 4}}; - CUDF_EXPECTS( - std::equal( - expected_header.begin(), expected_header.end(), reinterpret_cast(buffer.data())), - "malformed BGZIP header"); - // we ignore the remaining bytes of the fixed header, since they don't matter to us - auto const extra_length = read_int(&buffer[10]); - uint16_t extra_offset{}; - // read all the extra subfields - while (extra_offset < extra_length) { - auto const remaining_size = extra_length - extra_offset; - CUDF_EXPECTS(remaining_size >= 4, "invalid extra field length"); - // a subfield consists of 2 identifier bytes and a uint16 length - // 66/67 identifies a BGZIP block size field, we skip all other fields - _data_stream->read(buffer.data(), 4); - extra_offset += 4; - auto const subfield_size = read_int(&buffer[2]); - if (buffer[0] == 66 && buffer[1] == 67) { - // the block size subfield contains a single uint16 value, which is block_size - 1 - CUDF_EXPECTS(subfield_size == sizeof(uint16_t), "malformed BGZIP extra subfield"); - _data_stream->read(buffer.data(), sizeof(uint16_t)); - _data_stream->seekg(remaining_size - 6, std::ios_base::cur); - auto const block_size_minus_one = read_int(&buffer[0]); - return {block_size_minus_one + 1, extra_length}; - } else { - _data_stream->seekg(subfield_size, std::ios_base::cur); - extra_offset += subfield_size; - } - } - CUDF_FAIL("missing BGZIP size extra subfield"); - } - - struct bgzip_footer { - uint32_t 
decompressed_size; - }; - - bgzip_footer read_footer() - { - std::array buffer{}; - _data_stream->read(buffer.data(), sizeof(buffer)); - return {read_int(&buffer[4])}; - } - template - using pinned_host_vector = - thrust::host_vector>; + using pinned_host_vector = thrust::host_vector>; template static void copy_to_device(const pinned_host_vector& host, @@ -207,7 +144,13 @@ class bgzip_data_chunk_reader : public data_chunk_reader { bgzip_nvcomp_transform_functor{reinterpret_cast(d_compressed_blocks.data()), reinterpret_cast(d_decompressed_blocks.begin())}); if (decompressed_size() > 0) { - if (cudf::io::detail::nvcomp_integration::is_all_enabled()) { + if (nvcomp::is_decompression_disabled(nvcomp::compression_type::DEFLATE)) { + gpuinflate(d_compressed_spans, + d_decompressed_spans, + d_decompression_results, + gzip_header_included::NO, + stream); + } else { cudf::io::nvcomp::batched_decompress(cudf::io::nvcomp::compression_type::DEFLATE, d_compressed_spans, d_decompressed_spans, @@ -215,12 +158,6 @@ class bgzip_data_chunk_reader : public data_chunk_reader { max_decompressed_size, decompressed_size(), stream); - } else { - gpuinflate(d_compressed_spans, - d_decompressed_spans, - d_decompression_results, - gzip_header_included::NO, - stream); } } is_decompressed = true; @@ -232,7 +169,7 @@ class bgzip_data_chunk_reader : public data_chunk_reader { h_compressed_offsets.resize(1); h_decompressed_offsets.resize(1); // shrinking doesn't allocate/free, so we don't need to worry about streams - auto stream = cudf::default_stream_value; + auto stream = cudf::get_default_stream(); d_compressed_blocks.resize(0, stream); d_decompressed_blocks.resize(0, stream); d_compressed_offsets.resize(0, stream); @@ -258,13 +195,13 @@ class bgzip_data_chunk_reader : public data_chunk_reader { return available_decompressed_size - read_pos; } - void read_block(bgzip_header header, std::istream& stream) + void read_block(detail::bgzip::header header, std::istream& stream) { 
h_compressed_blocks.resize(h_compressed_blocks.size() + header.data_size()); stream.read(h_compressed_blocks.data() + compressed_size(), header.data_size()); } - void add_block_offsets(bgzip_header header, bgzip_footer footer) + void add_block_offsets(detail::bgzip::header header, detail::bgzip::footer footer) { max_decompressed_size = std::max(footer.decompressed_size, max_decompressed_size); @@ -294,9 +231,9 @@ class bgzip_data_chunk_reader : public data_chunk_reader { // peek is necessary if we are already at the end, but didn't try to read another byte _data_stream->peek(); if (_data_stream->eof() || _compressed_pos > _compressed_end) { break; } - auto header = read_header(); + auto header = detail::bgzip::read_header(*_data_stream); _curr_blocks.read_block(header, *_data_stream); - auto footer = read_footer(); + auto footer = detail::bgzip::read_footer(*_data_stream); _curr_blocks.add_block_offsets(header, footer); // for the last GZIP block, we restrict ourselves to the bytes up to _local_end // but only for the reader, not for decompression! 
@@ -318,8 +255,8 @@ class bgzip_data_chunk_reader : public data_chunk_reader { uint64_t virtual_begin, uint64_t virtual_end) : _data_stream(std::move(input_stream)), - _prev_blocks{cudf::default_stream_value}, // here we can use the default stream because - _curr_blocks{cudf::default_stream_value}, // we only initialize empty device_uvectors + _prev_blocks{cudf::get_default_stream()}, // here we can use the default stream because + _curr_blocks{cudf::get_default_stream()}, // we only initialize empty device_uvectors _local_end{virtual_end & 0xFFFFu}, _compressed_pos{virtual_begin >> 16}, _compressed_end{virtual_end >> 16} @@ -333,8 +270,8 @@ class bgzip_data_chunk_reader : public data_chunk_reader { // seek to the beginning of the provided local offset auto const local_pos = virtual_begin & 0xFFFFu; if (local_pos > 0) { - CUDF_EXPECTS(_curr_blocks.h_compressed_offsets.size() > 1 && - local_pos < _curr_blocks.h_compressed_offsets[1], + CUDF_EXPECTS(_curr_blocks.h_decompressed_offsets.size() > 1 && + local_pos < _curr_blocks.h_decompressed_offsets[1], "local part of virtual offset is out of bounds"); _curr_blocks.consume_bytes(local_pos); } diff --git a/cpp/src/io/text/bgzip_utils.cpp b/cpp/src/io/text/bgzip_utils.cpp new file mode 100644 index 00000000000..dd08387a6b5 --- /dev/null +++ b/cpp/src/io/text/bgzip_utils.cpp @@ -0,0 +1,179 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include + +#include +#include +#include +#include + +namespace cudf::io::text::detail::bgzip { +namespace { + +template +IntType read_int(char* data) +{ + IntType result{}; + // we assume little-endian + std::memcpy(&result, &data[0], sizeof(result)); + return result; +} + +template +void write_int(std::ostream& output_stream, T val) +{ + std::array bytes; + // we assume little-endian + std::memcpy(&bytes[0], &val, sizeof(T)); + output_stream.write(bytes.data(), bytes.size()); +} + +} // namespace + +std::array constexpr extra_blocklen_field_header{{66, 67, 2, 0}}; + +header read_header(std::istream& input_stream) +{ + std::array buffer{}; + input_stream.read(buffer.data(), sizeof(buffer)); + std::array constexpr expected_header{{31, 139, 8, 4}}; + CUDF_EXPECTS( + std::equal( + expected_header.begin(), expected_header.end(), reinterpret_cast(buffer.data())), + "malformed BGZIP header"); + // we ignore the remaining bytes of the fixed header, since they don't matter to us + auto const extra_length = read_int(&buffer[10]); + uint16_t extra_offset{}; + // read all the extra subfields + while (extra_offset < extra_length) { + auto const remaining_size = extra_length - extra_offset; + CUDF_EXPECTS(remaining_size >= 4, "invalid extra field length"); + // a subfield consists of 2 identifier bytes and a uint16 length + // 66/67 identifies a BGZIP block size field, we skip all other fields + input_stream.read(buffer.data(), 4); + extra_offset += 4; + auto const subfield_size = read_int(&buffer[2]); + if (buffer[0] == extra_blocklen_field_header[0] && + buffer[1] == extra_blocklen_field_header[1]) { + // the block size subfield contains a single uint16 value, which is block_size - 1 + CUDF_EXPECTS( + buffer[2] == extra_blocklen_field_header[2] && buffer[3] == extra_blocklen_field_header[3], + "malformed BGZIP extra subfield"); + input_stream.read(buffer.data(), sizeof(uint16_t)); + input_stream.seekg(remaining_size - 6, 
std::ios_base::cur); + auto const block_size_minus_one = read_int(&buffer[0]); + return {block_size_minus_one + 1, extra_length}; + } else { + input_stream.seekg(subfield_size, std::ios_base::cur); + extra_offset += subfield_size; + } + } + CUDF_FAIL("missing BGZIP size extra subfield"); +} + +footer read_footer(std::istream& input_stream) +{ + std::array buffer{}; + input_stream.read(buffer.data(), sizeof(buffer)); + return {read_int(&buffer[0]), read_int(&buffer[4])}; +} + +void write_footer(std::ostream& output_stream, host_span data) +{ + // compute crc32 with zlib, this allows checking the generated files with external tools + write_int(output_stream, crc32(0, (unsigned char*)data.data(), data.size())); + write_int(output_stream, data.size()); +} + +void write_header(std::ostream& output_stream, + uint16_t compressed_size, + host_span pre_size_subfield, + host_span post_size_subfield) +{ + std::array constexpr header_data{{ + 31, // magic number + 139, // magic number + 8, // compression type: deflate + 4, // flags: extra header + 0, // mtime + 0, // mtime + 0, // mtime + 0, // mtime: irrelevant + 4, // xfl: irrelevant + 3 // OS: irrelevant + }}; + output_stream.write(reinterpret_cast(header_data.data()), header_data.size()); + auto const extra_size = pre_size_subfield.size() + extra_blocklen_field_header.size() + + sizeof(uint16_t) + post_size_subfield.size(); + auto const block_size = + header_data.size() + sizeof(uint16_t) + extra_size + compressed_size + 2 * sizeof(uint32_t); + write_int(output_stream, extra_size); + output_stream.write(pre_size_subfield.data(), pre_size_subfield.size()); + output_stream.write(extra_blocklen_field_header.data(), extra_blocklen_field_header.size()); + CUDF_EXPECTS(block_size - 1 <= std::numeric_limits::max(), "block size overflow"); + write_int(output_stream, block_size - 1); + output_stream.write(post_size_subfield.data(), post_size_subfield.size()); +} + +void write_uncompressed_block(std::ostream& output_stream, + 
host_span data, + host_span pre_size_subfields, + host_span post_size_subfields) +{ + CUDF_EXPECTS(data.size() <= std::numeric_limits::max(), "data size overflow"); + write_header(output_stream, data.size() + 5, pre_size_subfields, post_size_subfields); + write_int(output_stream, 1); + write_int(output_stream, data.size()); + write_int(output_stream, ~static_cast(data.size())); + output_stream.write(data.data(), data.size()); + write_footer(output_stream, data); +} + +void write_compressed_block(std::ostream& output_stream, + host_span data, + host_span pre_size_subfields, + host_span post_size_subfields) +{ + CUDF_EXPECTS(data.size() <= std::numeric_limits::max(), "data size overflow"); + z_stream deflate_stream{}; + // let's make sure we have enough space to store the data + std::vector compressed_out(data.size() * 2 + 256); + deflate_stream.next_in = reinterpret_cast(const_cast(data.data())); + deflate_stream.avail_in = data.size(); + deflate_stream.next_out = reinterpret_cast(compressed_out.data()); + deflate_stream.avail_out = compressed_out.size(); + CUDF_EXPECTS( + deflateInit2(&deflate_stream, // stream + Z_DEFAULT_COMPRESSION, // compression level + Z_DEFLATED, // method + -15, // log2 of window size (negative value means no ZLIB header/footer) + 9, // mem level: best performance/most memory usage for compression + Z_DEFAULT_STRATEGY // strategy + ) == Z_OK, + "deflateInit failed"); + CUDF_EXPECTS(deflate(&deflate_stream, Z_FINISH) == Z_STREAM_END, "deflate failed"); + CUDF_EXPECTS(deflateEnd(&deflate_stream) == Z_OK, "deflateEnd failed"); + write_header(output_stream, deflate_stream.total_out, pre_size_subfields, post_size_subfields); + output_stream.write(compressed_out.data(), deflate_stream.total_out); + write_footer(output_stream, data); +} + +} // namespace cudf::io::text::detail::bgzip diff --git a/cpp/src/io/text/data_chunk_source_factories.cpp b/cpp/src/io/text/data_chunk_source_factories.cpp index 9a549951d66..c09e7be507f 100644 --- 
a/cpp/src/io/text/data_chunk_source_factories.cpp +++ b/cpp/src/io/text/data_chunk_source_factories.cpp @@ -17,12 +17,12 @@ #include "io/text/device_data_chunks.hpp" #include +#include #include #include #include -#include #include @@ -30,6 +30,86 @@ namespace cudf::io::text { namespace { +/** + * @brief A reader which produces owning chunks of device memory which contain a copy of the data + * from an istream. + */ +class datasource_chunk_reader : public data_chunk_reader { + struct host_ticket { + cudaEvent_t event; + thrust::host_vector> buffer; + }; + + constexpr static int num_tickets = 2; + + public: + datasource_chunk_reader(datasource* source) : _source(source) + { + // create an event to track the completion of the last device-to-host copy. + for (auto& ticket : _tickets) { + CUDF_CUDA_TRY(cudaEventCreate(&(ticket.event))); + } + } + + ~datasource_chunk_reader() override + { + for (auto& ticket : _tickets) { + CUDF_CUDA_TRY(cudaEventDestroy(ticket.event)); + } + } + + void skip_bytes(std::size_t size) override + { + _offset += std::min(_source->size() - _offset, size); + }; + + std::unique_ptr get_next_chunk(std::size_t read_size, + rmm::cuda_stream_view stream) override + { + CUDF_FUNC_RANGE(); + + read_size = std::min(_source->size() - _offset, read_size); + + // get a device buffer containing read data on the device. + auto chunk = rmm::device_uvector(read_size, stream); + + if (_source->supports_device_read() && _source->is_device_read_preferred(read_size)) { + _source->device_read_async( + _offset, read_size, reinterpret_cast(chunk.data()), stream); + } else { + auto& h_ticket = _tickets[_next_ticket_idx]; + + _next_ticket_idx = (_next_ticket_idx + 1) % num_tickets; + + // synchronize on the last host-to-device copy, so we don't clobber the host buffer. 
+ CUDF_CUDA_TRY(cudaEventSynchronize(h_ticket.event)); + + // resize the host buffer as necessary to contain the requested number of bytes + if (h_ticket.buffer.size() < read_size) { h_ticket.buffer.resize(read_size); } + + _source->host_read(_offset, read_size, reinterpret_cast(h_ticket.buffer.data())); + + // copy the host-pinned data on to device + CUDF_CUDA_TRY(cudaMemcpyAsync( + chunk.data(), h_ticket.buffer.data(), read_size, cudaMemcpyHostToDevice, stream.value())); + + // record the host-to-device copy. + CUDF_CUDA_TRY(cudaEventRecord(h_ticket.event, stream.value())); + } + + _offset += read_size; + + // return the device buffer so it can be processed. + return std::make_unique(std::move(chunk)); + } + + private: + std::size_t _offset = 0; + std::size_t _next_ticket_idx = 0; + std::array _tickets{}; + datasource* _source; +}; + /** * @brief A reader which produces owning chunks of device memory which contain a copy of the data * from an istream. @@ -37,12 +117,14 @@ namespace { class istream_data_chunk_reader : public data_chunk_reader { struct host_ticket { cudaEvent_t event; - thrust::host_vector> buffer; + thrust::host_vector> buffer; }; + constexpr static int num_tickets = 2; + public: istream_data_chunk_reader(std::unique_ptr datastream) - : _datastream(std::move(datastream)), _tickets(2) + : _datastream(std::move(datastream)) { // create an event to track the completion of the last device-to-host copy. for (auto& ticket : _tickets) { @@ -66,7 +148,7 @@ class istream_data_chunk_reader : public data_chunk_reader { auto& h_ticket = _tickets[_next_ticket_idx]; - _next_ticket_idx = (_next_ticket_idx + 1) % _tickets.size(); + _next_ticket_idx = (_next_ticket_idx + 1) % num_tickets; // synchronize on the last host-to-device copy, so we don't clobber the host buffer. 
CUDF_CUDA_TRY(cudaEventSynchronize(h_ticket.event)); @@ -84,12 +166,8 @@ class istream_data_chunk_reader : public data_chunk_reader { auto chunk = rmm::device_uvector(read_size, stream); // copy the host-pinned data on to device - CUDF_CUDA_TRY(cudaMemcpyAsync( // - chunk.data(), - h_ticket.buffer.data(), - read_size, - cudaMemcpyHostToDevice, - stream.value())); + CUDF_CUDA_TRY(cudaMemcpyAsync( + chunk.data(), h_ticket.buffer.data(), read_size, cudaMemcpyHostToDevice, stream.value())); // record the host-to-device copy. CUDF_CUDA_TRY(cudaEventRecord(h_ticket.event, stream.value())); @@ -100,8 +178,8 @@ class istream_data_chunk_reader : public data_chunk_reader { private: std::size_t _next_ticket_idx = 0; + std::array _tickets{}; std::unique_ptr _datastream; - std::vector _tickets; }; /** @@ -180,6 +258,21 @@ class device_span_data_chunk_reader : public data_chunk_reader { uint64_t _position = 0; }; +/** + * @brief A datasource-based data chunk source which creates a datasource_chunk_reader. + */ +class datasource_chunk_source : public data_chunk_source { + public: + datasource_chunk_source(datasource& source) : _source(&source) {} + [[nodiscard]] std::unique_ptr create_reader() const override + { + return std::make_unique(_source); + } + + private: + datasource* _source; +}; + /** * @brief A file data source which creates an istream_data_chunk_reader. */ @@ -228,6 +321,11 @@ class device_span_data_chunk_source : public data_chunk_source { } // namespace +std::unique_ptr make_source(datasource& data) +{ + return std::make_unique(data); +} + std::unique_ptr make_source(host_span data) { return std::make_unique(data); diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 133c5fe9826..1177be6b63f 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -14,24 +14,23 @@ * limitations under the License. 
*/ -// Can be removed once we use Thrust 1.16+ -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wpragmas" -#pragma GCC diagnostic ignored "-Wsizeof-array-div" +#include #include #include +#include #include #include #include #include #include #include +#include #include +#include #include #include -#include #include #include #include @@ -39,57 +38,19 @@ #include #include +#include +#include #include #include #include -#pragma GCC diagnostic pop - +#include +#include #include #include #include -namespace cudf { - -/** - * @brief A device span consisting of two separate device_spans acting as if they were part of a - * single span. The first head.size() entries are served from the first span, the remaining - * tail.size() entries are served from the second span. - * - * @tparam T The type of elements in the span. - */ -template -class split_device_span { - public: - explicit constexpr split_device_span(device_span head, device_span tail = {}) - : _head{head}, _tail{tail} - { - } - - [[nodiscard]] constexpr T& operator[](size_type i) - { - return i < _head.size() ? _head[i] : _tail[i - _head.size()]; - } - - [[nodiscard]] constexpr const T& operator[](size_type i) const - { - return i < _head.size() ? _head[i] : _tail[i - _head.size()]; - } - - [[nodiscard]] constexpr size_type size() const { return _head.size() + _tail.size(); } - - [[nodiscard]] constexpr device_span head() const { return _head; } - - [[nodiscard]] constexpr device_span tail() const { return _tail; } - - private: - device_span _head; - device_span _tail; -}; - -} // namespace cudf - namespace { using cudf::io::text::detail::multistate; @@ -160,6 +121,10 @@ struct PatternScan { } }; +// type aliases to distinguish between row offsets and character offsets +using output_offset = int64_t; +using byte_offset = int64_t; + // multibyte_split works by splitting up inputs in to 32 inputs (bytes) per thread, and transforming // them in to data structures called "multistates". 
these multistates are created by searching a // trie, but instead of a tradition trie where the search begins at a single node at the beginning, @@ -170,35 +135,11 @@ struct PatternScan { // it begins in. From there, each thread can then take deterministic action. In this case, the // deterministic action is counting and outputting delimiter offsets when a delimiter is found. -// This struct provides output offsets that are only incremented until a cutoff point. -struct cutoff_offset { - // magnitude stores the offset, sign bit stores whether we are past the cutoff - int64_t value = 0; - - constexpr cutoff_offset() = default; - - constexpr cutoff_offset(int64_t offset, bool is_past_cutoff) - : value{is_past_cutoff ? -offset : offset} - { - } - - [[nodiscard]] constexpr int64_t offset() const { return value < 0 ? -value : value; } - - [[nodiscard]] constexpr bool is_past_end() { return value < 0; } - - friend constexpr cutoff_offset operator+(cutoff_offset lhs, cutoff_offset rhs) - { - auto const past_end = lhs.is_past_end() or rhs.is_past_end(); - auto const offset = lhs.offset() + (lhs.is_past_end() ? 
0 : rhs.offset()); - return cutoff_offset{offset, past_end}; - } -}; - __global__ void multibyte_split_init_kernel( cudf::size_type base_tile_idx, cudf::size_type num_tiles, cudf::io::text::detail::scan_tile_state_view tile_multistates, - cudf::io::text::detail::scan_tile_state_view tile_output_offsets, + cudf::io::text::detail::scan_tile_state_view tile_output_offsets, cudf::io::text::detail::scan_tile_status status = cudf::io::text::detail::scan_tile_status::invalid) { @@ -212,9 +153,9 @@ __global__ void multibyte_split_init_kernel( __global__ void multibyte_split_seed_kernel( cudf::io::text::detail::scan_tile_state_view tile_multistates, - cudf::io::text::detail::scan_tile_state_view tile_output_offsets, + cudf::io::text::detail::scan_tile_state_view tile_output_offsets, multistate tile_multistate_seed, - cutoff_offset tile_output_offset) + output_offset tile_output_offset) { auto const thread_idx = blockIdx.x * blockDim.x + threadIdx.x; if (thread_idx == 0) { @@ -225,19 +166,18 @@ __global__ void multibyte_split_seed_kernel( __global__ __launch_bounds__(THREADS_PER_TILE) void multibyte_split_kernel( cudf::size_type base_tile_idx, - int64_t base_input_offset, - int64_t base_offset_offset, + byte_offset base_input_offset, + output_offset base_output_offset, cudf::io::text::detail::scan_tile_state_view tile_multistates, - cudf::io::text::detail::scan_tile_state_view tile_output_offsets, + cudf::io::text::detail::scan_tile_state_view tile_output_offsets, cudf::device_span delim, cudf::device_span chunk_input_chars, - int64_t byte_range_end, - cudf::split_device_span output_offsets) + cudf::split_device_span row_offsets) { using InputLoad = cub::BlockLoad; - using OffsetScan = cub::BlockScan; - using OffsetScanCallback = cudf::io::text::detail::scan_tile_state_callback; + using OffsetScan = cub::BlockScan; + using OffsetScanCallback = cudf::io::text::detail::scan_tile_state_callback; __shared__ union { typename InputLoad::TempStorage input_load; @@ -269,17 +209,15 
@@ __global__ __launch_bounds__(THREADS_PER_TILE) void multibyte_split_kernel( // STEP 3: Flag matches - cutoff_offset thread_offset; + output_offset thread_offset{}; uint32_t thread_match_mask[(ITEMS_PER_THREAD + 31) / 32]{}; for (int32_t i = 0; i < ITEMS_PER_THREAD; i++) { - thread_multistate = transition(thread_chars[i], thread_multistate, delim); - auto const thread_state = thread_multistate.max_tail(); - auto const is_match = i < thread_input_size and thread_state == delim.size(); - auto const match_end = base_input_offset + thread_input_offset + i + 1; - auto const is_past_range = match_end >= byte_range_end; + thread_multistate = transition(thread_chars[i], thread_multistate, delim); + auto const thread_state = thread_multistate.max_tail(); + auto const is_match = i < thread_input_size and thread_state == delim.size(); thread_match_mask[i / 32] |= uint32_t{is_match} << (i % 32); - thread_offset = thread_offset + cutoff_offset{is_match, is_past_range}; + thread_offset += output_offset{is_match}; } // STEP 4: Scan flags to determine absolute thread output offset @@ -293,29 +231,27 @@ __global__ __launch_bounds__(THREADS_PER_TILE) void multibyte_split_kernel( for (int32_t i = 0; i < ITEMS_PER_THREAD; i++) { auto const is_match = (thread_match_mask[i / 32] >> (i % 32)) & 1u; - if (is_match && !thread_offset.is_past_end()) { - auto const match_end = base_input_offset + thread_input_offset + i + 1; - auto const is_past_range = match_end >= byte_range_end; - output_offsets[thread_offset.offset() - base_offset_offset] = match_end; - thread_offset = thread_offset + cutoff_offset{true, is_past_range}; + if (is_match) { + auto const match_end = base_input_offset + thread_input_offset + i + 1; + row_offsets[thread_offset - base_output_offset] = match_end; + thread_offset++; } } } __global__ __launch_bounds__(THREADS_PER_TILE) void byte_split_kernel( cudf::size_type base_tile_idx, - int64_t base_input_offset, - int64_t base_offset_offset, - 
cudf::io::text::detail::scan_tile_state_view tile_output_offsets, + byte_offset base_input_offset, + output_offset base_output_offset, + cudf::io::text::detail::scan_tile_state_view tile_output_offsets, char delim, cudf::device_span chunk_input_chars, - int64_t byte_range_end, - cudf::split_device_span output_offsets) + cudf::split_device_span row_offsets) { using InputLoad = cub::BlockLoad; - using OffsetScan = cub::BlockScan; - using OffsetScanCallback = cudf::io::text::detail::scan_tile_state_callback; + using OffsetScan = cub::BlockScan; + using OffsetScanCallback = cudf::io::text::detail::scan_tile_state_callback; __shared__ union { typename InputLoad::TempStorage input_load; @@ -338,15 +274,13 @@ __global__ __launch_bounds__(THREADS_PER_TILE) void byte_split_kernel( // STEP 2: Flag matches - cutoff_offset thread_offset; + output_offset thread_offset{}; uint32_t thread_match_mask[(ITEMS_PER_THREAD + 31) / 32]{}; for (int32_t i = 0; i < ITEMS_PER_THREAD; i++) { - auto const is_match = i < thread_input_size and thread_chars[i] == delim; - auto const match_end = base_input_offset + thread_input_offset + i + 1; - auto const is_past_range = match_end >= byte_range_end; + auto const is_match = i < thread_input_size and thread_chars[i] == delim; thread_match_mask[i / 32] |= uint32_t{is_match} << (i % 32); - thread_offset = thread_offset + cutoff_offset{is_match, is_past_range}; + thread_offset += output_offset{is_match}; } // STEP 3: Scan flags to determine absolute thread output offset @@ -360,11 +294,10 @@ __global__ __launch_bounds__(THREADS_PER_TILE) void byte_split_kernel( for (int32_t i = 0; i < ITEMS_PER_THREAD; i++) { auto const is_match = (thread_match_mask[i / 32] >> (i % 32)) & 1u; - if (is_match && !thread_offset.is_past_end()) { - auto const match_end = base_input_offset + thread_input_offset + i + 1; - auto const is_past_range = match_end >= byte_range_end; - output_offsets[thread_offset.offset() - base_offset_offset] = match_end; - thread_offset = 
thread_offset + cutoff_offset{true, is_past_range}; + if (is_match) { + auto const match_end = base_input_offset + thread_input_offset + i + 1; + row_offsets[thread_offset - base_output_offset] = match_end; + thread_offset++; } } } @@ -407,173 +340,10 @@ std::vector get_streams(int32_t count, rmm::cuda_stream_p return streams; } -/** - * @brief A chunked storage class that provides preallocated memory for algorithms with known - * worst-case output size. It provides functionality to retrieve the next chunk to write to, for - * reporting how much memory was actually written and for gathering all previously written outputs - * into a single contiguous vector. - * - * @tparam T The output element type. - */ -template -class output_builder { - public: - using size_type = typename rmm::device_uvector::size_type; - - /** - * @brief Initializes an output builder with given worst-case output size and stream. - * - * @param max_write_size the maximum number of elements that will be written into a - * split_device_span returned from `next_output`. - * @param stream the stream used to allocate the first chunk of memory. - * @param mr optional, the memory resource to use for allocation. - */ - output_builder(size_type max_write_size, - size_type max_growth, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) - : _size{0}, _max_write_size{max_write_size}, _max_growth{max_growth} - { - CUDF_EXPECTS(max_write_size > 0, "Internal error"); - _chunks.emplace_back(0, stream, mr); - _chunks.back().reserve(max_write_size * 2, stream); - } - - output_builder(output_builder&&) = delete; - output_builder(const output_builder&) = delete; - output_builder& operator=(output_builder&&) = delete; - output_builder& operator=(const output_builder&) = delete; - - /** - * @brief Returns the next free chunk of `max_write_size` elements from the underlying storage. 
- * Must be followed by a call to `advance_output` after the memory has been written to. - * - * @param stream The stream to allocate a new chunk of memory with, if necessary. - * This should be the stream that will write to the `split_device_span`. - * @return A `split_device_span` starting directly after the last output and providing at least - * `max_write_size` entries of storage. - */ - [[nodiscard]] split_device_span next_output(rmm::cuda_stream_view stream) - { - auto head_it = _chunks.end() - (_chunks.size() > 1 and _chunks.back().is_empty() ? 2 : 1); - auto head_span = get_free_span(*head_it); - if (head_span.size() >= _max_write_size) { return split_device_span{head_span}; } - if (head_it == _chunks.end() - 1) { - // insert a new vector of double size - auto const next_chunk_size = - std::min(_max_growth * _max_write_size, 2 * _chunks.back().capacity()); - _chunks.emplace_back(0, stream, _chunks.back().memory_resource()); - _chunks.back().reserve(next_chunk_size, stream); - } - auto tail_span = get_free_span(_chunks.back()); - CUDF_EXPECTS(head_span.size() + tail_span.size() >= _max_write_size, "Internal error"); - return split_device_span{head_span, tail_span}; - } - - /** - * @brief Advances the output sizes after a `split_device_span` returned from `next_output` was - * written to. - * - * @param actual_size The number of elements that were written to the result of the previous - * `next_output` call. 
- */ - void advance_output(size_type actual_size) - { - CUDF_EXPECTS(actual_size <= _max_write_size, "Internal error"); - if (_chunks.size() < 2) { - auto const new_size = _chunks.back().size() + actual_size; - inplace_resize(_chunks.back(), new_size); - } else { - auto& tail = _chunks.back(); - auto& prev = _chunks.rbegin()[1]; - auto const prev_advance = std::min(actual_size, prev.capacity() - prev.size()); - auto const tail_advance = actual_size - prev_advance; - inplace_resize(prev, prev.size() + prev_advance); - inplace_resize(tail, tail.size() + tail_advance); - } - _size += actual_size; - } - - /** - * @brief Returns the first element that was written to the output. - * Requires a previous call to `next_output` and `advance_output` and `size() > 0`. - * @param stream The stream used to access the element. - * @return The first element that was written to the output. - */ - [[nodiscard]] T front_element(rmm::cuda_stream_view stream) const - { - return _chunks.front().front_element(stream); - } - - /** - * @brief Returns the last element that was written to the output. - * Requires a previous call to `next_output` and `advance_output` and `size() > 0`. - * @param stream The stream used to access the element. - * @return The last element that was written to the output. - */ - [[nodiscard]] T back_element(rmm::cuda_stream_view stream) const - { - auto const& last_nonempty_chunk = - _chunks.size() > 1 and _chunks.back().is_empty() ? _chunks.rbegin()[1] : _chunks.back(); - return last_nonempty_chunk.back_element(stream); - } - - [[nodiscard]] size_type size() const { return _size; } - - /** - * @brief Gathers all previously written outputs into a single contiguous vector. - * - * @param stream The stream used to allocate and gather the output vector. All previous write - * operations to the output buffer must have finished or happened on this stream. - * @param mr The memory resource used to allocate the output vector. - * @return The output vector. 
- */ - rmm::device_uvector gather(rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const - { - rmm::device_uvector output{size(), stream, mr}; - auto output_it = output.begin(); - for (auto const& chunk : _chunks) { - output_it = thrust::copy( - rmm::exec_policy_nosync(stream), chunk.begin(), chunk.begin() + chunk.size(), output_it); - } - return output; - } - - private: - /** - * @brief Resizes a vector without reallocating - * - * @param vector The vector - * @param new_size The new size. Must be smaller than the vector's capacity - */ - static void inplace_resize(rmm::device_uvector& vector, size_type new_size) - { - CUDF_EXPECTS(new_size <= vector.capacity(), "Internal error"); - vector.resize(new_size, rmm::cuda_stream_view{}); - } - - /** - * @brief Returns the span consisting of all currently unused elements in the vector - * (`i >= size() and i < capacity()`). - * - * @param vector The vector. - * @return The span of unused elements. - */ - static device_span get_free_span(rmm::device_uvector& vector) - { - return device_span{vector.data() + vector.size(), vector.capacity() - vector.size()}; - } - - size_type _size; - size_type _max_write_size; - size_type _max_growth; - std::vector> _chunks; -}; - std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source const& source, std::string const& delimiter, byte_range_info byte_range, + bool strip_delimiters, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr, rmm::cuda_stream_pool& stream_pool) @@ -611,7 +381,7 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source // best when at least 32 more than max possible concurrent tiles, due to rolling `invalid`s auto num_tile_states = std::max(32, TILES_PER_CHUNK * concurrency + 32); auto tile_multistates = scan_tile_state(num_tile_states, stream); - auto tile_offsets = scan_tile_state(num_tile_states, stream); + auto tile_offsets = scan_tile_state(num_tile_states, stream); multibyte_split_init_kernel<< 
multibyte_split(cudf::io::text::data_chunk_source tile_multistates, tile_offsets, multistate_seed, - {}); + 0); auto reader = source.create_reader(); - auto chunk_offset = std::max(0, byte_range.offset() - delimiter.size()); + auto chunk_offset = std::max(0, byte_range.offset() - delimiter.size()); auto const byte_range_end = byte_range.offset() + byte_range.size(); reader->skip_bytes(chunk_offset); // amortize output chunk allocations over 8 worst-case outputs. This limits the overallocation constexpr auto max_growth = 8; - output_builder offset_storage(ITEMS_PER_CHUNK, max_growth, stream); + output_builder row_offset_storage(ITEMS_PER_CHUNK, max_growth, stream); output_builder char_storage(ITEMS_PER_CHUNK, max_growth, stream); fork_stream(streams, stream); @@ -653,22 +423,23 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source auto& scan_stream = streams[1]; auto chunk = reader->get_next_chunk(ITEMS_PER_CHUNK, read_stream); int64_t base_tile_idx = 0; - std::optional first_offset; - std::optional last_offset; - if (byte_range.offset() == 0) { first_offset = 0; } + std::optional first_row_offset; + std::optional last_row_offset; + bool found_last_offset = false; + if (byte_range.offset() == 0) { first_row_offset = 0; } std::swap(read_stream, scan_stream); while (chunk->size() > 0) { // if we found the last delimiter, or didn't find delimiters inside the byte range at all: abort - if (last_offset.has_value() or - (not first_offset.has_value() and chunk_offset >= byte_range_end)) { + if (last_row_offset.has_value() or + (not first_row_offset.has_value() and chunk_offset >= byte_range_end)) { break; } auto tiles_in_launch = cudf::util::div_rounding_up_safe(chunk->size(), static_cast(ITEMS_PER_TILE)); - auto offset_output = offset_storage.next_output(scan_stream); + auto row_offsets = row_offset_storage.next_output(scan_stream); // reset the next chunk of tile state multibyte_split_init_kernel<< multibyte_split(cudf::io::text::data_chunk_source 
scan_stream.value()>>>( // base_tile_idx, chunk_offset, - offset_storage.size(), + row_offset_storage.size(), tile_offsets, delimiter[0], *chunk, - byte_range_end, - offset_output); + row_offsets); } else { multibyte_split_kernel<< multibyte_split(cudf::io::text::data_chunk_source scan_stream.value()>>>( // base_tile_idx, chunk_offset, - offset_storage.size(), + row_offset_storage.size(), tile_multistates, tile_offsets, {device_delim.data(), static_cast(device_delim.size())}, *chunk, - byte_range_end, - offset_output); + row_offsets); } // load the next chunk auto next_chunk = reader->get_next_chunk(ITEMS_PER_CHUNK, read_stream); // while that is running, determine how many offsets we output (synchronizes) - auto next_tile_offset = - tile_offsets.get_inclusive_prefix(base_tile_idx + tiles_in_launch - 1, scan_stream); - offset_storage.advance_output(next_tile_offset.offset() - offset_storage.size()); + auto const new_offsets = [&] { + auto const new_offsets_unclamped = + tile_offsets.get_inclusive_prefix(base_tile_idx + tiles_in_launch - 1, scan_stream) - + static_cast(row_offset_storage.size()); + // if we are not in the last chunk, we can use all offsets + if (chunk_offset + static_cast(chunk->size()) < byte_range_end) { + return new_offsets_unclamped; + } + // if we are in the last chunk, we need to find the first out-of-bounds offset + auto const it = thrust::make_counting_iterator(output_offset{}); + auto const end_loc = + *thrust::find_if(rmm::exec_policy_nosync(scan_stream), + it, + it + new_offsets_unclamped, + [row_offsets, byte_range_end] __device__(output_offset i) { + return row_offsets[i] >= byte_range_end; + }); + // if we had no out-of-bounds offset, we copy all offsets + if (end_loc == new_offsets_unclamped) { return end_loc; } + // otherwise we copy only up to (including) the first out-of-bounds delimiter + found_last_offset = true; + return end_loc + 1; + }(); + row_offset_storage.advance_output(new_offsets, scan_stream); // determine if we found 
the first or last field offset for the byte range - if (next_tile_offset.offset() > 0 and not first_offset) { - first_offset = offset_storage.front_element(scan_stream); + if (new_offsets > 0 and not first_row_offset) { + first_row_offset = row_offset_storage.front_element(scan_stream); } - if (next_tile_offset.is_past_end()) { last_offset = offset_storage.back_element(scan_stream); } + if (found_last_offset) { last_row_offset = row_offset_storage.back_element(scan_stream); } // copy over the characters we need, if we already encountered the first field delimiter - if (first_offset.has_value()) { - auto const begin = chunk->data() + std::max(0, *first_offset - chunk_offset); - auto const sentinel = last_offset.value_or(std::numeric_limits::max()); - auto const end = chunk->data() + std::min(sentinel - chunk_offset, chunk->size()); + if (first_row_offset.has_value()) { + auto const begin = chunk->data() + std::max(0, *first_row_offset - chunk_offset); + auto const sentinel = last_row_offset.value_or(std::numeric_limits::max()); + auto const end = + chunk->data() + std::min(sentinel - chunk_offset, chunk->size()); auto const output_size = end - begin; auto char_output = char_storage.next_output(scan_stream); - auto const split = begin + std::min(output_size, char_output.head().size()); - thrust::copy(rmm::exec_policy_nosync(scan_stream), begin, split, char_output.head().begin()); - thrust::copy(rmm::exec_policy_nosync(scan_stream), split, end, char_output.tail().begin()); - char_storage.advance_output(output_size); + thrust::copy(rmm::exec_policy_nosync(scan_stream), begin, end, char_output.begin()); + char_storage.advance_output(output_size, scan_stream); } cudaEventRecord(last_launch_event, scan_stream.value()); std::swap(read_stream, scan_stream); - base_tile_idx += TILES_PER_CHUNK; + base_tile_idx += tiles_in_launch; chunk_offset += chunk->size(); chunk = std::move(next_chunk); } @@ -750,30 +539,54 @@ std::unique_ptr 
multibyte_split(cudf::io::text::data_chunk_source // if the input was empty, we didn't find a delimiter at all, // or the first delimiter was also the last: empty output - if (chunk_offset == 0 or not first_offset.has_value() or first_offset == last_offset) { + if (chunk_offset == 0 or not first_row_offset.has_value() or + first_row_offset == last_row_offset) { return make_empty_column(type_id::STRING); } auto chars = char_storage.gather(stream, mr); - auto global_offsets = offset_storage.gather(stream, mr); - - bool const insert_begin = *first_offset == 0; - bool const insert_end = not last_offset.has_value() or last_offset == chunk_offset; + auto global_offsets = row_offset_storage.gather(stream, mr); + + // insert an offset at the beginning if we started at the beginning of the input + bool const insert_begin = first_row_offset.value_or(0) == 0; + // insert an offset at the end if we have not terminated the last row + bool const insert_end = + not(last_row_offset.has_value() or + (global_offsets.size() > 0 and global_offsets.back_element(stream) == chunk_offset)); rmm::device_uvector offsets{ global_offsets.size() + insert_begin + insert_end, stream, mr}; if (insert_begin) { offsets.set_element_to_zero_async(0, stream); } - if (insert_end) { offsets.set_element(offsets.size() - 1, chunk_offset - *first_offset, stream); } + if (insert_end) { + offsets.set_element(offsets.size() - 1, chunk_offset - *first_row_offset, stream); + } thrust::transform(rmm::exec_policy(stream), global_offsets.begin(), global_offsets.end(), offsets.begin() + insert_begin, - [baseline = *first_offset] __device__(int64_t global_offset) { + [baseline = *first_row_offset] __device__(byte_offset global_offset) { return static_cast(global_offset - baseline); }); - auto string_count = offsets.size() - 1; - - return cudf::make_strings_column(string_count, std::move(offsets), std::move(chars)); + if (strip_delimiters) { + auto it = cudf::detail::make_counting_transform_iterator( + 0, + [ofs = 
offsets.data(), + chars = chars.data(), + delim_size = static_cast(delimiter.size()), + last_row = static_cast(string_count) - 1, + insert_end] __device__(size_type row) { + auto const begin = ofs[row]; + auto const len = ofs[row + 1] - begin; + if (row == last_row && insert_end) { + return thrust::make_pair(chars + begin, len); + } else { + return thrust::make_pair(chars + begin, std::max(0, len - delim_size)); + }; + }); + return cudf::strings::detail::make_strings_column(it, it + string_count, stream, mr); + } else { + return cudf::make_strings_column(string_count, std::move(offsets), std::move(chars)); + } } } // namespace detail @@ -783,11 +596,20 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source std::optional byte_range, rmm::mr::device_memory_resource* mr) { - auto stream = cudf::default_stream_value; + return multibyte_split( + source, delimiter, parse_options{byte_range.value_or(create_byte_range_info_max())}, mr); +} + +std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source const& source, + std::string const& delimiter, + parse_options options, + rmm::mr::device_memory_resource* mr) +{ + auto stream = cudf::get_default_stream(); auto stream_pool = rmm::cuda_stream_pool(2); auto result = detail::multibyte_split( - source, delimiter, byte_range.value_or(create_byte_range_info_max()), stream, mr, stream_pool); + source, delimiter, options.byte_range, options.strip_delimiters, stream, mr, stream_pool); return result; } @@ -796,7 +618,7 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source std::string const& delimiter, rmm::mr::device_memory_resource* mr) { - return multibyte_split(source, delimiter, std::nullopt, mr); + return multibyte_split(source, delimiter, parse_options{}, mr); } } // namespace text diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp index e2d209a7c0a..89ba5c598e8 100644 --- a/cpp/src/io/utilities/column_buffer.cpp +++ 
b/cpp/src/io/utilities/column_buffer.cpp @@ -22,6 +22,7 @@ #include "column_buffer.hpp" #include #include +#include namespace cudf { namespace io { @@ -54,6 +55,33 @@ void column_buffer::create(size_type _size, } } +namespace { + +/** + * @brief Recursively copy `name` and `user_data` fields of one buffer to another. + * + * @param buff The old output buffer + * @param new_buff The new output buffer + */ +void copy_buffer_data(column_buffer const& buff, column_buffer& new_buff) +{ + new_buff.name = buff.name; + new_buff.user_data = buff.user_data; + for (auto const& child : buff.children) { + auto& new_child = new_buff.children.emplace_back(column_buffer(child.type, child.is_nullable)); + copy_buffer_data(child, new_child); + } +} + +} // namespace + +column_buffer column_buffer::empty_like(column_buffer const& input) +{ + auto new_buff = column_buffer(input.type, input.is_nullable); + copy_buffer_data(input, new_buff); + return new_buff; +} + /** * @copydoc cudf::io::detail::make_column */ @@ -78,7 +106,19 @@ std::unique_ptr make_column(column_buffer& buffer, // convert to binary auto const string_col = make_strings_column(*buffer._strings, stream, mr); auto const num_rows = string_col->size(); - auto col_contest = string_col->release(); + auto col_content = string_col->release(); + + // convert to uint8 column, strings are currently stores as int8 + auto contents = + col_content.children[strings_column_view::chars_column_index].release()->release(); + auto data = contents.data.release(); + auto null_mask = contents.null_mask.release(); + + auto uint8_col = std::make_unique(data_type{type_id::UINT8}, + data->size(), + std::move(*data), + std::move(*null_mask), + UNKNOWN_NULL_COUNT); if (schema_info != nullptr) { schema_info->children.push_back(column_name_info{"offsets"}); @@ -87,10 +127,10 @@ std::unique_ptr make_column(column_buffer& buffer, return make_lists_column( num_rows, - std::move(col_contest.children[strings_column_view::offsets_column_index]), - 
std::move(col_contest.children[strings_column_view::chars_column_index]), + std::move(col_content.children[strings_column_view::offsets_column_index]), + std::move(uint8_col), UNKNOWN_NULL_COUNT, - std::move(*col_contest.null_mask)); + std::move(*col_content.null_mask)); } case type_id::LIST: { diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp index 8ae3d39a3ba..8f181157fae 100644 --- a/cpp/src/io/utilities/column_buffer.hpp +++ b/cpp/src/io/utilities/column_buffer.hpp @@ -104,10 +104,14 @@ struct column_buffer { { return static_cast(_null_mask.data()); } - auto null_mask_size() { return _null_mask.size(); }; + auto null_mask_size() { return _null_mask.size(); } auto& null_count() { return _null_count; } + // Create a new column_buffer that has empty data but with the same basic information as the + // input column, including same type, nullability, name, and user_data. + static column_buffer empty_like(column_buffer const& input); + std::unique_ptr> _strings; rmm::device_buffer _data{}; rmm::device_buffer _null_mask{}; diff --git a/cpp/src/io/utilities/column_type_histogram.hpp b/cpp/src/io/utilities/column_type_histogram.hpp index 8bd2d3a89cf..88f4e58f9b1 100644 --- a/cpp/src/io/utilities/column_type_histogram.hpp +++ b/cpp/src/io/utilities/column_type_histogram.hpp @@ -33,6 +33,11 @@ struct column_type_histogram { cudf::size_type positive_small_int_count{}; cudf::size_type big_int_count{}; cudf::size_type bool_count{}; + auto total_count() const + { + return null_count + float_count + datetime_count + string_count + negative_small_int_count + + positive_small_int_count + big_int_count + bool_count; + } }; } // namespace io diff --git a/cpp/src/io/utilities/column_utils.cuh b/cpp/src/io/utilities/column_utils.cuh index fbeaaa9c0fc..598c93a1a4f 100644 --- a/cpp/src/io/utilities/column_utils.cuh +++ b/cpp/src/io/utilities/column_utils.cuh @@ -64,7 +64,7 @@ rmm::device_uvector create_leaf_column_device_views( iter, iter + 
parent_table_device_view.num_columns(), [col_desc, parent_col_view = parent_table_device_view, leaf_columns] __device__( - size_type index) mutable { + size_type index) { col_desc[index].parent_column = parent_col_view.begin() + index; column_device_view col = parent_col_view.column(index); // traverse till leaf column @@ -74,7 +74,7 @@ rmm::device_uvector create_leaf_column_device_views( : col.child(0); // stop early if writing a byte array if (col_desc[index].stats_dtype == dtype_byte_array && - (child.type().id() == type_id::INT8 || child.type().id() == type_id::UINT8)) { + child.type().id() == type_id::UINT8) { break; } col = child; diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp index c0dd85702e2..2484a36143a 100644 --- a/cpp/src/io/utilities/file_io_utilities.cpp +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -257,11 +257,20 @@ std::future cufile_output_impl::write_async(void const* data, size_t offse // writes. return std::async(std::launch::deferred, waiter, std::move(slice_tasks)); } +#else +cufile_input_impl::cufile_input_impl(std::string const& filepath) +{ + CUDF_FAIL("Cannot create cuFile source, current build was compiled without cuFile headers"); +} + +cufile_output_impl::cufile_output_impl(std::string const& filepath) +{ + CUDF_FAIL("Cannot create cuFile sink, current build was compiled without cuFile headers"); +} #endif std::unique_ptr make_cufile_input(std::string const& filepath) { -#ifdef CUFILE_FOUND if (cufile_integration::is_gds_enabled()) { try { return std::make_unique(filepath); @@ -269,13 +278,11 @@ std::unique_ptr make_cufile_input(std::string const& filepath if (cufile_integration::is_always_enabled()) throw; } } -#endif return nullptr; } std::unique_ptr make_cufile_output(std::string const& filepath) { -#ifdef CUFILE_FOUND if (cufile_integration::is_gds_enabled()) { try { return std::make_unique(filepath); @@ -283,7 +290,6 @@ std::unique_ptr make_cufile_output(std::string 
const& filepa if (cufile_integration::is_always_enabled()) throw; } } -#endif return nullptr; } diff --git a/cpp/src/io/utilities/file_io_utilities.hpp b/cpp/src/io/utilities/file_io_utilities.hpp index 704ee77de8a..38674892966 100644 --- a/cpp/src/io/utilities/file_io_utilities.hpp +++ b/cpp/src/io/utilities/file_io_utilities.hpp @@ -194,6 +194,7 @@ class cufile_output_impl final : public cufile_output { class cufile_input_impl final : public cufile_input { public: + cufile_input_impl(std::string const& filepath); std::future read_async(size_t offset, size_t size, uint8_t* dst, @@ -205,6 +206,7 @@ class cufile_input_impl final : public cufile_input { class cufile_output_impl final : public cufile_output { public: + cufile_output_impl(std::string const& filepath); std::future write_async(void const* data, size_t offset, size_t size) override { CUDF_FAIL("Only used to compile without cufile library, should not be called"); diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp index b5e59871119..77dade24009 100644 --- a/cpp/src/io/utilities/hostdevice_vector.hpp +++ b/cpp/src/io/utilities/hostdevice_vector.hpp @@ -16,6 +16,7 @@ #pragma once +#include #include #include #include @@ -24,7 +25,6 @@ #include #include -#include /** * @brief A helper class that wraps fixed-length device memory for the GPU, and @@ -40,7 +40,7 @@ class hostdevice_vector { public: using value_type = T; - hostdevice_vector() : hostdevice_vector(0, cudf::default_stream_value) {} + hostdevice_vector() : hostdevice_vector(0, cudf::get_default_stream()) {} explicit hostdevice_vector(size_t size, rmm::cuda_stream_view stream) : hostdevice_vector(size, size, stream) @@ -126,7 +126,7 @@ class hostdevice_vector { } private: - thrust::host_vector> h_data; + thrust::host_vector> h_data; rmm::device_uvector d_data; }; diff --git a/cpp/src/io/utilities/output_builder.cuh b/cpp/src/io/utilities/output_builder.cuh new file mode 100644 index 
00000000000..e45143480fc --- /dev/null +++ b/cpp/src/io/utilities/output_builder.cuh @@ -0,0 +1,357 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include + +#include + +#include + +namespace cudf { + +template +class split_device_span_iterator; + +/** + * @brief A device span consisting of two separate device_spans acting as if they were part of a + * single span. The first head.size() entries are served from the first span, the remaining + * tail.size() entries are served from the second span. + * + * @tparam T The type of elements in the span. + */ +template +class split_device_span { + public: + using element_type = T; + using value_type = std::remove_cv; + using size_type = std::size_t; + using difference_type = std::ptrdiff_t; + using pointer = T*; + using iterator = split_device_span_iterator; + using const_pointer = T const*; + using reference = T&; + using const_reference = T const&; + + split_device_span() = default; + + explicit constexpr split_device_span(device_span head, device_span tail = {}) + : _head{head}, _tail{tail} + { + } + + [[nodiscard]] constexpr reference operator[](size_type i) const + { + return i < _head.size() ? 
_head[i] : _tail[i - _head.size()]; + } + + [[nodiscard]] constexpr size_type size() const { return _head.size() + _tail.size(); } + + [[nodiscard]] constexpr device_span head() const { return _head; } + + [[nodiscard]] constexpr device_span tail() const { return _tail; } + + [[nodiscard]] constexpr iterator begin() const; + + [[nodiscard]] constexpr iterator end() const; + + private: + device_span _head; + device_span _tail; +}; + +/** + * @brief A random access iterator indexing into a split_device_span. + * + * @tparam T The type of elements in the underlying span. + */ +template +class split_device_span_iterator { + using it = split_device_span_iterator; + + public: + using size_type = std::size_t; + using difference_type = std::ptrdiff_t; + using value_type = T; + using pointer = value_type*; + using reference = value_type&; + using iterator_category = std::random_access_iterator_tag; + + split_device_span_iterator() = default; + + constexpr split_device_span_iterator(split_device_span span, size_type offset) + : _span{span}, _offset{offset} + { + } + + [[nodiscard]] constexpr reference operator*() const { return _span[_offset]; } + + [[nodiscard]] constexpr reference operator[](size_type i) const { return _span[_offset + i]; } + + [[nodiscard]] constexpr friend bool operator==(const it& lhs, const it& rhs) + { + return lhs._offset == rhs._offset; + } + + [[nodiscard]] constexpr friend bool operator!=(const it& lhs, const it& rhs) + { + return !(lhs == rhs); + } + [[nodiscard]] constexpr friend bool operator<(const it& lhs, const it& rhs) + { + return lhs._offset < rhs._offset; + } + + [[nodiscard]] constexpr friend bool operator>=(const it& lhs, const it& rhs) + { + return !(lhs < rhs); + } + + [[nodiscard]] constexpr friend bool operator>(const it& lhs, const it& rhs) { return rhs < lhs; } + + [[nodiscard]] constexpr friend bool operator<=(const it& lhs, const it& rhs) + { + return !(lhs > rhs); + } + + [[nodiscard]] constexpr friend difference_type 
operator-(const it& lhs, const it& rhs) + { + return lhs._offset - rhs._offset; + } + + [[nodiscard]] constexpr friend it operator+(it lhs, difference_type i) { return lhs += i; } + + constexpr it& operator+=(difference_type i) + { + _offset += i; + return *this; + } + + constexpr it& operator-=(difference_type i) { return *this += -i; } + + constexpr it& operator++() { return *this += 1; } + + constexpr it& operator--() { return *this -= 1; } + + constexpr it operator++(int) + { + auto result = *this; + ++*this; + return result; + } + + constexpr it operator--(int) + { + auto result = *this; + --*this; + return result; + } + + private: + split_device_span _span; + size_type _offset; +}; + +template +[[nodiscard]] constexpr split_device_span_iterator split_device_span::begin() const +{ + return {*this, 0}; +} + +template +[[nodiscard]] constexpr split_device_span_iterator split_device_span::end() const +{ + return {*this, size()}; +} + +/** + * @brief A chunked storage class that provides preallocated memory for algorithms with known + * worst-case output size. It provides functionality to retrieve the next chunk to write to, for + * reporting how much memory was actually written and for gathering all previously written outputs + * into a single contiguous vector. + * + * @tparam T The output element type. + */ +template +class output_builder { + public: + using size_type = typename rmm::device_uvector::size_type; + + /** + * @brief Initializes an output builder with given worst-case output size and stream. + * + * @param max_write_size the maximum number of elements that will be written into a + * split_device_span returned from `next_output`. + * @param stream the stream used to allocate the first chunk of memory. + * @param mr optional, the memory resource to use for allocation. 
+ */ + output_builder(size_type max_write_size, + size_type max_growth, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + : _size{0}, _max_write_size{max_write_size}, _max_growth{max_growth} + { + CUDF_EXPECTS(max_write_size > 0, "Internal error"); + _chunks.emplace_back(0, stream, mr); + _chunks.back().reserve(max_write_size * 2, stream); + } + + output_builder(output_builder&&) = delete; + output_builder(const output_builder&) = delete; + output_builder& operator=(output_builder&&) = delete; + output_builder& operator=(const output_builder&) = delete; + + /** + * @brief Returns the next free chunk of `max_write_size` elements from the underlying storage. + * Must be followed by a call to `advance_output` after the memory has been written to. + * + * @param stream The stream to allocate a new chunk of memory with, if necessary. + * This should be the stream that will write to the `split_device_span`. + * @return A `split_device_span` starting directly after the last output and providing at least + * `max_write_size` entries of storage. + */ + [[nodiscard]] split_device_span next_output(rmm::cuda_stream_view stream) + { + auto head_it = _chunks.end() - (_chunks.size() > 1 and _chunks.back().is_empty() ? 
2 : 1); + auto head_span = get_free_span(*head_it); + if (head_span.size() >= _max_write_size) { return split_device_span{head_span}; } + if (head_it == _chunks.end() - 1) { + // insert a new device_uvector of double size + auto const next_chunk_size = + std::min(_max_growth * _max_write_size, 2 * _chunks.back().capacity()); + _chunks.emplace_back(0, stream, _chunks.back().memory_resource()); + _chunks.back().reserve(next_chunk_size, stream); + } + auto tail_span = get_free_span(_chunks.back()); + CUDF_EXPECTS(head_span.size() + tail_span.size() >= _max_write_size, "Internal error"); + return split_device_span{head_span, tail_span}; + } + + /** + * @brief Advances the output sizes after a `split_device_span` returned from `next_output` was + * written to. + * + * @param actual_size The number of elements that were written to the result of the previous + * `next_output` call. + * @param stream The stream on which to resize the vectors. Since this function will not + * reallocate, this only changes the stream of the internally stored vectors, + * impacting their subsequent copy and destruction behavior. + */ + void advance_output(size_type actual_size, rmm::cuda_stream_view stream) + { + CUDF_EXPECTS(actual_size <= _max_write_size, "Internal error"); + if (_chunks.size() < 2) { + auto const new_size = _chunks.back().size() + actual_size; + inplace_resize(_chunks.back(), new_size, stream); + } else { + auto& tail = _chunks.back(); + auto& prev = _chunks.rbegin()[1]; + auto const prev_advance = std::min(actual_size, prev.capacity() - prev.size()); + auto const tail_advance = actual_size - prev_advance; + inplace_resize(prev, prev.size() + prev_advance, stream); + inplace_resize(tail, tail.size() + tail_advance, stream); + } + _size += actual_size; + } + + /** + * @brief Returns the first element that was written to the output. + * Requires a previous call to `next_output` and `advance_output` and `size() > 0`. + * @param stream The stream used to access the element. 
+ * @return The first element that was written to the output. + */ + [[nodiscard]] T front_element(rmm::cuda_stream_view stream) const + { + return _chunks.front().front_element(stream); + } + + /** + * @brief Returns the last element that was written to the output. + * Requires a previous call to `next_output` and `advance_output` and `size() > 0`. + * @param stream The stream used to access the element. + * @return The last element that was written to the output. + */ + [[nodiscard]] T back_element(rmm::cuda_stream_view stream) const + { + auto const& last_nonempty_chunk = + _chunks.size() > 1 and _chunks.back().is_empty() ? _chunks.rbegin()[1] : _chunks.back(); + return last_nonempty_chunk.back_element(stream); + } + + [[nodiscard]] size_type size() const { return _size; } + + /** + * @brief Gathers all previously written outputs into a single contiguous vector. + * + * @param stream The stream used to allocate and gather the output vector. All previous write + * operations to the output buffer must have finished or happened on this stream. + * @param mr The memory resource used to allocate the output vector. + * @return The output vector. + */ + rmm::device_uvector gather(rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const + { + rmm::device_uvector output{size(), stream, mr}; + auto output_it = output.begin(); + for (auto const& chunk : _chunks) { + output_it = thrust::copy( + rmm::exec_policy_nosync(stream), chunk.begin(), chunk.begin() + chunk.size(), output_it); + } + return output; + } + + private: + /** + * @brief Resizes a vector without reallocating + * + * @param vector The vector + * @param new_size The new size. Must be smaller than the vector's capacity + * @param stream The stream on which to resize the vector. Since this function will not + * reallocate, this only changes the stream of `vector`, impacting its subsequent + * copy and destruction behavior. 
+ */ + static void inplace_resize(rmm::device_uvector& vector, + size_type new_size, + rmm::cuda_stream_view stream) + { + CUDF_EXPECTS(new_size <= vector.capacity(), "Internal error"); + vector.resize(new_size, stream); + } + + /** + * @brief Returns the span consisting of all currently unused elements in the vector + * (`i >= size() and i < capacity()`). + * + * @param vector The vector. + * @return The span of unused elements. + */ + static device_span get_free_span(rmm::device_uvector& vector) + { + return device_span{vector.data() + vector.size(), vector.capacity() - vector.size()}; + } + + size_type _size; + size_type _max_write_size; + size_type _max_growth; + std::vector> _chunks; +}; + +} // namespace cudf diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 388c9b28001..89806956ae5 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -117,8 +117,9 @@ struct parse_options { }; /** - * @brief Returns the numeric value of an ASCII/UTF-8 character. Specialization - * for integral types. Handles hexadecimal digits, both uppercase and lowercase. + * @brief Returns the numeric value of an ASCII/UTF-8 character. + * Handles hexadecimal digits, both uppercase and lowercase + * for integral types and only decimal digits for floating point types. * If the character is not a valid numeric digit then `0` is returned and * valid_flag is set to false. * @@ -127,31 +128,14 @@ struct parse_options { * * @return uint8_t Numeric value of the character, or `0` */ -template )> -constexpr uint8_t decode_digit(char c, bool* valid_flag) -{ - if (c >= '0' && c <= '9') return c - '0'; - if (c >= 'a' && c <= 'f') return c - 'a' + 10; - if (c >= 'A' && c <= 'F') return c - 'A' + 10; - - *valid_flag = false; - return 0; -} - -/** - * @brief Returns the numeric value of an ASCII/UTF-8 character. Specialization - * for non-integral types. Handles only decimal digits. 
If the character is not - * a valid numeric digit then `0` is returned and valid_flag is set to false. - * - * @param c ASCII or UTF-8 character - * @param valid_flag Set to false if input is not valid. Unchanged otherwise. - * - * @return uint8_t Numeric value of the character, or `0` - */ -template )> +template constexpr uint8_t decode_digit(char c, bool* valid_flag) { if (c >= '0' && c <= '9') return c - '0'; + if constexpr (as_hex and std::is_integral_v) { + if (c >= 'a' && c <= 'f') return c - 'a' + 10; + if (c >= 'A' && c <= 'F') return c - 'A' + 10; + } *valid_flag = false; return 0; @@ -194,13 +178,13 @@ constexpr bool is_infinity(char const* begin, char const* end) * @return The parsed and converted value */ template -constexpr T parse_numeric(char const* begin, - char const* end, - parse_options_view const& opts, - T error_result = std::numeric_limits::quiet_NaN()) +__host__ __device__ std::optional parse_numeric(char const* begin, + char const* end, + parse_options_view const& opts) { T value{}; bool all_digits_valid = true; + constexpr bool as_hex = (base == 16); // Handle negative values if necessary int32_t sign = (*begin == '-') ? 
-1 : 1; @@ -223,7 +207,7 @@ constexpr T parse_numeric(char const* begin, } else if (base == 10 && (*begin == 'e' || *begin == 'E')) { break; } else if (*begin != opts.thousands && *begin != '+') { - value = (value * base) + decode_digit(*begin, &all_digits_valid); + value = (value * base) + decode_digit(*begin, &all_digits_valid); } ++begin; } @@ -237,7 +221,7 @@ constexpr T parse_numeric(char const* begin, break; } else if (*begin != opts.thousands && *begin != '+') { divisor /= base; - value += decode_digit(*begin, &all_digits_valid) * divisor; + value += decode_digit(*begin, &all_digits_valid) * divisor; } ++begin; } @@ -248,12 +232,12 @@ constexpr T parse_numeric(char const* begin, if (*begin == '-' || *begin == '+') { ++begin; } int32_t exponent = 0; while (begin < end) { - exponent = (exponent * 10) + decode_digit(*(begin++), &all_digits_valid); + exponent = (exponent * 10) + decode_digit(*(begin++), &all_digits_valid); } if (exponent != 0) { value *= exp10(double(exponent * exponent_sign)); } } } - if (!all_digits_valid) { return error_result; } + if (!all_digits_valid) { return std::optional{}; } return value * sign; } @@ -485,7 +469,7 @@ cudf::size_type count_all_from_set(host_span data, /** * @brief Checks whether the given character is a whitespace character. * - * @param[in] ch The character to check + * @param ch The character to check * * @return True if the input is whitespace, False otherwise */ @@ -503,9 +487,9 @@ __inline__ __device__ It skip_character(It const& it, char ch) /** * @brief Adjusts the range to ignore starting/trailing whitespace and quotation characters. 
* - * @param[in] begin Pointer to the first character in the parsing range - * @param[in] end pointer to the first character after the parsing range - * @param[in] quotechar The character used to denote quotes; '\0' if none + * @param begin Pointer to the first character in the parsing range + * @param end Pointer to the first character after the parsing range + * @param quotechar The character used to denote quotes; '\0' if none * * @return Trimmed range */ @@ -524,62 +508,47 @@ __inline__ __device__ std::pair trim_whitespaces_quote } /** - * @brief Decodes a numeric value base on templated cudf type T with specified - * base. + * @brief Adjusts the range to ignore starting/trailing whitespace characters. * - * @param[in] begin Beginning of the character string - * @param[in] end End of the character string - * @param opts The global parsing behavior options + * @param begin Pointer to the first character in the parsing range + * @param end Pointer to the first character after the parsing range * - * @return The parsed numeric value + * @return Trimmed range */ -template -__inline__ __device__ T decode_value(char const* begin, - char const* end, - parse_options_view const& opts) +__inline__ __device__ std::pair trim_whitespaces(char const* begin, + char const* end) { - return cudf::io::parse_numeric(begin, end, opts); + auto not_whitespace = [] __device__(auto c) { return !is_whitespace(c); }; + + auto const trim_begin = thrust::find_if(thrust::seq, begin, end, not_whitespace); + auto const trim_end = thrust::find_if(thrust::seq, + thrust::make_reverse_iterator(end), + thrust::make_reverse_iterator(trim_begin), + not_whitespace); + + return {trim_begin, trim_end.base()}; } /** - * @brief Decodes a numeric value base on templated cudf type T + * @brief Adjusts the range to ignore starting/trailing quotation characters. 
* - * @param[in] begin Beginning of the character string - * @param[in] end End of the character string - * @param opts The global parsing behavior options + * @param begin Pointer to the first character in the parsing range + * @param end Pointer to the first character after the parsing range + * @param quotechar The character used to denote quotes. Provide '\0' if no quotes should be + * trimmed. * - * @return The parsed numeric value + * @return Trimmed range */ -template () and !cudf::is_duration())> -__inline__ __device__ T decode_value(char const* begin, - char const* end, - parse_options_view const& opts) -{ - return cudf::io::parse_numeric(begin, end, opts); -} - -template ())> -__inline__ __device__ T decode_value(char const* begin, - char const* end, - parse_options_view const& opts) -{ - // If this is a string value, remove quotes - if ((thrust::distance(begin, end) >= 2 && *begin == '\"' && *thrust::prev(end) == '\"')) { - thrust::advance(begin, 1); - thrust::advance(end, -1); - } - return to_timestamp(begin, end, opts.dayfirst); -} - -template ())> -__inline__ __device__ T decode_value(char const* begin, char const* end, parse_options_view const&) +__inline__ __device__ std::pair trim_quotes(char const* begin, + char const* end, + char quotechar) { - // If this is a string value, remove quotes - if ((thrust::distance(begin, end) >= 2 && *begin == '\"' && *thrust::prev(end) == '\"')) { + if ((thrust::distance(begin, end) >= 2 && *begin == quotechar && + *thrust::prev(end) == quotechar)) { thrust::advance(begin, 1); thrust::advance(end, -1); } - return to_duration(begin, end); + return {begin, end}; } struct ConvertFunctor { @@ -601,13 +570,15 @@ struct ConvertFunctor { parse_options_view const& opts, bool as_hex = false) { - static_cast(out_buffer)[row] = [as_hex, &opts, begin, end]() -> T { + auto const value = [as_hex, &opts, begin, end]() -> std::optional { // Check for user-specified true/false values auto const field_len = static_cast(end - begin); 
if (serialized_trie_contains(opts.trie_true, {begin, field_len})) { return 1; } if (serialized_trie_contains(opts.trie_false, {begin, field_len})) { return 0; } - return as_hex ? decode_value(begin, end, opts) : decode_value(begin, end, opts); + return as_hex ? cudf::io::parse_numeric(begin, end, opts) + : cudf::io::parse_numeric(begin, end, opts); }(); + static_cast(out_buffer)[row] = value.value_or(std::numeric_limits::quiet_NaN()); return true; } @@ -626,6 +597,7 @@ struct ConvertFunctor { parse_options_view const& opts, bool as_hex) { + // TODO decide what's invalid input and update parsing functions static_cast*>(out_buffer)[row] = [&opts, output_type, begin, end]() -> device_storage_type_t { return strings::detail::parse_decimal>( @@ -647,13 +619,18 @@ struct ConvertFunctor { parse_options_view const& opts, bool as_hex) { - static_cast(out_buffer)[row] = [&opts, begin, end]() { + auto const value = [&opts, begin, end]() -> std::optional { // Check for user-specified true/false values auto const field_len = static_cast(end - begin); - if (serialized_trie_contains(opts.trie_true, {begin, field_len})) { return true; } - if (serialized_trie_contains(opts.trie_false, {begin, field_len})) { return false; } - return decode_value(begin, end, opts); + if (serialized_trie_contains(opts.trie_true, {begin, field_len})) { + return static_cast(true); + } + if (serialized_trie_contains(opts.trie_false, {begin, field_len})) { + return static_cast(false); + } + return cudf::io::parse_numeric(begin, end, opts); }(); + static_cast(out_buffer)[row] = value.value_or(std::numeric_limits::quiet_NaN()); return true; } @@ -671,10 +648,20 @@ struct ConvertFunctor { parse_options_view const& opts, bool as_hex) { - T const value = decode_value(begin, end, opts); - static_cast(out_buffer)[row] = value; + auto const value = [&opts, begin, end]() -> std::optional { + // Check for user-specified true/false values + auto const field_len = static_cast(end - begin); + if 
(serialized_trie_contains(opts.trie_true, {begin, field_len})) { + return static_cast(true); + } + if (serialized_trie_contains(opts.trie_false, {begin, field_len})) { + return static_cast(false); + } + return cudf::io::parse_numeric(begin, end, opts); + }(); + static_cast(out_buffer)[row] = value.value_or(std::numeric_limits::quiet_NaN()); - return !std::isnan(value); + return value.has_value() and !std::isnan(*value); } /** @@ -691,12 +678,15 @@ struct ConvertFunctor { parse_options_view const& opts, bool as_hex) { - if constexpr (cudf::is_timestamp() or cudf::is_duration()) { - static_cast(out_buffer)[row] = decode_value(begin, end, opts); - return true; + // TODO decide what's invalid input and update parsing functions + if constexpr (cudf::is_timestamp()) { + static_cast(out_buffer)[row] = to_timestamp(begin, end, opts.dayfirst); + } else if constexpr (cudf::is_duration()) { + static_cast(out_buffer)[row] = to_duration(begin, end); } else { return false; } + return true; } }; diff --git a/cpp/src/join/conditional_join.cu b/cpp/src/join/conditional_join.cu index f0b66559799..cf1476d8bcc 100644 --- a/cpp/src/join/conditional_join.cu +++ b/cpp/src/join/conditional_join.cu @@ -298,7 +298,7 @@ conditional_inner_join(table_view const& left, binary_predicate, detail::join_kind::INNER_JOIN, output_size, - cudf::default_stream_value, + cudf::get_default_stream(), mr); } @@ -316,7 +316,7 @@ conditional_left_join(table_view const& left, binary_predicate, detail::join_kind::LEFT_JOIN, output_size, - cudf::default_stream_value, + cudf::get_default_stream(), mr); } @@ -333,7 +333,7 @@ conditional_full_join(table_view const& left, binary_predicate, detail::join_kind::FULL_JOIN, {}, - cudf::default_stream_value, + cudf::get_default_stream(), mr); } @@ -350,7 +350,7 @@ std::unique_ptr> conditional_left_semi_join( binary_predicate, detail::join_kind::LEFT_SEMI_JOIN, output_size, - cudf::default_stream_value, + cudf::get_default_stream(), mr) .first); } @@ -368,7 +368,7 @@ 
std::unique_ptr> conditional_left_anti_join( binary_predicate, detail::join_kind::LEFT_ANTI_JOIN, output_size, - cudf::default_stream_value, + cudf::get_default_stream(), mr) .first); } @@ -380,7 +380,7 @@ std::size_t conditional_inner_join_size(table_view const& left, { CUDF_FUNC_RANGE(); return detail::compute_conditional_join_output_size( - left, right, binary_predicate, detail::join_kind::INNER_JOIN, cudf::default_stream_value, mr); + left, right, binary_predicate, detail::join_kind::INNER_JOIN, cudf::get_default_stream(), mr); } std::size_t conditional_left_join_size(table_view const& left, @@ -390,7 +390,7 @@ std::size_t conditional_left_join_size(table_view const& left, { CUDF_FUNC_RANGE(); return detail::compute_conditional_join_output_size( - left, right, binary_predicate, detail::join_kind::LEFT_JOIN, cudf::default_stream_value, mr); + left, right, binary_predicate, detail::join_kind::LEFT_JOIN, cudf::get_default_stream(), mr); } std::size_t conditional_left_semi_join_size(table_view const& left, @@ -403,7 +403,7 @@ std::size_t conditional_left_semi_join_size(table_view const& left, right, binary_predicate, detail::join_kind::LEFT_SEMI_JOIN, - cudf::default_stream_value, + cudf::get_default_stream(), mr)); } @@ -417,7 +417,7 @@ std::size_t conditional_left_anti_join_size(table_view const& left, right, binary_predicate, detail::join_kind::LEFT_ANTI_JOIN, - cudf::default_stream_value, + cudf::get_default_stream(), mr)); } diff --git a/cpp/src/join/conditional_join.hpp b/cpp/src/join/conditional_join.hpp index 6de2664b5f6..7c329cd8e17 100644 --- a/cpp/src/join/conditional_join.hpp +++ b/cpp/src/join/conditional_join.hpp @@ -48,7 +48,7 @@ conditional_join(table_view const& left, ast::expression const& binary_predicate, join_kind JoinKind, std::optional output_size = {}, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = 
rmm::mr::get_current_device_resource()); /** @@ -68,7 +68,7 @@ std::size_t compute_conditional_join_output_size( table_view const& right, ast::expression const& binary_predicate, join_kind JoinKind, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/src/join/cross_join.cu b/cpp/src/join/cross_join.cu index 3eb9f1b1198..7358726d69d 100644 --- a/cpp/src/join/cross_join.cu +++ b/cpp/src/join/cross_join.cu @@ -78,7 +78,7 @@ std::unique_ptr cross_join(cudf::table_view const& left, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::cross_join(left, right, cudf::default_stream_value, mr); + return detail::cross_join(left, right, cudf::get_default_stream(), mr); } } // namespace cudf diff --git a/cpp/src/join/join.cu b/cpp/src/join/join.cu index bb8fc07c2d7..dbc543f4dcd 100644 --- a/cpp/src/join/join.cu +++ b/cpp/src/join/join.cu @@ -113,7 +113,7 @@ inner_join(table_view const& left, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::inner_join(left, right, compare_nulls, cudf::default_stream_value, mr); + return detail::inner_join(left, right, compare_nulls, cudf::get_default_stream(), mr); } std::pair>, @@ -124,7 +124,7 @@ left_join(table_view const& left, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::left_join(left, right, compare_nulls, cudf::default_stream_value, mr); + return detail::left_join(left, right, compare_nulls, cudf::get_default_stream(), mr); } std::pair>, @@ -135,7 +135,7 @@ full_join(table_view const& left, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::full_join(left, right, compare_nulls, cudf::default_stream_value, mr); + return detail::full_join(left, right, compare_nulls, cudf::get_default_stream(), mr); } } // namespace cudf diff --git a/cpp/src/join/mixed_join.cu 
b/cpp/src/join/mixed_join.cu index ec2dacaca5b..4cedfca218a 100644 --- a/cpp/src/join/mixed_join.cu +++ b/cpp/src/join/mixed_join.cu @@ -458,7 +458,7 @@ mixed_inner_join( compare_nulls, detail::join_kind::INNER_JOIN, output_size_data, - cudf::default_stream_value, + cudf::get_default_stream(), mr); } @@ -479,7 +479,7 @@ std::pair>> mixed_in binary_predicate, compare_nulls, detail::join_kind::INNER_JOIN, - cudf::default_stream_value, + cudf::get_default_stream(), mr); } @@ -504,7 +504,7 @@ mixed_left_join( compare_nulls, detail::join_kind::LEFT_JOIN, output_size_data, - cudf::default_stream_value, + cudf::get_default_stream(), mr); } @@ -525,7 +525,7 @@ std::pair>> mixed_le binary_predicate, compare_nulls, detail::join_kind::LEFT_JOIN, - cudf::default_stream_value, + cudf::get_default_stream(), mr); } @@ -550,7 +550,7 @@ mixed_full_join( compare_nulls, detail::join_kind::FULL_JOIN, output_size_data, - cudf::default_stream_value, + cudf::get_default_stream(), mr); } diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu index a9897f0f40e..6ebf3702256 100644 --- a/cpp/src/join/mixed_join_semi.cu +++ b/cpp/src/join/mixed_join_semi.cu @@ -503,7 +503,7 @@ std::pair>> mixed_le binary_predicate, compare_nulls, detail::join_kind::LEFT_SEMI_JOIN, - cudf::default_stream_value, + cudf::get_default_stream(), mr); } @@ -526,7 +526,7 @@ std::unique_ptr> mixed_left_semi_join( compare_nulls, detail::join_kind::LEFT_SEMI_JOIN, output_size_data, - cudf::default_stream_value, + cudf::get_default_stream(), mr); } @@ -547,7 +547,7 @@ std::pair>> mixed_le binary_predicate, compare_nulls, detail::join_kind::LEFT_ANTI_JOIN, - cudf::default_stream_value, + cudf::get_default_stream(), mr); } @@ -570,7 +570,7 @@ std::unique_ptr> mixed_left_anti_join( compare_nulls, detail::join_kind::LEFT_ANTI_JOIN, output_size_data, - cudf::default_stream_value, + cudf::get_default_stream(), mr); } diff --git a/cpp/src/join/semi_join.cu b/cpp/src/join/semi_join.cu index 
87bac002f53..cc523b2ac7f 100644 --- a/cpp/src/join/semi_join.cu +++ b/cpp/src/join/semi_join.cu @@ -95,7 +95,7 @@ std::unique_ptr> left_semi_join( { CUDF_FUNC_RANGE(); return detail::left_semi_anti_join( - detail::join_kind::LEFT_SEMI_JOIN, left, right, compare_nulls, cudf::default_stream_value, mr); + detail::join_kind::LEFT_SEMI_JOIN, left, right, compare_nulls, cudf::get_default_stream(), mr); } std::unique_ptr> left_anti_join( @@ -106,7 +106,7 @@ std::unique_ptr> left_anti_join( { CUDF_FUNC_RANGE(); return detail::left_semi_anti_join( - detail::join_kind::LEFT_ANTI_JOIN, left, right, compare_nulls, cudf::default_stream_value, mr); + detail::join_kind::LEFT_ANTI_JOIN, left, right, compare_nulls, cudf::get_default_stream(), mr); } } // namespace cudf diff --git a/cpp/src/labeling/label_bins.cu b/cpp/src/labeling/label_bins.cu index f5e35fc842f..4c3469c679e 100644 --- a/cpp/src/labeling/label_bins.cu +++ b/cpp/src/labeling/label_bins.cu @@ -244,7 +244,7 @@ std::unique_ptr label_bins(column_view const& input, left_inclusive, right_edges, right_inclusive, - cudf::default_stream_value, + cudf::get_default_stream(), mr); } } // namespace cudf diff --git a/cpp/src/lists/combine/concatenate_list_elements.cu b/cpp/src/lists/combine/concatenate_list_elements.cu index c107bad018d..496d9ee670a 100644 --- a/cpp/src/lists/combine/concatenate_list_elements.cu +++ b/cpp/src/lists/combine/concatenate_list_elements.cu @@ -287,7 +287,7 @@ std::unique_ptr concatenate_list_elements(column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::concatenate_list_elements(input, null_policy, cudf::default_stream_value, mr); + return detail::concatenate_list_elements(input, null_policy, cudf::get_default_stream(), mr); } } // namespace lists diff --git a/cpp/src/lists/combine/concatenate_rows.cu b/cpp/src/lists/combine/concatenate_rows.cu index 4364470407f..8b006548391 100644 --- a/cpp/src/lists/combine/concatenate_rows.cu +++ 
b/cpp/src/lists/combine/concatenate_rows.cu @@ -245,7 +245,8 @@ std::unique_ptr concatenate_rows(table_view const& input, row_null_counts = row_null_counts.data()] __device__(size_t i) -> size_type { auto const row_index = i % num_rows; return row_null_counts[row_index] != num_columns; - }); + }, + stream); } // NULLIFY_OUTPUT_ROW. Output row is nullfied if any input row is null return cudf::detail::valid_if( @@ -255,7 +256,8 @@ std::unique_ptr concatenate_rows(table_view const& input, row_null_counts = row_null_counts.data()] __device__(size_t i) -> size_type { auto const row_index = i % num_rows; return row_null_counts[row_index] == 0; - }); + }, + stream); }(); concat->set_null_mask(std::move(null_mask), null_count); } @@ -307,7 +309,7 @@ std::unique_ptr concatenate_rows(table_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::concatenate_rows(input, null_policy, cudf::default_stream_value, mr); + return detail::concatenate_rows(input, null_policy, cudf::get_default_stream(), mr); } } // namespace lists diff --git a/cpp/src/lists/contains.cu b/cpp/src/lists/contains.cu index 3a52426c16a..0142e736fd0 100644 --- a/cpp/src/lists/contains.cu +++ b/cpp/src/lists/contains.cu @@ -495,7 +495,7 @@ std::unique_ptr contains(lists_column_view const& lists, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::contains(lists, search_key, cudf::default_stream_value, mr); + return detail::contains(lists, search_key, cudf::get_default_stream(), mr); } std::unique_ptr contains(lists_column_view const& lists, @@ -503,14 +503,14 @@ std::unique_ptr contains(lists_column_view const& lists, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::contains(lists, search_keys, cudf::default_stream_value, mr); + return detail::contains(lists, search_keys, cudf::get_default_stream(), mr); } std::unique_ptr contains_nulls(lists_column_view const& lists, rmm::mr::device_memory_resource* mr) { 
CUDF_FUNC_RANGE(); - return detail::contains_nulls(lists, cudf::default_stream_value, mr); + return detail::contains_nulls(lists, cudf::get_default_stream(), mr); } std::unique_ptr index_of(lists_column_view const& lists, @@ -519,7 +519,7 @@ std::unique_ptr index_of(lists_column_view const& lists, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::index_of(lists, search_key, find_option, cudf::default_stream_value, mr); + return detail::index_of(lists, search_key, find_option, cudf::get_default_stream(), mr); } std::unique_ptr index_of(lists_column_view const& lists, @@ -528,7 +528,7 @@ std::unique_ptr index_of(lists_column_view const& lists, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::index_of(lists, search_keys, find_option, cudf::default_stream_value, mr); + return detail::index_of(lists, search_keys, find_option, cudf::get_default_stream(), mr); } } // namespace cudf::lists diff --git a/cpp/src/lists/copying/gather.cu b/cpp/src/lists/copying/gather.cu index ae9fab4dda2..eda46e05f18 100644 --- a/cpp/src/lists/copying/gather.cu +++ b/cpp/src/lists/copying/gather.cu @@ -100,36 +100,17 @@ std::unique_ptr gather_list_leaf(column_view const& column, size_type gather_map_size = gd.gather_map_size; // call the normal gather - auto leaf_column = cudf::type_dispatcher( - column.type(), - cudf::detail::column_gatherer{}, - column, - gather_map_begin, - gather_map_begin + gather_map_size, - // note : we don't need to bother checking for out-of-bounds here since - // our inputs at this stage aren't coming from the user. - false, - stream, - mr); - - // the column_gatherer doesn't create the null mask because it expects - // that will be done in the gather_bitmask() step. however, gather_bitmask() - // only happens at the root level, and by definition this column is a - // leaf. so we have to generate the bitmask ourselves. 
- // TODO : it might make sense to expose a gather() function that takes a column_view and - // returns a column that does this work correctly. - size_type null_count = column.null_count(); - if (null_count > 0) { - auto list_cdv = column_device_view::create(column, stream); - auto validity = cudf::detail::valid_if( - gather_map_begin, - gather_map_begin + gd.gather_map_size, - [cdv = *list_cdv] __device__(int index) { return cdv.is_valid(index) ? true : false; }, - stream, - mr); - - leaf_column->set_null_mask(std::move(validity.first), validity.second); - } + // note : we don't need to bother checking for out-of-bounds here since + // our inputs at this stage aren't coming from the user. + auto gather_table = cudf::detail::gather(cudf::table_view({column}), + gather_map_begin, + gather_map_begin + gather_map_size, + out_of_bounds_policy::DONT_CHECK, + stream, + mr); + auto leaf_column = std::move(gather_table->release().front()); + + if (column.null_count() == 0) { leaf_column->set_null_mask(rmm::device_buffer{}, 0); } return leaf_column; } diff --git a/cpp/src/lists/copying/scatter_helper.cu b/cpp/src/lists/copying/scatter_helper.cu index cbb3aec76c5..ca7ca2f6590 100644 --- a/cpp/src/lists/copying/scatter_helper.cu +++ b/cpp/src/lists/copying/scatter_helper.cu @@ -185,7 +185,7 @@ struct list_child_constructor { mr); thrust::transform( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(child_column->size()), child_column->mutable_view().begin(), @@ -237,7 +237,7 @@ struct list_child_constructor { auto const null_string_view = string_view{nullptr, 0}; // placeholder for factory function thrust::transform( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(string_views.size()), string_views.begin(), @@ -304,7 +304,7 @@ struct list_child_constructor { // For instance, if a parent list_device_view has 3 elements, 
it should have 3 corresponding // child list_device_view instances. thrust::transform( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(child_list_views.size()), child_list_views.begin(), diff --git a/cpp/src/lists/copying/segmented_gather.cu b/cpp/src/lists/copying/segmented_gather.cu index db37a82ba8e..2c12e09bcd9 100644 --- a/cpp/src/lists/copying/segmented_gather.cu +++ b/cpp/src/lists/copying/segmented_gather.cu @@ -120,7 +120,7 @@ std::unique_ptr segmented_gather(lists_column_view const& source_column, { CUDF_FUNC_RANGE(); return detail::segmented_gather( - source_column, gather_map_list, bounds_policy, cudf::default_stream_value, mr); + source_column, gather_map_list, bounds_policy, cudf::get_default_stream(), mr); } } // namespace lists diff --git a/cpp/src/lists/count_elements.cu b/cpp/src/lists/count_elements.cu index 68748dfde3f..f8e7b4c6126 100644 --- a/cpp/src/lists/count_elements.cu +++ b/cpp/src/lists/count_elements.cu @@ -76,7 +76,7 @@ std::unique_ptr count_elements(lists_column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::count_elements(input, cudf::default_stream_value, mr); + return detail::count_elements(input, cudf::get_default_stream(), mr); } } // namespace lists diff --git a/cpp/src/lists/dremel.cu b/cpp/src/lists/dremel.cu index cb9cd4293b5..66134138a5c 100644 --- a/cpp/src/lists/dremel.cu +++ b/cpp/src/lists/dremel.cu @@ -192,8 +192,7 @@ dremel_data get_dremel_data(column_view h_col, } if (curr_col.type().id() == type_id::LIST) { auto child = curr_col.child(lists_column_view::child_column_index); - if ((child.type().id() == type_id::INT8 || child.type().id() == type_id::UINT8) && - output_as_byte_array) { + if (output_as_byte_array && child.type().id() == type_id::UINT8) { // consider this the bottom break; } @@ -225,6 +224,7 @@ dremel_data get_dremel_data(column_view h_col, cudf::detail::device_single_thread( 
[offset_at_level = d_column_offsets.data(), end_idx_at_level = d_column_ends.data(), + level_max = d_column_offsets.size(), col = *d_col] __device__() { auto curr_col = col; size_type off = curr_col.offset(); @@ -239,9 +239,11 @@ dremel_data get_dremel_data(column_view h_col, if (curr_col.type().id() == type_id::LIST) { off = curr_col.child(lists_column_view::offsets_column_index).element(off); end = curr_col.child(lists_column_view::offsets_column_index).element(end); - offset_at_level[level] = off; - end_idx_at_level[level] = end; - ++level; + if (level < level_max) { + offset_at_level[level] = off; + end_idx_at_level[level] = end; + ++level; + } curr_col = curr_col.child(lists_column_view::child_column_index); } else { curr_col = curr_col.child(0); diff --git a/cpp/src/lists/explode.cu b/cpp/src/lists/explode.cu index 873b0fe408d..4db3254f201 100644 --- a/cpp/src/lists/explode.cu +++ b/cpp/src/lists/explode.cu @@ -299,7 +299,7 @@ std::unique_ptr
explode(table_view const& input_table, CUDF_FUNC_RANGE(); CUDF_EXPECTS(input_table.column(explode_column_idx).type().id() == type_id::LIST, "Unsupported non-list column"); - return detail::explode(input_table, explode_column_idx, cudf::default_stream_value, mr); + return detail::explode(input_table, explode_column_idx, cudf::get_default_stream(), mr); } /** @@ -312,7 +312,7 @@ std::unique_ptr
explode_position(table_view const& input_table, CUDF_FUNC_RANGE(); CUDF_EXPECTS(input_table.column(explode_column_idx).type().id() == type_id::LIST, "Unsupported non-list column"); - return detail::explode_position(input_table, explode_column_idx, cudf::default_stream_value, mr); + return detail::explode_position(input_table, explode_column_idx, cudf::get_default_stream(), mr); } /** @@ -326,7 +326,7 @@ std::unique_ptr
explode_outer(table_view const& input_table, CUDF_EXPECTS(input_table.column(explode_column_idx).type().id() == type_id::LIST, "Unsupported non-list column"); return detail::explode_outer( - input_table, explode_column_idx, false, cudf::default_stream_value, mr); + input_table, explode_column_idx, false, cudf::get_default_stream(), mr); } /** @@ -341,7 +341,7 @@ std::unique_ptr
explode_outer_position(table_view const& input_table, CUDF_EXPECTS(input_table.column(explode_column_idx).type().id() == type_id::LIST, "Unsupported non-list column"); return detail::explode_outer( - input_table, explode_column_idx, true, cudf::default_stream_value, mr); + input_table, explode_column_idx, true, cudf::get_default_stream(), mr); } } // namespace cudf diff --git a/cpp/src/lists/extract.cu b/cpp/src/lists/extract.cu index bc04bad7c0c..d1807c2c5ac 100644 --- a/cpp/src/lists/extract.cu +++ b/cpp/src/lists/extract.cu @@ -171,7 +171,7 @@ std::unique_ptr extract_list_element(lists_column_view const& lists_colu rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::extract_list_element(lists_column, index, cudf::default_stream_value, mr); + return detail::extract_list_element(lists_column, index, cudf::get_default_stream(), mr); } /** @@ -186,7 +186,7 @@ std::unique_ptr extract_list_element(lists_column_view const& lists_colu CUDF_FUNC_RANGE(); CUDF_EXPECTS(indices.size() == lists_column.size(), "Index column must have as many elements as lists column."); - return detail::extract_list_element(lists_column, indices, cudf::default_stream_value, mr); + return detail::extract_list_element(lists_column, indices, cudf::get_default_stream(), mr); } } // namespace lists diff --git a/cpp/src/lists/segmented_sort.cu b/cpp/src/lists/segmented_sort.cu index ea35977e8e4..260636a61cf 100644 --- a/cpp/src/lists/segmented_sort.cu +++ b/cpp/src/lists/segmented_sort.cu @@ -15,12 +15,8 @@ */ #include -#include #include -#include #include -#include -#include #include #include #include @@ -28,231 +24,27 @@ #include #include #include -#include #include -#include -#include #include -#include -#include #include -#include - namespace cudf { namespace lists { namespace detail { -struct SegmentedSortColumn { - /** - * @brief Compile time check for allowing radix sort for column type. 
- * - * Floating point is not included here because of the special handling of NaNs. - */ - template - static constexpr bool is_radix_sort_supported() - { - return std::is_integral(); - } - - template - void SortPairsAscending(KeyT const* keys_in, - KeyT* keys_out, - ValueT const* values_in, - ValueT* values_out, - int num_items, - int num_segments, - OffsetIteratorT begin_offsets, - OffsetIteratorT end_offsets, - rmm::cuda_stream_view stream) - { - rmm::device_buffer d_temp_storage; - size_t temp_storage_bytes = 0; - cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage.data(), - temp_storage_bytes, - keys_in, - keys_out, - values_in, - values_out, - num_items, - num_segments, - begin_offsets, - end_offsets, - 0, - sizeof(KeyT) * 8, - stream.value()); - d_temp_storage = rmm::device_buffer{temp_storage_bytes, stream}; - - cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage.data(), - temp_storage_bytes, - keys_in, - keys_out, - values_in, - values_out, - num_items, - num_segments, - begin_offsets, - end_offsets, - 0, - sizeof(KeyT) * 8, - stream.value()); - } - - template - void SortPairsDescending(KeyT const* keys_in, - KeyT* keys_out, - ValueT const* values_in, - ValueT* values_out, - int num_items, - int num_segments, - OffsetIteratorT begin_offsets, - OffsetIteratorT end_offsets, - rmm::cuda_stream_view stream) - { - rmm::device_buffer d_temp_storage; - size_t temp_storage_bytes = 0; - cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage.data(), - temp_storage_bytes, - keys_in, - keys_out, - values_in, - values_out, - num_items, - num_segments, - begin_offsets, - end_offsets, - 0, - sizeof(KeyT) * 8, - stream.value()); - d_temp_storage = rmm::device_buffer{temp_storage_bytes, stream}; - - cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage.data(), - temp_storage_bytes, - keys_in, - keys_out, - values_in, - values_out, - num_items, - num_segments, - begin_offsets, - end_offsets, - 0, - sizeof(KeyT) * 8, - stream.value()); - } - - 
template - std::enable_if_t(), std::unique_ptr> operator()( - column_view const& child, - column_view const& segment_offsets, - order column_order, - null_order null_precedence, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - { - auto child_table = segmented_sort_by_key(table_view{{child}}, - table_view{{child}}, - segment_offsets, - {column_order}, - {null_precedence}, - stream, - mr); - return std::move(child_table->release().front()); - } - - template - std::enable_if_t(), std::unique_ptr> operator()( - column_view const& child, - column_view const& offsets, - order column_order, - null_order null_precedence, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - { - // the average list size at which to prefer radixsort: - constexpr cudf::size_type MIN_AVG_LIST_SIZE_FOR_RADIXSORT{100}; - - if ((child.size() / offsets.size()) < MIN_AVG_LIST_SIZE_FOR_RADIXSORT) { - auto child_table = segmented_sort_by_key(table_view{{child}}, - table_view{{child}}, - offsets, - {column_order}, - {null_precedence}, - stream, - mr); - return std::move(child_table->release().front()); - } - - auto output = - cudf::detail::allocate_like(child, child.size(), mask_allocation_policy::NEVER, stream, mr); - mutable_column_view mutable_output_view = output->mutable_view(); - - auto keys = [&]() { - if (child.nullable()) { - rmm::device_uvector keys(child.size(), stream); - auto const null_replace_T = null_precedence == null_order::AFTER - ? 
std::numeric_limits::max() - : std::numeric_limits::min(); - - auto device_child = column_device_view::create(child, stream); - auto keys_in = - cudf::detail::make_null_replacement_iterator(*device_child, null_replace_T); - thrust::copy_n(rmm::exec_policy(stream), keys_in, child.size(), keys.begin()); - return keys; - } - return rmm::device_uvector{0, stream}; - }(); +namespace { - std::unique_ptr sorted_indices = cudf::make_numeric_column( - data_type(type_to_id()), child.size(), mask_state::UNALLOCATED, stream, mr); - mutable_column_view mutable_indices_view = sorted_indices->mutable_view(); - thrust::sequence(rmm::exec_policy(stream), - mutable_indices_view.begin(), - mutable_indices_view.end(), - 0); - - if (column_order == order::ASCENDING) - SortPairsAscending(child.nullable() ? keys.data() : child.begin(), - mutable_output_view.begin(), - mutable_indices_view.begin(), - mutable_indices_view.begin(), - child.size(), - offsets.size() - 1, - offsets.begin(), - offsets.begin() + 1, - stream); - else - SortPairsDescending(child.nullable() ? keys.data() : child.begin(), - mutable_output_view.begin(), - mutable_indices_view.begin(), - mutable_indices_view.begin(), - child.size(), - offsets.size() - 1, - offsets.begin(), - offsets.begin() + 1, - stream); - std::vector> output_cols; - output_cols.push_back(std::move(output)); - // rearrange the null_mask. - cudf::detail::gather_bitmask(cudf::table_view{{child}}, - mutable_indices_view.begin(), - output_cols, - cudf::detail::gather_bitmask_op::DONT_CHECK, - stream, - mr); - return std::move(output_cols.front()); - } -}; - -std::unique_ptr sort_lists(lists_column_view const& input, - order column_order, - null_order null_precedence, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +/** + * @brief Create output offsets for segmented sort + * + * This creates a normalized set of offsets from the offsets child column of the input. 
+ */ +std::unique_ptr build_output_offsets(lists_column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - if (input.is_empty()) return empty_like(input.parent()); auto output_offset = make_numeric_column( input.offsets().type(), input.size() + 1, mask_state::UNALLOCATED, stream, mr); thrust::transform(rmm::exec_policy(stream), @@ -262,25 +54,35 @@ std::unique_ptr sort_lists(lists_column_view const& input, [first = input.offsets_begin()] __device__(auto offset_index) { return offset_index - *first; }); - // for numeric columns, calls Faster segmented radix sort path - // for non-numeric columns, calls segmented_sort_by_key. - auto output_child = type_dispatcher(input.child().type(), - SegmentedSortColumn{}, - input.get_sliced_child(stream), - output_offset->view(), - column_order, - null_precedence, - stream, - mr); + return output_offset; +} + +} // namespace + +std::unique_ptr sort_lists(lists_column_view const& input, + order column_order, + null_order null_precedence, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + if (input.is_empty()) return empty_like(input.parent()); + + auto output_offset = build_output_offsets(input, stream, mr); + auto const child = input.get_sliced_child(stream); - auto null_mask = cudf::detail::copy_bitmask(input.parent(), stream, mr); + auto const sorted_child_table = segmented_sort_by_key(table_view{{child}}, + table_view{{child}}, + output_offset->view(), + {column_order}, + {null_precedence}, + stream, + mr); - // Assemble list column & return return make_lists_column(input.size(), std::move(output_offset), - std::move(output_child), + std::move(sorted_child_table->release().front()), input.null_count(), - std::move(null_mask), + cudf::detail::copy_bitmask(input.parent(), stream, mr), stream, mr); } @@ -293,17 +95,9 @@ std::unique_ptr stable_sort_lists(lists_column_view const& input, { if (input.is_empty()) { return empty_like(input.parent()); } - auto 
output_offset = make_numeric_column( - input.offsets().type(), input.size() + 1, mask_state::UNALLOCATED, stream, mr); - thrust::transform(rmm::exec_policy(stream), - input.offsets_begin(), - input.offsets_end(), - output_offset->mutable_view().template begin(), - [first = input.offsets_begin()] __device__(auto offset_index) { - return offset_index - *first; - }); + auto output_offset = build_output_offsets(input, stream, mr); + auto const child = input.get_sliced_child(stream); - auto const child = input.get_sliced_child(stream); auto const sorted_child_table = stable_segmented_sort_by_key(table_view{{child}}, table_view{{child}}, output_offset->view(), @@ -328,7 +122,7 @@ std::unique_ptr sort_lists(lists_column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::sort_lists(input, column_order, null_precedence, cudf::default_stream_value, mr); + return detail::sort_lists(input, column_order, null_precedence, cudf::get_default_stream(), mr); } std::unique_ptr stable_sort_lists(lists_column_view const& input, @@ -338,7 +132,7 @@ std::unique_ptr stable_sort_lists(lists_column_view const& input, { CUDF_FUNC_RANGE(); return detail::stable_sort_lists( - input, column_order, null_precedence, cudf::default_stream_value, mr); + input, column_order, null_precedence, cudf::get_default_stream(), mr); } } // namespace lists diff --git a/cpp/src/lists/sequences.cu b/cpp/src/lists/sequences.cu index 6c2b0b1a785..bb0e669339a 100644 --- a/cpp/src/lists/sequences.cu +++ b/cpp/src/lists/sequences.cu @@ -214,7 +214,7 @@ std::unique_ptr sequences(column_view const& starts, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::sequences(starts, sizes, cudf::default_stream_value, mr); + return detail::sequences(starts, sizes, cudf::get_default_stream(), mr); } std::unique_ptr sequences(column_view const& starts, @@ -223,7 +223,7 @@ std::unique_ptr sequences(column_view const& starts, rmm::mr::device_memory_resource* mr) { 
CUDF_FUNC_RANGE(); - return detail::sequences(starts, steps, sizes, cudf::default_stream_value, mr); + return detail::sequences(starts, steps, sizes, cudf::get_default_stream(), mr); } } // namespace cudf::lists diff --git a/cpp/src/lists/set_operations.cu b/cpp/src/lists/set_operations.cu index 00cdfcf7ff1..a31b7c6e5be 100644 --- a/cpp/src/lists/set_operations.cu +++ b/cpp/src/lists/set_operations.cu @@ -17,7 +17,7 @@ #include "utilities.hpp" #include -#include +#include #include #include #include @@ -176,9 +176,8 @@ std::unique_ptr intersect_distinct(lists_column_view const& lhs, stream, mr); - return null_count == 0 - ? std::move(output) - : cudf::detail::purge_nonempty_nulls(lists_column_view{output->view()}, stream, mr); + return null_count == 0 ? std::move(output) + : cudf::detail::purge_nonempty_nulls(output->view(), stream, mr); } std::unique_ptr union_distinct(lists_column_view const& lhs, @@ -253,9 +252,8 @@ std::unique_ptr difference_distinct(lists_column_view const& lhs, stream, mr); - return null_count == 0 - ? std::move(output) - : cudf::detail::purge_nonempty_nulls(lists_column_view{output->view()}, stream, mr); + return null_count == 0 ? 
std::move(output) + : cudf::detail::purge_nonempty_nulls(output->view(), stream, mr); } } // namespace detail @@ -267,7 +265,7 @@ std::unique_ptr have_overlap(lists_column_view const& lhs, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::have_overlap(lhs, rhs, nulls_equal, nans_equal, cudf::default_stream_value, mr); + return detail::have_overlap(lhs, rhs, nulls_equal, nans_equal, cudf::get_default_stream(), mr); } std::unique_ptr intersect_distinct(lists_column_view const& lhs, @@ -278,7 +276,7 @@ std::unique_ptr intersect_distinct(lists_column_view const& lhs, { CUDF_FUNC_RANGE(); return detail::intersect_distinct( - lhs, rhs, nulls_equal, nans_equal, cudf::default_stream_value, mr); + lhs, rhs, nulls_equal, nans_equal, cudf::get_default_stream(), mr); } std::unique_ptr union_distinct(lists_column_view const& lhs, @@ -288,7 +286,7 @@ std::unique_ptr union_distinct(lists_column_view const& lhs, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::union_distinct(lhs, rhs, nulls_equal, nans_equal, cudf::default_stream_value, mr); + return detail::union_distinct(lhs, rhs, nulls_equal, nans_equal, cudf::get_default_stream(), mr); } std::unique_ptr difference_distinct(lists_column_view const& lhs, @@ -299,7 +297,7 @@ std::unique_ptr difference_distinct(lists_column_view const& lhs, { CUDF_FUNC_RANGE(); return detail::difference_distinct( - lhs, rhs, nulls_equal, nans_equal, cudf::default_stream_value, mr); + lhs, rhs, nulls_equal, nans_equal, cudf::get_default_stream(), mr); } } // namespace cudf::lists diff --git a/cpp/src/lists/stream_compaction/apply_boolean_mask.cu b/cpp/src/lists/stream_compaction/apply_boolean_mask.cu index c99486ca8b0..c1c17dc0688 100644 --- a/cpp/src/lists/stream_compaction/apply_boolean_mask.cu +++ b/cpp/src/lists/stream_compaction/apply_boolean_mask.cu @@ -104,7 +104,7 @@ std::unique_ptr apply_boolean_mask(lists_column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); 
- return detail::apply_boolean_mask(input, boolean_mask, cudf::default_stream_value, mr); + return detail::apply_boolean_mask(input, boolean_mask, cudf::get_default_stream(), mr); } } // namespace cudf::lists diff --git a/cpp/src/lists/stream_compaction/distinct.cu b/cpp/src/lists/stream_compaction/distinct.cu index c88209292de..d0e4557663e 100644 --- a/cpp/src/lists/stream_compaction/distinct.cu +++ b/cpp/src/lists/stream_compaction/distinct.cu @@ -78,7 +78,7 @@ std::unique_ptr distinct(lists_column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::distinct(input, nulls_equal, nans_equal, cudf::default_stream_value, mr); + return detail::distinct(input, nulls_equal, nans_equal, cudf::get_default_stream(), mr); } } // namespace cudf::lists diff --git a/cpp/src/merge/merge.cu b/cpp/src/merge/merge.cu index 91018d3f006..d9c573e8155 100644 --- a/cpp/src/merge/merge.cu +++ b/cpp/src/merge/merge.cu @@ -171,7 +171,7 @@ index_vector generate_merged_indices(table_view const& left_table, std::vector const& column_order, std::vector const& null_precedence, bool nullable = true, - rmm::cuda_stream_view stream = cudf::default_stream_value) + rmm::cuda_stream_view stream = cudf::get_default_stream()) { const size_type left_size = left_table.num_rows(); const size_type right_size = right_table.num_rows(); @@ -540,7 +540,7 @@ std::unique_ptr merge(std::vector const& tables_to_merg { CUDF_FUNC_RANGE(); return detail::merge( - tables_to_merge, key_cols, column_order, null_precedence, cudf::default_stream_value, mr); + tables_to_merge, key_cols, column_order, null_precedence, cudf::get_default_stream(), mr); } } // namespace cudf diff --git a/cpp/src/partitioning/partitioning.cu b/cpp/src/partitioning/partitioning.cu index 3e0cc26dcdd..cbe65354696 100644 --- a/cpp/src/partitioning/partitioning.cu +++ b/cpp/src/partitioning/partitioning.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -436,15 +437,13 
@@ struct copy_block_partitions_dispatcher { grid_size, stream); - // Use gather instead for non-fixed width types - return type_dispatcher(input.type(), - detail::column_gatherer{}, - input, - gather_map.begin(), - gather_map.end(), - false, - stream, - mr); + auto gather_table = cudf::detail::gather(cudf::table_view({input}), + gather_map, + out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); + return std::move(gather_table->release().front()); } }; @@ -610,7 +609,7 @@ std::pair, std::vector> hash_partition_table( // Use the resulting scatter map to materialize the output auto output = detail::scatter( - input, row_partition_numbers.begin(), row_partition_numbers.end(), input, false, stream, mr); + input, row_partition_numbers.begin(), row_partition_numbers.end(), input, stream, mr); stream.synchronize(); // Async D2H copy must finish before returning host vec return std::pair(std::move(output), std::move(partition_offsets)); @@ -698,7 +697,7 @@ struct dispatch_map_type { // Scatter the rows into their partitions auto scattered = - cudf::detail::scatter(t, scatter_map.begin(), scatter_map.end(), t, false, stream, mr); + cudf::detail::scatter(t, scatter_map.begin(), scatter_map.end(), t, stream, mr); return std::pair(std::move(scattered), std::move(partition_offsets)); } @@ -797,7 +796,7 @@ std::pair, std::vector> partition( rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::partition(t, partition_map, num_partitions, cudf::default_stream_value, mr); + return detail::partition(t, partition_map, num_partitions, cudf::get_default_stream(), mr); } } // namespace cudf diff --git a/cpp/src/partitioning/round_robin.cu b/cpp/src/partitioning/round_robin.cu index d455df3e890..990992cd8f2 100644 --- a/cpp/src/partitioning/round_robin.cu +++ b/cpp/src/partitioning/round_robin.cu @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -153,7 +152,7 @@ std::pair, 
std::vector> round_robin_part table_view const& input, cudf::size_type num_partitions, cudf::size_type start_partition = 0, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto nrows = input.num_rows(); @@ -272,7 +271,7 @@ std::pair, std::vector> round_robi { CUDF_FUNC_RANGE(); return detail::round_robin_partition( - input, num_partitions, start_partition, cudf::default_stream_value, mr); + input, num_partitions, start_partition, cudf::get_default_stream(), mr); } } // namespace cudf diff --git a/cpp/src/quantiles/quantile.cu b/cpp/src/quantiles/quantile.cu index 1fe9809d922..1f1941529c9 100644 --- a/cpp/src/quantiles/quantile.cu +++ b/cpp/src/quantiles/quantile.cu @@ -189,7 +189,7 @@ std::unique_ptr quantile(column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::quantile(input, q, interp, ordered_indices, exact, cudf::default_stream_value, mr); + return detail::quantile(input, q, interp, ordered_indices, exact, cudf::get_default_stream(), mr); } } // namespace cudf diff --git a/cpp/src/quantiles/quantiles.cu b/cpp/src/quantiles/quantiles.cu index c6957482f05..e3e19eaeec4 100644 --- a/cpp/src/quantiles/quantiles.cu +++ b/cpp/src/quantiles/quantiles.cu @@ -83,12 +83,12 @@ std::unique_ptr
quantiles(table_view const& input, thrust::make_counting_iterator(0), q, interp, - cudf::default_stream_value, + cudf::get_default_stream(), mr); } else { auto sorted_idx = detail::sorted_order(input, column_order, null_precedence); return detail::quantiles( - input, sorted_idx->view().data(), q, interp, cudf::default_stream_value, mr); + input, sorted_idx->view().data(), q, interp, cudf::get_default_stream(), mr); } } @@ -109,7 +109,7 @@ std::unique_ptr
quantiles(table_view const& input, is_input_sorted, column_order, null_precedence, - cudf::default_stream_value, + cudf::get_default_stream(), mr); } diff --git a/cpp/src/quantiles/tdigest/tdigest.cu b/cpp/src/quantiles/tdigest/tdigest.cu index a11d7ab6646..0c90b0af8d2 100644 --- a/cpp/src/quantiles/tdigest/tdigest.cu +++ b/cpp/src/quantiles/tdigest/tdigest.cu @@ -14,13 +14,14 @@ * limitations under the License. */ +#include + #include #include #include #include #include #include -#include #include #include @@ -42,8 +43,8 @@ using namespace cudf::tdigest; namespace cudf { -namespace detail { namespace tdigest { +namespace detail { // https://developer.nvidia.com/blog/lerp-faster-cuda/ template @@ -338,7 +339,7 @@ std::unique_ptr make_empty_tdigest_scalar(rmm::cuda_stream_view stream, std::move(*std::make_unique
(std::move(contents.children))), true, stream, mr); } -} // namespace tdigest +} // namespace detail std::unique_ptr percentile_approx(tdigest_column_view const& input, column_view const& percentiles, @@ -354,8 +355,8 @@ std::unique_ptr percentile_approx(tdigest_column_view const& input, data_type{type_id::INT32}, input.size() + 1, mask_state::UNALLOCATED, stream, mr); auto const all_empty_rows = thrust::count_if(rmm::exec_policy(stream), - input.size_begin(), - input.size_begin() + input.size(), + detail::size_begin(input), + detail::size_begin(input) + input.size(), [] __device__(auto const x) { return x == 0; }) == input.size(); auto row_size_iter = thrust::make_constant_iterator(all_empty_rows ? 0 : percentiles.size()); thrust::exclusive_scan(rmm::exec_policy(stream), @@ -379,7 +380,7 @@ std::unique_ptr percentile_approx(tdigest_column_view const& input, // uninitialized) auto [bitmask, null_count] = [stream, mr, &tdv]() { auto tdigest_is_empty = thrust::make_transform_iterator( - tdv.size_begin(), + detail::size_begin(tdv), [] __device__(size_type tdigest_size) -> size_type { return tdigest_size == 0; }); auto const null_count = thrust::reduce(rmm::exec_policy(stream), tdigest_is_empty, tdigest_is_empty + tdv.size(), 0); @@ -390,24 +391,23 @@ std::unique_ptr percentile_approx(tdigest_column_view const& input, tdigest_is_empty, tdigest_is_empty + tdv.size(), thrust::logical_not{}, stream, mr); }(); - return cudf::make_lists_column( - input.size(), - std::move(offsets), - tdigest::compute_approx_percentiles(input, percentiles, stream, mr), - null_count, - std::move(bitmask), - stream, - mr); + return cudf::make_lists_column(input.size(), + std::move(offsets), + detail::compute_approx_percentiles(input, percentiles, stream, mr), + null_count, + std::move(bitmask), + stream, + mr); } -} // namespace detail +} // namespace tdigest std::unique_ptr percentile_approx(tdigest_column_view const& input, column_view const& percentiles, rmm::mr::device_memory_resource* mr) 
{ CUDF_FUNC_RANGE(); - return detail::percentile_approx(input, percentiles, cudf::default_stream_value, mr); + return tdigest::percentile_approx(input, percentiles, cudf::get_default_stream(), mr); } } // namespace cudf diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu index d870b73dff4..38c6cf7bd2e 100644 --- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu +++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu @@ -14,6 +14,8 @@ * limitations under the License. */ +#include + #include #include #include @@ -26,7 +28,6 @@ #include #include #include -#include #include #include @@ -52,10 +53,8 @@ #include namespace cudf { -namespace detail { namespace tdigest { - -using namespace cudf::tdigest; +namespace detail { namespace { @@ -596,7 +595,7 @@ std::unique_ptr build_output_column(size_type num_rows, // if there are no stub tdigests, we can return immediately. if (num_stubs == 0) { - return cudf::detail::tdigest::make_tdigest_column(num_rows, + return cudf::tdigest::detail::make_tdigest_column(num_rows, std::move(means), std::move(weights), std::move(offsets), @@ -642,7 +641,7 @@ std::unique_ptr build_output_column(size_type num_rows, 0); // assemble final column - return cudf::detail::tdigest::make_tdigest_column(num_rows, + return cudf::tdigest::detail::make_tdigest_column(num_rows, std::move(_means), std::move(_weights), std::move(offsets), @@ -708,7 +707,7 @@ std::unique_ptr compute_tdigests(int delta, // double // max // } // - if (total_clusters == 0) { return cudf::detail::tdigest::make_empty_tdigest_column(stream, mr); } + if (total_clusters == 0) { return cudf::tdigest::detail::make_empty_tdigest_column(stream, mr); } // each input group represents an individual tdigest. 
within each tdigest, we want the keys // to represent cluster indices (for example, if a tdigest had 100 clusters, the keys should fall @@ -1067,9 +1066,10 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, // generate min and max values auto merged_min_col = cudf::make_numeric_column( data_type{type_id::FLOAT64}, num_groups, mask_state::UNALLOCATED, stream, mr); - auto min_iter = thrust::make_transform_iterator( - thrust::make_zip_iterator(thrust::make_tuple(tdv.min_begin(), tdv.size_begin())), - tdigest_min{}); + auto min_iter = + thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple( + tdv.min_begin(), cudf::tdigest::detail::size_begin(tdv))), + tdigest_min{}); thrust::reduce_by_key(rmm::exec_policy(stream), group_labels, group_labels + num_group_labels, @@ -1081,9 +1081,10 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, auto merged_max_col = cudf::make_numeric_column( data_type{type_id::FLOAT64}, num_groups, mask_state::UNALLOCATED, stream, mr); - auto max_iter = thrust::make_transform_iterator( - thrust::make_zip_iterator(thrust::make_tuple(tdv.max_begin(), tdv.size_begin())), - tdigest_max{}); + auto max_iter = + thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple( + tdv.max_begin(), cudf::tdigest::detail::size_begin(tdv))), + tdigest_max{}); thrust::reduce_by_key(rmm::exec_policy(stream), group_labels, group_labels + num_group_labels, @@ -1190,7 +1191,7 @@ std::unique_ptr reduce_tdigest(column_view const& col, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (col.size() == 0) { return cudf::detail::tdigest::make_empty_tdigest_scalar(stream, mr); } + if (col.size() == 0) { return cudf::tdigest::detail::make_empty_tdigest_scalar(stream, mr); } // since this isn't coming out of a groupby, we need to sort the inputs in ascending // order with nulls at the end. 
@@ -1209,7 +1210,7 @@ std::unique_ptr reduce_merge_tdigest(column_view const& input, { tdigest_column_view tdv(input); - if (input.size() == 0) { return cudf::detail::tdigest::make_empty_tdigest_scalar(stream, mr); } + if (input.size() == 0) { return cudf::tdigest::detail::make_empty_tdigest_scalar(stream, mr); } auto h_group_offsets = cudf::detail::make_counting_transform_iterator( 0, [size = input.size()](size_type i) { return i == 0 ? 0 : size; }); @@ -1238,7 +1239,7 @@ std::unique_ptr group_tdigest(column_view const& col, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (col.size() == 0) { return cudf::detail::tdigest::make_empty_tdigest_column(stream, mr); } + if (col.size() == 0) { return cudf::tdigest::detail::make_empty_tdigest_column(stream, mr); } auto const delta = max_centroids; return cudf::type_dispatcher(col.type(), @@ -1264,7 +1265,7 @@ std::unique_ptr group_merge_tdigest(column_view const& input, tdigest_column_view tdv(input); if (num_groups == 0 || input.size() == 0) { - return cudf::detail::tdigest::make_empty_tdigest_column(stream, mr); + return cudf::tdigest::detail::make_empty_tdigest_column(stream, mr); } // bring group offsets back to the host @@ -1286,6 +1287,6 @@ std::unique_ptr group_merge_tdigest(column_view const& input, mr); } -} // namespace tdigest } // namespace detail +} // namespace tdigest } // namespace cudf diff --git a/cpp/src/quantiles/tdigest/tdigest_column_view.cpp b/cpp/src/quantiles/tdigest/tdigest_column_view.cpp index a86b40fd64a..cfcd21c5690 100644 --- a/cpp/src/quantiles/tdigest/tdigest_column_view.cpp +++ b/cpp/src/quantiles/tdigest/tdigest_column_view.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,13 +17,11 @@ #include #include #include -#include +#include namespace cudf { namespace tdigest { -using namespace cudf; - tdigest_column_view::tdigest_column_view(column_view const& col) : column_view(col) { // sanity check that this is actually tdigest data diff --git a/cpp/src/quantiles/tdigest/tdigest_util.cuh b/cpp/src/quantiles/tdigest/tdigest_util.cuh new file mode 100644 index 00000000000..d0e6484875b --- /dev/null +++ b/cpp/src/quantiles/tdigest/tdigest_util.cuh @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include + +namespace cudf { +namespace tdigest { +namespace detail { + +/** + * @brief Functor to compute the size of each tdigest of a column + */ +struct tdigest_size_fn { + size_type const* offsets; ///< Offsets of the t-digest column + /** + * @brief Returns size of the each tdigest in the column + * + * @param tdigest_index Index of the tdigest in the column + * @return Size of the tdigest + */ + __device__ size_type operator()(size_type tdigest_index) + { + return offsets[tdigest_index + 1] - offsets[tdigest_index]; + } +}; + +/** + * @brief Returns an iterator that returns the size of each tdigest + * in the column (each row is 1 digest) + * + * @return An iterator that returns the size of each tdigest in the column + */ +inline auto size_begin(tdigest_column_view const& tdv) +{ + return cudf::detail::make_counting_transform_iterator( + 0, tdigest_size_fn{tdv.centroids().offsets_begin()}); +} + +} // namespace detail +} // namespace tdigest +} // namespace cudf diff --git a/cpp/src/reductions/minmax.cu b/cpp/src/reductions/minmax.cu index e69942552ff..603e13c1894 100644 --- a/cpp/src/reductions/minmax.cu +++ b/cpp/src/reductions/minmax.cu @@ -277,7 +277,7 @@ std::pair, std::unique_ptr> minmax( const column_view& col, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::minmax(col, cudf::default_stream_value, mr); + return detail::minmax(col, cudf::get_default_stream(), mr); } } // namespace cudf diff --git a/cpp/src/reductions/reductions.cpp b/cpp/src/reductions/reductions.cpp index 523865e0df0..38db7eb3e89 100644 --- a/cpp/src/reductions/reductions.cpp +++ b/cpp/src/reductions/reductions.cpp @@ -49,7 +49,7 @@ struct reduce_dispatch_functor { } template - std::unique_ptr operator()(std::unique_ptr const& agg) + std::unique_ptr operator()(reduce_aggregation const& agg) { switch (k) { case aggregation::SUM: return reduction::sum(col, output_dtype, init, stream, mr); @@ -62,12 +62,12 @@ struct 
reduce_dispatch_functor { return reduction::sum_of_squares(col, output_dtype, stream, mr); case aggregation::MEAN: return reduction::mean(col, output_dtype, stream, mr); case aggregation::VARIANCE: { - auto var_agg = dynamic_cast(agg.get()); - return reduction::variance(col, output_dtype, var_agg->_ddof, stream, mr); + auto var_agg = static_cast(agg); + return reduction::variance(col, output_dtype, var_agg._ddof, stream, mr); } case aggregation::STD: { - auto var_agg = dynamic_cast(agg.get()); - return reduction::standard_deviation(col, output_dtype, var_agg->_ddof, stream, mr); + auto var_agg = static_cast(agg); + return reduction::standard_deviation(col, output_dtype, var_agg._ddof, stream, mr); } case aggregation::MEDIAN: { auto sorted_indices = sorted_order(table_view{{col}}, {}, {null_order::AFTER}, stream); @@ -78,60 +78,59 @@ struct reduce_dispatch_functor { return get_element(*col_ptr, 0, stream, mr); } case aggregation::QUANTILE: { - auto quantile_agg = dynamic_cast(agg.get()); - CUDF_EXPECTS(quantile_agg->_quantiles.size() == 1, + auto quantile_agg = static_cast(agg); + CUDF_EXPECTS(quantile_agg._quantiles.size() == 1, "Reduction quantile accepts only one quantile value"); auto sorted_indices = sorted_order(table_view{{col}}, {}, {null_order::AFTER}, stream); auto valid_sorted_indices = split(*sorted_indices, {col.size() - col.null_count()}, stream)[0]; auto col_ptr = quantile(col, - quantile_agg->_quantiles, - quantile_agg->_interpolation, + quantile_agg._quantiles, + quantile_agg._interpolation, valid_sorted_indices, true, stream); return get_element(*col_ptr, 0, stream, mr); } case aggregation::NUNIQUE: { - auto nunique_agg = dynamic_cast(agg.get()); + auto nunique_agg = static_cast(agg); return make_fixed_width_scalar( - detail::distinct_count( - col, nunique_agg->_null_handling, nan_policy::NAN_IS_VALID, stream), + detail::distinct_count(col, nunique_agg._null_handling, nan_policy::NAN_IS_VALID, stream), stream, mr); } case aggregation::NTH_ELEMENT: 
{ - auto nth_agg = dynamic_cast(agg.get()); - return reduction::nth_element(col, nth_agg->_n, nth_agg->_null_handling, stream, mr); + auto nth_agg = static_cast(agg); + return reduction::nth_element(col, nth_agg._n, nth_agg._null_handling, stream, mr); } case aggregation::COLLECT_LIST: { - auto col_agg = dynamic_cast(agg.get()); - return reduction::collect_list(col, col_agg->_null_handling, stream, mr); + auto col_agg = static_cast(agg); + return reduction::collect_list(col, col_agg._null_handling, stream, mr); } case aggregation::COLLECT_SET: { - auto col_agg = dynamic_cast(agg.get()); + auto col_agg = static_cast(agg); return reduction::collect_set( - col, col_agg->_null_handling, col_agg->_nulls_equal, col_agg->_nans_equal, stream, mr); + col, col_agg._null_handling, col_agg._nulls_equal, col_agg._nans_equal, stream, mr); } case aggregation::MERGE_LISTS: { return reduction::merge_lists(col, stream, mr); } case aggregation::MERGE_SETS: { - auto col_agg = dynamic_cast(agg.get()); - return reduction::merge_sets(col, col_agg->_nulls_equal, col_agg->_nans_equal, stream, mr); + auto col_agg = static_cast(agg); + return reduction::merge_sets(col, col_agg._nulls_equal, col_agg._nans_equal, stream, mr); } case aggregation::TDIGEST: { CUDF_EXPECTS(output_dtype.id() == type_id::STRUCT, "Tdigest aggregations expect output type to be STRUCT"); - auto td_agg = dynamic_cast(agg.get()); - return detail::tdigest::reduce_tdigest(col, td_agg->max_centroids, stream, mr); + auto td_agg = static_cast(agg); + return tdigest::detail::reduce_tdigest(col, td_agg.max_centroids, stream, mr); } case aggregation::MERGE_TDIGEST: { CUDF_EXPECTS(output_dtype.id() == type_id::STRUCT, "Tdigest aggregations expect output type to be STRUCT"); - auto td_agg = dynamic_cast(agg.get()); - return detail::tdigest::reduce_merge_tdigest(col, td_agg->max_centroids, stream, mr); + auto td_agg = static_cast(agg); + return tdigest::detail::reduce_merge_tdigest(col, td_agg.max_centroids, stream, mr); } default: 
CUDF_FAIL("Unsupported reduction operator"); } @@ -140,25 +139,25 @@ struct reduce_dispatch_functor { std::unique_ptr reduce( column_view const& col, - std::unique_ptr const& agg, + reduce_aggregation const& agg, data_type output_dtype, std::optional> init, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(!init.has_value() || col.type() == init.value().get().type(), "column and initial value must be the same type"); - if (init.has_value() && !(agg->kind == aggregation::SUM || agg->kind == aggregation::PRODUCT || - agg->kind == aggregation::MIN || agg->kind == aggregation::MAX || - agg->kind == aggregation::ANY || agg->kind == aggregation::ALL)) { + if (init.has_value() && !(agg.kind == aggregation::SUM || agg.kind == aggregation::PRODUCT || + agg.kind == aggregation::MIN || agg.kind == aggregation::MAX || + agg.kind == aggregation::ANY || agg.kind == aggregation::ALL)) { CUDF_FAIL( "Initial value is only supported for SUM, PRODUCT, MIN, MAX, ANY, and ALL aggregation types"); } // Returns default scalar if input column is non-valid. In terms of nested columns, we need to // handcraft the default scalar with input column. 
if (col.size() <= col.null_count()) { - if (agg->kind == aggregation::TDIGEST || agg->kind == aggregation::MERGE_TDIGEST) { - return detail::tdigest::make_empty_tdigest_scalar(); + if (agg.kind == aggregation::TDIGEST || agg.kind == aggregation::MERGE_TDIGEST) { + return tdigest::detail::make_empty_tdigest_scalar(stream); } if (col.type().id() == type_id::EMPTY || col.type() != output_dtype) { // Under some circumstance, the output type will become the List of input type, @@ -176,26 +175,26 @@ std::unique_ptr reduce( } return aggregation_dispatcher( - agg->kind, reduce_dispatch_functor{col, output_dtype, init, stream, mr}, agg); + agg.kind, reduce_dispatch_functor{col, output_dtype, init, stream, mr}, agg); } } // namespace detail std::unique_ptr reduce(column_view const& col, - std::unique_ptr const& agg, + reduce_aggregation const& agg, data_type output_dtype, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::reduce(col, agg, output_dtype, std::nullopt, cudf::default_stream_value, mr); + return detail::reduce(col, agg, output_dtype, std::nullopt, cudf::get_default_stream(), mr); } std::unique_ptr reduce(column_view const& col, - std::unique_ptr const& agg, + reduce_aggregation const& agg, data_type output_dtype, std::optional> init, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::reduce(col, agg, output_dtype, init, cudf::default_stream_value, mr); + return detail::reduce(col, agg, output_dtype, init, cudf::get_default_stream(), mr); } } // namespace cudf diff --git a/cpp/src/reductions/scan/scan.cpp b/cpp/src/reductions/scan/scan.cpp index 85c0f7ea13f..2871ee283ba 100644 --- a/cpp/src/reductions/scan/scan.cpp +++ b/cpp/src/reductions/scan/scan.cpp @@ -25,16 +25,16 @@ namespace cudf { namespace detail { std::unique_ptr scan(column_view const& input, - std::unique_ptr const& agg, + scan_aggregation const& agg, scan_type inclusive, null_policy null_handling, rmm::cuda_stream_view stream, 
rmm::mr::device_memory_resource* mr) { - if (agg->kind == aggregation::RANK) { + if (agg.kind == aggregation::RANK) { CUDF_EXPECTS(inclusive == scan_type::INCLUSIVE, "Rank aggregation operator requires an inclusive scan"); - auto const& rank_agg = dynamic_cast(*agg); + auto const& rank_agg = static_cast(agg); if (rank_agg._method == rank_method::MIN) { if (rank_agg._percentage == rank_percentage::NONE) { return inclusive_rank_scan(input, stream, mr); @@ -55,13 +55,13 @@ std::unique_ptr scan(column_view const& input, } // namespace detail std::unique_ptr scan(column_view const& input, - std::unique_ptr const& agg, + scan_aggregation const& agg, scan_type inclusive, null_policy null_handling, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::scan(input, agg, inclusive, null_handling, cudf::default_stream_value, mr); + return detail::scan(input, agg, inclusive, null_handling, cudf::get_default_stream(), mr); } } // namespace cudf diff --git a/cpp/src/reductions/scan/scan.cuh b/cpp/src/reductions/scan/scan.cuh index 127f2ae95b4..2ad6124cdd0 100644 --- a/cpp/src/reductions/scan/scan.cuh +++ b/cpp/src/reductions/scan/scan.cuh @@ -35,12 +35,12 @@ rmm::device_buffer mask_scan(column_view const& input_view, template