From 59b09fd097e39bd15646eac1156889692974dc5f Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 29 Sep 2023 11:10:25 -0500 Subject: [PATCH 01/12] cuDF: Build CUDA 12.0 ARM conda packages. (#14112) This PR builds conda packages using CUDA 12 on ARM. This work is targeting 23.12 and depends on https://github.com/rapidsai/rmm/pull/1330. Closes #14128. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cudf/pull/14112 --- .github/workflows/build.yaml | 16 ++++++++-------- .github/workflows/pr.yaml | 28 ++++++++++++++-------------- .github/workflows/test.yaml | 16 ++++++++-------- dependencies.yaml | 20 ++------------------ 4 files changed, 32 insertions(+), 48 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index ab028eb89cc..dc2c81d1c77 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@cuda-120-arm with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@cuda-120-arm with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@cuda-120-arm with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ 
inputs.branch }} @@ -57,7 +57,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-120-arm with: arch: "amd64" branch: ${{ inputs.branch }} @@ -69,7 +69,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-cudf: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@cuda-120-arm with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -79,7 +79,7 @@ jobs: wheel-publish-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-publish.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-publish.yaml@cuda-120-arm with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -89,7 +89,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-publish-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@cuda-120-arm with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1"))) build_type: ${{ inputs.build_type || 'branch' }} @@ -100,7 +100,7 @@ jobs: wheel-publish-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-publish.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-publish.yaml@cuda-120-arm with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 214f9c90b41..047b80f2e5c 100644 --- a/.github/workflows/pr.yaml +++ 
b/.github/workflows/pr.yaml @@ -26,34 +26,34 @@ jobs: - wheel-build-dask-cudf - wheel-tests-dask-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@cuda-120-arm checks: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@cuda-120-arm with: enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@cuda-120-arm with: build_type: pull-request conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@cuda-120-arm with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@cuda-120-arm with: build_type: pull-request conda-python-cudf-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@cuda-120-arm with: build_type: pull-request test_script: "ci/test_python_cudf.sh" @@ -61,14 +61,14 @@ jobs: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 + uses: 
rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@cuda-120-arm with: build_type: pull-request test_script: "ci/test_python_other.sh" conda-java-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-120-arm with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -78,7 +78,7 @@ jobs: conda-notebook-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-120-arm with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -88,7 +88,7 @@ jobs: docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-120-arm with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -98,21 +98,21 @@ jobs: wheel-build-cudf: needs: checks secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@cuda-120-arm with: build_type: pull-request script: "ci/build_wheel_cudf.sh" wheel-tests-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@cuda-120-arm with: build_type: pull-request script: ci/test_wheel_cudf.sh wheel-build-dask-cudf: needs: wheel-tests-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@cuda-120-arm with: matrix_filter: map(select(.ARCH == 
"amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1"))) build_type: pull-request @@ -120,7 +120,7 @@ jobs: wheel-tests-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@cuda-120-arm with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1"))) build_type: pull-request diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 9ca32bcfe03..e58227c30dc 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@cuda-120-arm with: build_type: nightly branch: ${{ inputs.branch }} @@ -24,7 +24,7 @@ jobs: sha: ${{ inputs.sha }} conda-cpp-memcheck-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-120-arm with: build_type: nightly branch: ${{ inputs.branch }} @@ -36,7 +36,7 @@ jobs: run_script: "ci/test_cpp_memcheck.sh" conda-python-cudf-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@cuda-120-arm with: build_type: nightly branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: conda-python-other-tests: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 + uses: 
rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@cuda-120-arm with: build_type: nightly branch: ${{ inputs.branch }} @@ -55,7 +55,7 @@ jobs: test_script: "ci/test_python_other.sh" conda-java-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-120-arm with: build_type: nightly branch: ${{ inputs.branch }} @@ -67,7 +67,7 @@ jobs: run_script: "ci/test_java.sh" conda-notebook-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-120-arm with: build_type: nightly branch: ${{ inputs.branch }} @@ -79,7 +79,7 @@ jobs: run_script: "ci/test_notebooks.sh" wheel-tests-cudf: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@cuda-120-arm with: build_type: nightly branch: ${{ inputs.branch }} @@ -88,7 +88,7 @@ jobs: script: ci/test_wheel_cudf.sh wheel-tests-dask-cudf: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@cuda-120-arm with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1"))) build_type: nightly diff --git a/dependencies.yaml b/dependencies.yaml index c8ee66bd99f..c19e8765be3 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -227,25 +227,9 @@ dependencies: # in sync with the version pinned in get_arrow.cmake. 
- libarrow==12.0.1.* - librdkafka>=1.9.0,<1.10.0a0 + # Align nvcomp version with rapids-cmake + - nvcomp==2.6.1 - spdlog>=1.11.0,<1.12 - specific: - - output_types: conda - matrices: - - matrix: - arch: x86_64 - packages: - # Align nvcomp version with rapids-cmake - # TODO: not yet available for aarch64 CUDA 12 - - &nvcomp nvcomp==2.6.1 - - matrix: - arch: aarch64 - cuda: "11.8" - packages: - - *nvcomp - # TODO: Fallback matrix for aarch64 CUDA 12. After migrating to nvcomp 3, - # all CUDA/arch combinations should be supported by existing packages. - - matrix: - packages: build_wheels: common: - output_types: pyproject From 66a655ce80e8b0accb80ea4e23799d23a82a35a2 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 3 Oct 2023 08:00:44 -0500 Subject: [PATCH 02/12] Fix inaccuracy in decimal128 rounding. (#14233) Fixes a bug where floating-point values were used in decimal128 rounding, giving wrong results. Closes https://github.com/rapidsai/cudf/issues/14210. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Divye Gala (https://github.com/divyegala) - Mark Harris (https://github.com/harrism) --- cpp/src/round/round.cu | 5 ++- cpp/tests/round/round_tests.cpp | 79 +++++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+), 1 deletion(-) diff --git a/cpp/src/round/round.cu b/cpp/src/round/round.cu index 4b3f80fc6e2..41cce57d55b 100644 --- a/cpp/src/round/round.cu +++ b/cpp/src/round/round.cu @@ -271,7 +271,10 @@ std::unique_ptr round_with(column_view const& input, out_view.template end(), static_cast(0)); } else { - Type const n = std::pow(10, scale_movement); + Type n = 10; + for (int i = 1; i < scale_movement; ++i) { + n *= 10; + } thrust::transform(rmm::exec_policy(stream), input.begin(), input.end(), diff --git a/cpp/tests/round/round_tests.cpp b/cpp/tests/round/round_tests.cpp index d802c0c2706..f97bb7a5323 100644 --- a/cpp/tests/round/round_tests.cpp +++ b/cpp/tests/round/round_tests.cpp @@ -703,4 +703,83 @@ TEST_F(RoundTests, 
BoolTestHalfUp) EXPECT_THROW(cudf::round(input, -2, cudf::rounding_method::HALF_UP), cudf::logic_error); } +// Use __uint128_t for demonstration. +constexpr __uint128_t operator""_uint128_t(const char* s) +{ + __uint128_t ret = 0; + for (int i = 0; s[i] != '\0'; ++i) { + ret *= 10; + if ('0' <= s[i] && s[i] <= '9') { ret += s[i] - '0'; } + } + return ret; +} + +TEST_F(RoundTests, HalfEvenErrorsA) +{ + using namespace numeric; + using RepType = cudf::device_storage_type_t; + using fp_wrapper = cudf::test::fixed_point_column_wrapper; + + { + // 0.5 at scale -37 should round HALF_EVEN to 0, because 0 is an even number + auto const input = + fp_wrapper{{5000000000000000000000000000000000000_uint128_t}, scale_type{-37}}; + auto const expected = fp_wrapper{{0}, scale_type{0}}; + auto const result = cudf::round(input, 0, cudf::rounding_method::HALF_EVEN); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); + } +} + +TEST_F(RoundTests, HalfEvenErrorsB) +{ + using namespace numeric; + using RepType = cudf::device_storage_type_t; + using fp_wrapper = cudf::test::fixed_point_column_wrapper; + + { + // 0.125 at scale -37 should round HALF_EVEN to 0.12, because 2 is an even number + auto const input = + fp_wrapper{{1250000000000000000000000000000000000_uint128_t}, scale_type{-37}}; + auto const expected = fp_wrapper{{12}, scale_type{-2}}; + auto const result = cudf::round(input, 2, cudf::rounding_method::HALF_EVEN); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); + } +} + +TEST_F(RoundTests, HalfEvenErrorsC) +{ + using namespace numeric; + using RepType = cudf::device_storage_type_t; + using fp_wrapper = cudf::test::fixed_point_column_wrapper; + + { + // 0.0625 at scale -37 should round HALF_EVEN to 0.062, because 2 is an even number + auto const input = + fp_wrapper{{0625000000000000000000000000000000000_uint128_t}, scale_type{-37}}; + auto const expected = fp_wrapper{{62}, scale_type{-3}}; + auto const result = cudf::round(input, 3, 
cudf::rounding_method::HALF_EVEN); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); + } +} + +TEST_F(RoundTests, HalfUpErrorsA) +{ + using namespace numeric; + using RepType = cudf::device_storage_type_t; + using fp_wrapper = cudf::test::fixed_point_column_wrapper; + + { + // 0.25 at scale -37 should round HALF_UP to 0.3 + auto const input = + fp_wrapper{{2500000000000000000000000000000000000_uint128_t}, scale_type{-37}}; + auto const expected = fp_wrapper{{3}, scale_type{-1}}; + auto const result = cudf::round(input, 1, cudf::rounding_method::HALF_UP); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); + } +} + CUDF_TEST_PROGRAM_MAIN() From 3964950ba2fecf7f962917276058a6381d396246 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 3 Oct 2023 15:11:15 -0500 Subject: [PATCH 03/12] Fix inaccurate ceil/floor and inaccurate rescaling casts of fixed-point values. (#14242) This is a follow-up PR to #14233. This PR fixes a bug where floating-point values were used as intermediates in ceil/floor unary operations and cast operations that require rescaling for fixed-point types, giving inaccurate results. 
See also: - https://github.com/rapidsai/cudf/pull/14233#discussion_r1340786769 - https://github.com/rapidsai/cudf/issues/14243 Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Mike Wilson (https://github.com/hyperbolic2346) - Vukasin Milovanovic (https://github.com/vuule) --- cpp/src/unary/cast_ops.cu | 8 +++++- cpp/src/unary/math_ops.cu | 8 ++++-- cpp/tests/unary/cast_tests.cpp | 40 ++++++++++++++++++++++++++++++ cpp/tests/unary/unary_ops_test.cpp | 33 ++++++++++++++++++++++++ 4 files changed, 86 insertions(+), 3 deletions(-) diff --git a/cpp/src/unary/cast_ops.cu b/cpp/src/unary/cast_ops.cu index f40ace8d10b..1c81f266200 100644 --- a/cpp/src/unary/cast_ops.cu +++ b/cpp/src/unary/cast_ops.cu @@ -199,7 +199,13 @@ std::unique_ptr rescale(column_view input, } return output_column; } - auto const scalar = make_fixed_point_scalar(std::pow(10, -diff), scale_type{diff}, stream); + + RepType scalar_value = 10; + for (int i = 1; i < -diff; ++i) { + scalar_value *= 10; + } + + auto const scalar = make_fixed_point_scalar(scalar_value, scale_type{diff}, stream); return detail::binary_operation(input, *scalar, binary_operator::DIV, type, stream, mr); } }; diff --git a/cpp/src/unary/math_ops.cu b/cpp/src/unary/math_ops.cu index 961f3a9e720..d0cae81a9c8 100644 --- a/cpp/src/unary/math_ops.cu +++ b/cpp/src/unary/math_ops.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -295,7 +295,11 @@ std::unique_ptr unary_op_with(column_view const& input, input.type(), input.size(), copy_bitmask(input, stream, mr), input.null_count(), stream, mr); auto out_view = result->mutable_view(); - Type const n = std::pow(10, -input.type().scale()); + + Type n = 10; + for (int i = 1; i < -input.type().scale(); ++i) { + n *= 10; + } thrust::transform(rmm::exec_policy(stream), input.begin(), diff --git a/cpp/tests/unary/cast_tests.cpp b/cpp/tests/unary/cast_tests.cpp index 9506e1918c0..d565359a4ea 100644 --- a/cpp/tests/unary/cast_tests.cpp +++ b/cpp/tests/unary/cast_tests.cpp @@ -30,6 +30,8 @@ #include #include +#include + #include #include @@ -967,6 +969,44 @@ TYPED_TEST(FixedPointTests, Decimal128ToDecimalXXWithLargerScale) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } +TYPED_TEST(FixedPointTests, ValidateCastRescalePrecision) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = cudf::device_storage_type_t; + using fp_wrapper = cudf::test::fixed_point_column_wrapper; + + // This test is designed to protect against floating point conversion + // introducing errors in fixed-point arithmetic. The rescaling that occurs + // during casting to different scales should only use fixed-precision math. + // Realistically, we are only able to show precision failures due to floating + // conversion in a few very specific circumstances where dividing by specific + // powers of 10 works against us. Some examples: 10^23, 10^25, 10^26, 10^27, + // 10^30, 10^32, 10^36. See https://godbolt.org/z/cP1MddP8P for a derivation. + // For completeness and to ensure that we are not missing any other cases, we + // test casting to/from all scales in the range of each decimal type. Values + // that are powers of ten show this error more readily than non-powers of 10 + // because the rescaling factor is a power of 10, meaning that errors in + // division are more visible. 
+ constexpr auto min_scale = -cuda::std::numeric_limits::digits10; + for (int input_scale = 0; input_scale >= min_scale; --input_scale) { + for (int result_scale = 0; result_scale >= min_scale; --result_scale) { + RepType input_value = 1; + for (int k = 0; k > input_scale; --k) { + input_value *= 10; + } + RepType result_value = 1; + for (int k = 0; k > result_scale; --k) { + result_value *= 10; + } + auto const input = fp_wrapper{{input_value}, scale_type{input_scale}}; + auto const expected = fp_wrapper{{result_value}, scale_type{result_scale}}; + auto const result = cudf::cast(input, make_fixed_point_data_type(result_scale)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); + } + } +} + TYPED_TEST(FixedPointTests, Decimal32ToDecimalXXWithLargerScaleAndNullMask) { using namespace numeric; diff --git a/cpp/tests/unary/unary_ops_test.cpp b/cpp/tests/unary/unary_ops_test.cpp index 49764f22373..76d1f769856 100644 --- a/cpp/tests/unary/unary_ops_test.cpp +++ b/cpp/tests/unary/unary_ops_test.cpp @@ -24,6 +24,8 @@ #include +#include + template cudf::test::fixed_width_column_wrapper create_fixed_columns(cudf::size_type start, cudf::size_type size, @@ -372,4 +374,35 @@ TYPED_TEST(FixedPointUnaryTests, FixedPointUnaryFloorLarge) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); } +TYPED_TEST(FixedPointUnaryTests, ValidateCeilFloorPrecision) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = cudf::device_storage_type_t; + using fp_wrapper = cudf::test::fixed_point_column_wrapper; + + // This test is designed to protect against floating point conversion + // introducing errors in fixed-point arithmetic. The rounding that occurs + // during ceil/floor should only use fixed-precision math. Realistically, + // we are only able to show precision failures due to floating conversion in + // a few very specific circumstances where dividing by specific powers of 10 + // works against us. 
Some examples: 10^23, 10^25, 10^26, 10^27, 10^30, + // 10^32, 10^36. See https://godbolt.org/z/cP1MddP8P for a derivation. For + // completeness and to ensure that we are not missing any other cases, we + // test all scales representable in the range of each decimal type. + constexpr auto min_scale = -cuda::std::numeric_limits::digits10; + for (int input_scale = 0; input_scale >= min_scale; --input_scale) { + RepType input_value = 1; + for (int k = 0; k > input_scale; --k) { + input_value *= 10; + } + auto const input = fp_wrapper{{input_value}, scale_type{input_scale}}; + auto const expected = fp_wrapper{{input_value}, scale_type{input_scale}}; + auto const ceil_result = cudf::unary_operation(input, cudf::unary_operator::CEIL); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, ceil_result->view()); + auto const floor_result = cudf::unary_operation(input, cudf::unary_operator::FLOOR); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, floor_result->view()); + } +} + CUDF_TEST_PROGRAM_MAIN() From 29556a2514f4d274164a27a80539410da7e132d6 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 3 Oct 2023 14:44:28 -0700 Subject: [PATCH 04/12] Remove the use of volatile in ORC (#14175) `volatile` should not be required in our code, unless there are compiler or synchronization issues. This PR removes the use in ORC reader and writer. 
Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14175 --- cpp/src/io/orc/stats_enc.cu | 4 +- cpp/src/io/orc/stripe_data.cu | 82 +++++++++++++++-------------------- cpp/src/io/orc/stripe_enc.cu | 14 +++--- cpp/src/io/orc/stripe_init.cu | 2 +- 4 files changed, 46 insertions(+), 56 deletions(-) diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu index 95f1db5bfd1..479a2dfada3 100644 --- a/cpp/src/io/orc/stats_enc.cu +++ b/cpp/src/io/orc/stats_enc.cu @@ -76,8 +76,8 @@ __global__ void __launch_bounds__(block_size, 1) { using block_scan = cub::BlockScan; __shared__ typename block_scan::TempStorage temp_storage; - volatile uint32_t stats_size = 0; - auto t = threadIdx.x; + uint32_t stats_size = 0; + auto t = threadIdx.x; __syncthreads(); for (thread_index_type start = 0; start < statistics_count; start += block_size) { uint32_t stats_len = 0, stats_pos; diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index 3edcd3d83b2..0b249bbdafe 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -142,9 +142,7 @@ struct orcdec_state_s { * @param[in] base Pointer to raw byte stream data * @param[in] len Stream length in bytes */ -static __device__ void bytestream_init(volatile orc_bytestream_s* bs, - uint8_t const* base, - uint32_t len) +static __device__ void bytestream_init(orc_bytestream_s* bs, uint8_t const* base, uint32_t len) { uint32_t pos = (len > 0) ? 
static_cast(7 & reinterpret_cast(base)) : 0; bs->base = base - pos; @@ -160,8 +158,7 @@ static __device__ void bytestream_init(volatile orc_bytestream_s* bs, * @param[in] bs Byte stream input * @param[in] bytes_consumed Number of bytes that were consumed */ -static __device__ void bytestream_flush_bytes(volatile orc_bytestream_s* bs, - uint32_t bytes_consumed) +static __device__ void bytestream_flush_bytes(orc_bytestream_s* bs, uint32_t bytes_consumed) { uint32_t pos = bs->pos; uint32_t len = bs->len; @@ -197,7 +194,7 @@ static __device__ void bytestream_fill(orc_bytestream_s* bs, int t) * @param[in] pos Position in byte stream * @return byte */ -inline __device__ uint8_t bytestream_readbyte(volatile orc_bytestream_s* bs, int pos) +inline __device__ uint8_t bytestream_readbyte(orc_bytestream_s* bs, int pos) { return bs->buf.u8[pos & (bytestream_buffer_size - 1)]; } @@ -209,7 +206,7 @@ inline __device__ uint8_t bytestream_readbyte(volatile orc_bytestream_s* bs, int * @param[in] pos Position in byte stream * @result bits */ -inline __device__ uint32_t bytestream_readu32(volatile orc_bytestream_s* bs, int pos) +inline __device__ uint32_t bytestream_readu32(orc_bytestream_s* bs, int pos) { uint32_t a = bs->buf.u32[(pos & (bytestream_buffer_size - 1)) >> 2]; uint32_t b = bs->buf.u32[((pos + 4) & (bytestream_buffer_size - 1)) >> 2]; @@ -224,7 +221,7 @@ inline __device__ uint32_t bytestream_readu32(volatile orc_bytestream_s* bs, int * @param[in] numbits number of bits * @return bits */ -inline __device__ uint64_t bytestream_readu64(volatile orc_bytestream_s* bs, int pos) +inline __device__ uint64_t bytestream_readu64(orc_bytestream_s* bs, int pos) { uint32_t a = bs->buf.u32[(pos & (bytestream_buffer_size - 1)) >> 2]; uint32_t b = bs->buf.u32[((pos + 4) & (bytestream_buffer_size - 1)) >> 2]; @@ -245,9 +242,7 @@ inline __device__ uint64_t bytestream_readu64(volatile orc_bytestream_s* bs, int * @param[in] numbits number of bits * @return decoded value */ -inline __device__ 
uint32_t bytestream_readbits(volatile orc_bytestream_s* bs, - int bitpos, - uint32_t numbits) +inline __device__ uint32_t bytestream_readbits(orc_bytestream_s* bs, int bitpos, uint32_t numbits) { int idx = bitpos >> 5; uint32_t a = __byte_perm(bs->buf.u32[(idx + 0) & bytestream_buffer_mask], 0, 0x0123); @@ -263,9 +258,7 @@ inline __device__ uint32_t bytestream_readbits(volatile orc_bytestream_s* bs, * @param[in] numbits number of bits * @return decoded value */ -inline __device__ uint64_t bytestream_readbits64(volatile orc_bytestream_s* bs, - int bitpos, - uint32_t numbits) +inline __device__ uint64_t bytestream_readbits64(orc_bytestream_s* bs, int bitpos, uint32_t numbits) { int idx = bitpos >> 5; uint32_t a = __byte_perm(bs->buf.u32[(idx + 0) & bytestream_buffer_mask], 0, 0x0123); @@ -288,7 +281,7 @@ inline __device__ uint64_t bytestream_readbits64(volatile orc_bytestream_s* bs, * @param[in] numbits number of bits * @param[out] result decoded value */ -inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs, +inline __device__ void bytestream_readbe(orc_bytestream_s* bs, int bitpos, uint32_t numbits, uint32_t& result) @@ -304,7 +297,7 @@ inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs, * @param[in] numbits number of bits * @param[out] result decoded value */ -inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs, +inline __device__ void bytestream_readbe(orc_bytestream_s* bs, int bitpos, uint32_t numbits, int32_t& result) @@ -321,7 +314,7 @@ inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs, * @param[in] numbits number of bits * @param[out] result decoded value */ -inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs, +inline __device__ void bytestream_readbe(orc_bytestream_s* bs, int bitpos, uint32_t numbits, uint64_t& result) @@ -337,7 +330,7 @@ inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs, * @param[in] numbits number of bits * @param[out] 
result decoded value */ -inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs, +inline __device__ void bytestream_readbe(orc_bytestream_s* bs, int bitpos, uint32_t numbits, int64_t& result) @@ -354,7 +347,7 @@ inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs, * @return length of varint in bytes */ template -inline __device__ uint32_t varint_length(volatile orc_bytestream_s* bs, int pos) +inline __device__ uint32_t varint_length(orc_bytestream_s* bs, int pos) { if (bytestream_readbyte(bs, pos) > 0x7f) { uint32_t next32 = bytestream_readu32(bs, pos + 1); @@ -392,7 +385,7 @@ inline __device__ uint32_t varint_length(volatile orc_bytestream_s* bs, int pos) * @return new position in byte stream buffer */ template -inline __device__ int decode_base128_varint(volatile orc_bytestream_s* bs, int pos, T& result) +inline __device__ int decode_base128_varint(orc_bytestream_s* bs, int pos, T& result) { uint32_t v = bytestream_readbyte(bs, pos++); if (v > 0x7f) { @@ -446,7 +439,7 @@ inline __device__ int decode_base128_varint(volatile orc_bytestream_s* bs, int p /** * @brief Decodes a signed int128 encoded as base-128 varint (used for decimals) */ -inline __device__ __int128_t decode_varint128(volatile orc_bytestream_s* bs, int pos) +inline __device__ __int128_t decode_varint128(orc_bytestream_s* bs, int pos) { auto byte = bytestream_readbyte(bs, pos++); __int128_t const sign_mask = -(int32_t)(byte & 1); @@ -463,7 +456,7 @@ inline __device__ __int128_t decode_varint128(volatile orc_bytestream_s* bs, int /** * @brief Decodes an unsigned 32-bit varint */ -inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, uint32_t& result) +inline __device__ int decode_varint(orc_bytestream_s* bs, int pos, uint32_t& result) { uint32_t u; pos = decode_base128_varint(bs, pos, u); @@ -474,7 +467,7 @@ inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, uint /** * @brief Decodes an unsigned 64-bit varint */ -inline 
__device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, uint64_t& result) +inline __device__ int decode_varint(orc_bytestream_s* bs, int pos, uint64_t& result) { uint64_t u; pos = decode_base128_varint(bs, pos, u); @@ -485,7 +478,7 @@ inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, uint /** * @brief Signed version of 32-bit decode_varint */ -inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, int32_t& result) +inline __device__ int decode_varint(orc_bytestream_s* bs, int pos, int32_t& result) { uint32_t u; pos = decode_base128_varint(bs, pos, u); @@ -496,7 +489,7 @@ inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, int3 /** * @brief Signed version of 64-bit decode_varint */ -inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, int64_t& result) +inline __device__ int decode_varint(orc_bytestream_s* bs, int pos, int64_t& result) { uint64_t u; pos = decode_base128_varint(bs, pos, u); @@ -514,7 +507,7 @@ inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, int6 * @return number of values decoded */ template -inline __device__ void lengths_to_positions(volatile T* vals, uint32_t numvals, unsigned int t) +inline __device__ void lengths_to_positions(T* vals, uint32_t numvals, unsigned int t) { for (uint32_t n = 1; n < numvals; n <<= 1) { __syncthreads(); @@ -534,8 +527,8 @@ inline __device__ void lengths_to_positions(volatile T* vals, uint32_t numvals, * @return number of values decoded */ template -static __device__ uint32_t Integer_RLEv1( - orc_bytestream_s* bs, volatile orc_rlev1_state_s* rle, volatile T* vals, uint32_t maxvals, int t) +static __device__ uint32_t +Integer_RLEv1(orc_bytestream_s* bs, orc_rlev1_state_s* rle, T* vals, uint32_t maxvals, int t) { uint32_t numvals, numruns; if (t == 0) { @@ -642,8 +635,8 @@ static const __device__ __constant__ uint8_t ClosestFixedBitsMap[65] = { */ template static __device__ uint32_t 
Integer_RLEv2(orc_bytestream_s* bs, - volatile orc_rlev2_state_s* rle, - volatile T* vals, + orc_rlev2_state_s* rle, + T* vals, uint32_t maxvals, int t, bool has_buffered_values = false) @@ -883,7 +876,7 @@ static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs, * * @return 32-bit value */ -inline __device__ uint32_t rle8_read_bool32(volatile uint32_t* vals, uint32_t bitpos) +inline __device__ uint32_t rle8_read_bool32(uint32_t* vals, uint32_t bitpos) { uint32_t a = vals[(bitpos >> 5) + 0]; uint32_t b = vals[(bitpos >> 5) + 1]; @@ -903,11 +896,8 @@ inline __device__ uint32_t rle8_read_bool32(volatile uint32_t* vals, uint32_t bi * * @return number of values decoded */ -static __device__ uint32_t Byte_RLE(orc_bytestream_s* bs, - volatile orc_byterle_state_s* rle, - volatile uint8_t* vals, - uint32_t maxvals, - int t) +static __device__ uint32_t +Byte_RLE(orc_bytestream_s* bs, orc_byterle_state_s* rle, uint8_t* vals, uint32_t maxvals, int t) { uint32_t numvals, numruns; int r, tr; @@ -1006,8 +996,8 @@ static const __device__ __constant__ int64_t kPow5i[28] = {1, * @return number of values decoded */ static __device__ int Decode_Decimals(orc_bytestream_s* bs, - volatile orc_byterle_state_s* scratch, - volatile orcdec_state_s::values& vals, + orc_byterle_state_s* scratch, + orcdec_state_s::values& vals, int val_scale, int numvals, type_id dtype_id, @@ -1241,8 +1231,8 @@ __global__ void __launch_bounds__(block_size) } __syncthreads(); while (s->top.dict.dict_len > 0) { - uint32_t numvals = min(s->top.dict.dict_len, blockDim.x), len; - volatile uint32_t* vals = s->vals.u32; + uint32_t numvals = min(s->top.dict.dict_len, blockDim.x), len; + uint32_t* vals = s->vals.u32; bytestream_fill(&s->bs, t); __syncthreads(); if (is_rlev1(s->chunk.encoding_kind)) { @@ -1310,12 +1300,12 @@ static __device__ void DecodeRowPositions(orcdec_state_s* s, min((row_decoder_buffer_size - s->u.rowdec.nz_count) * 2, blockDim.x)); if (s->chunk.valid_map_base != nullptr) { // We have a 
present stream - uint32_t rmax = s->top.data.end_row - min((uint32_t)first_row, s->top.data.end_row); - auto r = (uint32_t)(s->top.data.cur_row + s->top.data.nrows + t - first_row); - uint32_t valid = (t < nrows && r < rmax) - ? (((uint8_t const*)s->chunk.valid_map_base)[r >> 3] >> (r & 7)) & 1 - : 0; - volatile auto* row_ofs_plus1 = (volatile uint16_t*)&s->u.rowdec.row[s->u.rowdec.nz_count]; + uint32_t rmax = s->top.data.end_row - min((uint32_t)first_row, s->top.data.end_row); + auto r = (uint32_t)(s->top.data.cur_row + s->top.data.nrows + t - first_row); + uint32_t valid = (t < nrows && r < rmax) + ? (((uint8_t const*)s->chunk.valid_map_base)[r >> 3] >> (r & 7)) & 1 + : 0; + auto* row_ofs_plus1 = (uint16_t*)&s->u.rowdec.row[s->u.rowdec.nz_count]; uint32_t nz_pos, row_plus1, nz_count = s->u.rowdec.nz_count, last_row; if (t < nrows) { row_ofs_plus1[t] = valid; } lengths_to_positions(row_ofs_plus1, nrows, t); diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 73c41e2bbcd..4841fb1141a 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -53,7 +53,7 @@ constexpr bool zero_pll_war = true; struct byterle_enc_state_s { uint32_t literal_run; uint32_t repeat_run; - volatile uint32_t rpt_map[(512 / 32) + 1]; + uint32_t rpt_map[(512 / 32) + 1]; }; struct intrle_enc_state_s { @@ -63,7 +63,7 @@ struct intrle_enc_state_s { uint32_t literal_w; uint32_t hdr_bytes; uint32_t pl_bytes; - volatile uint32_t delta_map[(512 / 32) + 1]; + uint32_t delta_map[(512 / 32) + 1]; }; struct strdata_enc_state_s { @@ -366,7 +366,7 @@ static __device__ uint32_t IntegerRLE( using block_reduce = cub::BlockReduce; uint8_t* dst = s->stream.data_ptrs[cid] + s->strm_pos[cid]; uint32_t out_cnt = 0; - __shared__ volatile uint64_t block_vmin; + __shared__ uint64_t block_vmin; while (numvals > 0) { T v0 = (t < numvals) ? 
inbuf[(inpos + t) & inmask] : 0; @@ -615,7 +615,7 @@ static __device__ void StoreStringData(uint8_t* dst, * @param[in] t thread id */ template -inline __device__ void lengths_to_positions(volatile T* vals, uint32_t numvals, unsigned int t) +inline __device__ void lengths_to_positions(T* vals, uint32_t numvals, unsigned int t) { for (uint32_t n = 1; n < numvals; n <<= 1) { __syncthreads(); @@ -1143,7 +1143,7 @@ __global__ void __launch_bounds__(256) uint32_t comp_block_align) { __shared__ __align__(16) StripeStream ss; - __shared__ uint8_t* volatile uncomp_base_g; + __shared__ uint8_t* uncomp_base_g; auto const padded_block_header_size = util::round_up_unsafe(block_header_size, comp_block_align); auto const padded_comp_block_size = util::round_up_unsafe(max_comp_blk_size, comp_block_align); @@ -1196,8 +1196,8 @@ __global__ void __launch_bounds__(1024) uint32_t max_comp_blk_size) { __shared__ __align__(16) StripeStream ss; - __shared__ uint8_t const* volatile comp_src_g; - __shared__ uint32_t volatile comp_len_g; + __shared__ uint8_t const* comp_src_g; + __shared__ uint32_t comp_len_g; auto const stripe_id = blockIdx.x; auto const stream_id = blockIdx.y; diff --git a/cpp/src/io/orc/stripe_init.cu b/cpp/src/io/orc/stripe_init.cu index 8eeca504121..b31a4a081d1 100644 --- a/cpp/src/io/orc/stripe_init.cu +++ b/cpp/src/io/orc/stripe_init.cu @@ -499,7 +499,7 @@ __global__ void __launch_bounds__(128, 8) gpuParseRowGroupIndex(RowGroup* row_gr : row_groups[(s->rowgroup_start + i) * num_columns + blockIdx.x].start_row; for (int j = t4; j < rowgroup_size4; j += 4) { ((uint32_t*)&row_groups[(s->rowgroup_start + i) * num_columns + blockIdx.x])[j] = - ((volatile uint32_t*)&s->rowgroups[i])[j]; + ((uint32_t*)&s->rowgroups[i])[j]; } row_groups[(s->rowgroup_start + i) * num_columns + blockIdx.x].num_rows = num_rows; // Updating in case of struct From d87e181daa67d8fb1a029fc2c09e2f561d1e7234 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 4 Oct 2023 13:25:56 -0700 
Subject: [PATCH 05/12] Expose streams in binaryop APIs (#14187) Contributes to #925 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Nghia Truong (https://github.com/ttnghia) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/14187 --- cpp/include/cudf/binaryop.hpp | 8 ++ cpp/src/binaryop/binaryop.cpp | 12 ++- cpp/src/binaryop/compiled/binary_ops.cu | 6 +- cpp/tests/CMakeLists.txt | 1 + cpp/tests/streams/binaryop_test.cpp | 126 ++++++++++++++++++++++++ 5 files changed, 147 insertions(+), 6 deletions(-) create mode 100644 cpp/tests/streams/binaryop_test.cpp diff --git a/cpp/include/cudf/binaryop.hpp b/cpp/include/cudf/binaryop.hpp index 77d6a4d1e89..9df4b4eb00f 100644 --- a/cpp/include/cudf/binaryop.hpp +++ b/cpp/include/cudf/binaryop.hpp @@ -102,6 +102,7 @@ enum class binary_operator : int32_t { * @param rhs The right operand column * @param op The binary operator * @param output_type The desired data type of the output column + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return Output column of `output_type` type containing the result of * the binary operation @@ -115,6 +116,7 @@ std::unique_ptr binary_operation( column_view const& rhs, binary_operator op, data_type output_type, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -131,6 +133,7 @@ std::unique_ptr binary_operation( * @param rhs The right operand scalar * @param op The binary operator * @param output_type The desired data type of the output column + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return Output column of `output_type` type containing the result of * the binary operation @@ -144,6 
+147,7 @@ std::unique_ptr binary_operation( scalar const& rhs, binary_operator op, data_type output_type, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -158,6 +162,7 @@ std::unique_ptr binary_operation( * @param rhs The right operand column * @param op The binary operator * @param output_type The desired data type of the output column + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return Output column of `output_type` type containing the result of * the binary operation @@ -172,6 +177,7 @@ std::unique_ptr binary_operation( column_view const& rhs, binary_operator op, data_type output_type, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -189,6 +195,7 @@ std::unique_ptr binary_operation( * @param output_type The desired data type of the output column. 
It is assumed * that output_type is compatible with the output data type * of the function in the PTX code + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return Output column of `output_type` type containing the result of * the binary operation @@ -201,6 +208,7 @@ std::unique_ptr binary_operation( column_view const& rhs, std::string const& ptx, data_type output_type, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp index ef07de8c461..6b413ab2be4 100644 --- a/cpp/src/binaryop/binaryop.cpp +++ b/cpp/src/binaryop/binaryop.cpp @@ -405,38 +405,42 @@ std::unique_ptr binary_operation(scalar const& lhs, column_view const& rhs, binary_operator op, data_type output_type, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::binary_operation(lhs, rhs, op, output_type, cudf::get_default_stream(), mr); + return detail::binary_operation(lhs, rhs, op, output_type, stream, mr); } std::unique_ptr binary_operation(column_view const& lhs, scalar const& rhs, binary_operator op, data_type output_type, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::binary_operation(lhs, rhs, op, output_type, cudf::get_default_stream(), mr); + return detail::binary_operation(lhs, rhs, op, output_type, stream, mr); } std::unique_ptr binary_operation(column_view const& lhs, column_view const& rhs, binary_operator op, data_type output_type, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::binary_operation(lhs, rhs, op, output_type, cudf::get_default_stream(), mr); + return detail::binary_operation(lhs, rhs, op, output_type, stream, mr); } 
std::unique_ptr binary_operation(column_view const& lhs, column_view const& rhs, std::string const& ptx, data_type output_type, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::binary_operation(lhs, rhs, ptx, output_type, cudf::get_default_stream(), mr); + return detail::binary_operation(lhs, rhs, ptx, output_type, stream, mr); } } // namespace cudf diff --git a/cpp/src/binaryop/compiled/binary_ops.cu b/cpp/src/binaryop/compiled/binary_ops.cu index 1f7f342632a..85ab5c6d6cb 100644 --- a/cpp/src/binaryop/compiled/binary_ops.cu +++ b/cpp/src/binaryop/compiled/binary_ops.cu @@ -47,14 +47,16 @@ namespace { struct scalar_as_column_view { using return_type = typename std::pair>; template ())> - return_type operator()(scalar const& s, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) + return_type operator()(scalar const& s, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource*) { auto& h_scalar_type_view = static_cast&>(const_cast(s)); auto col_v = column_view(s.type(), 1, h_scalar_type_view.data(), reinterpret_cast(s.validity_data()), - !s.is_valid()); + !s.is_valid(stream)); return std::pair{col_v, std::unique_ptr(nullptr)}; } template ())> diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 04939f3cd6d..ac13c121530 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -622,6 +622,7 @@ ConfigureTest( STREAM_IDENTIFICATION_TEST identify_stream_usage/test_default_stream_identification.cu ) +ConfigureTest(STREAM_BINARYOP_TEST streams/binaryop_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_CONCATENATE_TEST streams/concatenate_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_COPYING_TEST streams/copying_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_FILLING_TEST streams/filling_test.cpp STREAM_MODE testing) diff --git a/cpp/tests/streams/binaryop_test.cpp b/cpp/tests/streams/binaryop_test.cpp new file mode 100644 index 00000000000..2520aed0458 --- 
/dev/null +++ b/cpp/tests/streams/binaryop_test.cpp @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include + +#include +#include +#include + +class BinaryopTest : public cudf::test::BaseFixture {}; + +TEST_F(BinaryopTest, ColumnColumn) +{ + cudf::test::fixed_width_column_wrapper lhs{10, 20, 30, 40, 50}; + cudf::test::fixed_width_column_wrapper rhs{15, 25, 35, 45, 55}; + + cudf::binary_operation(lhs, + rhs, + cudf::binary_operator::ADD, + cudf::data_type(cudf::type_to_id()), + cudf::test::get_default_stream()); +} + +TEST_F(BinaryopTest, ColumnScalar) +{ + cudf::test::fixed_width_column_wrapper lhs{10, 20, 30, 40, 50}; + cudf::numeric_scalar rhs{23, true, cudf::test::get_default_stream()}; + + cudf::binary_operation(lhs, + rhs, + cudf::binary_operator::ADD, + cudf::data_type(cudf::type_to_id()), + cudf::test::get_default_stream()); +} + +TEST_F(BinaryopTest, ScalarColumn) +{ + cudf::numeric_scalar lhs{42, true, cudf::test::get_default_stream()}; + cudf::test::fixed_width_column_wrapper rhs{15, 25, 35, 45, 55}; + + cudf::binary_operation(lhs, + rhs, + cudf::binary_operator::ADD, + cudf::data_type(cudf::type_to_id()), + cudf::test::get_default_stream()); +} + +class BinaryopPTXTest : public BinaryopTest { + protected: + void SetUp() override + { + if (!can_do_runtime_jit()) { GTEST_SKIP() << "Skipping tests that require 11.5 runtime"; } + } +}; + 
+TEST_F(BinaryopPTXTest, ColumnColumnPTX) +{ + cudf::test::fixed_width_column_wrapper lhs{10, 20, 30, 40, 50}; + cudf::test::fixed_width_column_wrapper rhs{15, 25, 35, 45, 55}; + + // c = a*a*a + b*b + char const* ptx = + R"***( +// +// Generated by NVIDIA NVVM Compiler +// +// Compiler Build ID: CL-24817639 +// Cuda compilation tools, release 10.0, V10.0.130 +// Based on LLVM 3.4svn +// + +.version 6.3 +.target sm_70 +.address_size 64 + + // .globl _ZN8__main__7add$241Eix +.common .global .align 8 .u64 _ZN08NumbaEnv8__main__7add$241Eix; +.common .global .align 8 .u64 _ZN08NumbaEnv5numba7targets7numbers14int_power_impl12$3clocals$3e13int_power$242Exx; + +.visible .func (.param .b32 func_retval0) _ZN8__main__7add$241Eix( + .param .b64 _ZN8__main__7add$241Eix_param_0, + .param .b32 _ZN8__main__7add$241Eix_param_1, + .param .b64 _ZN8__main__7add$241Eix_param_2 +) +{ + .reg .b32 %r<3>; + .reg .b64 %rd<8>; + + + ld.param.u64 %rd1, [_ZN8__main__7add$241Eix_param_0]; + ld.param.u32 %r1, [_ZN8__main__7add$241Eix_param_1]; + ld.param.u64 %rd2, [_ZN8__main__7add$241Eix_param_2]; + cvt.s64.s32 %rd3, %r1; + mul.wide.s32 %rd4, %r1, %r1; + mul.lo.s64 %rd5, %rd4, %rd3; + mul.lo.s64 %rd6, %rd2, %rd2; + add.s64 %rd7, %rd6, %rd5; + st.u64 [%rd1], %rd7; + mov.u32 %r2, 0; + st.param.b32 [func_retval0+0], %r2; + ret; +} + +)***"; + + cudf::binary_operation( + lhs, rhs, ptx, cudf::data_type(cudf::type_to_id()), cudf::test::get_default_stream()); + cudf::binary_operation(lhs, rhs, ptx, cudf::data_type(cudf::type_to_id())); +} From b120f7e73e882b4eaa6b5a2cb91aeed20bf1198d Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 4 Oct 2023 14:23:24 -0700 Subject: [PATCH 06/12] Improve `contains_column` by invoking `contains_table` (#14238) Part of #https://github.com/rapidsai/cudf/issues/12261 This PR simplifies the `contains_column` implementation by invoking `contains_table` and gets rid of the use of the cudf `unordered_multiset`. 
It also removes the `unordered_multiset` header file from libcudf. Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14238 --- cpp/src/hash/unordered_multiset.cuh | 159 ---------------------------- cpp/src/search/contains_column.cu | 67 +----------- 2 files changed, 1 insertion(+), 225 deletions(-) delete mode 100644 cpp/src/hash/unordered_multiset.cuh diff --git a/cpp/src/hash/unordered_multiset.cuh b/cpp/src/hash/unordered_multiset.cuh deleted file mode 100644 index 183042fc0f4..00000000000 --- a/cpp/src/hash/unordered_multiset.cuh +++ /dev/null @@ -1,159 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include - -#include - -namespace cudf { -namespace detail { -/* - * Device view of the unordered multiset - */ -template , - typename Equality = equal_to> -class unordered_multiset_device_view { - public: - unordered_multiset_device_view(size_type hash_size, - size_type const* hash_begin, - Element const* hash_data) - : hash_size{hash_size}, hash_begin{hash_begin}, hash_data{hash_data}, hasher(), equals() - { - } - - bool __device__ contains(Element e) const - { - size_type loc = hasher(e) % (2 * hash_size); - - for (size_type i = hash_begin[loc]; i < hash_begin[loc + 1]; ++i) { - if (equals(hash_data[i], e)) return true; - } - - return false; - } - - private: - Hasher hasher; - Equality equals; - size_type hash_size; - size_type const* hash_begin; - Element const* hash_data; -}; - -/* - * Fixed size set on a device. - */ -template , - typename Equality = equal_to> -class unordered_multiset { - public: - /** - * @brief Factory to construct a new unordered_multiset - */ - static unordered_multiset create(column_view const& col, rmm::cuda_stream_view stream) - { - auto d_column = column_device_view::create(col, stream); - auto d_col = *d_column; - - auto hash_bins_start = cudf::detail::make_zeroed_device_uvector_async( - 2 * d_col.size() + 1, stream, rmm::mr::get_current_device_resource()); - auto hash_bins_end = cudf::detail::make_zeroed_device_uvector_async( - 2 * d_col.size() + 1, stream, rmm::mr::get_current_device_resource()); - auto hash_data = rmm::device_uvector(d_col.size(), stream); - - Hasher hasher; - size_type* d_hash_bins_start = hash_bins_start.data(); - size_type* d_hash_bins_end = hash_bins_end.data(); - Element* d_hash_data = hash_data.data(); - - thrust::for_each( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(col.size()), - [d_hash_bins_start, d_col, hasher] 
__device__(size_t idx) { - if (!d_col.is_null(idx)) { - Element e = d_col.element(idx); - size_type tmp = hasher(e) % (2 * d_col.size()); - cuda::atomic_ref ref{*(d_hash_bins_start + tmp)}; - ref.fetch_add(1, cuda::std::memory_order_relaxed); - } - }); - - thrust::exclusive_scan(rmm::exec_policy(stream), - hash_bins_start.begin(), - hash_bins_start.end(), - hash_bins_end.begin()); - - thrust::copy(rmm::exec_policy(stream), - hash_bins_end.begin(), - hash_bins_end.end(), - hash_bins_start.begin()); - - thrust::for_each( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(col.size()), - [d_hash_bins_end, d_hash_data, d_col, hasher] __device__(size_t idx) { - if (!d_col.is_null(idx)) { - Element e = d_col.element(idx); - size_type tmp = hasher(e) % (2 * d_col.size()); - cuda::atomic_ref ref{*(d_hash_bins_end + tmp)}; - size_type offset = ref.fetch_add(1, cuda::std::memory_order_relaxed); - d_hash_data[offset] = e; - } - }); - - return unordered_multiset(d_col.size(), std::move(hash_bins_start), std::move(hash_data)); - } - - unordered_multiset_device_view to_device() const - { - return unordered_multiset_device_view( - size, hash_bins.data(), hash_data.data()); - } - - private: - unordered_multiset(size_type size, - rmm::device_uvector&& hash_bins, - rmm::device_uvector&& hash_data) - : size{size}, hash_bins{std::move(hash_bins)}, hash_data{std::move(hash_data)} - { - } - - size_type size; - rmm::device_uvector hash_bins; - rmm::device_uvector hash_data; -}; - -} // namespace detail -} // namespace cudf diff --git a/cpp/src/search/contains_column.cu b/cpp/src/search/contains_column.cu index 4363bd212fe..85971647434 100644 --- a/cpp/src/search/contains_column.cu +++ b/cpp/src/search/contains_column.cu @@ -14,23 +14,14 @@ * limitations under the License. 
*/ -#include - -#include #include #include #include #include #include #include -#include #include -#include - -#include -#include -#include namespace cudf { namespace detail { @@ -38,61 +29,7 @@ namespace detail { namespace { struct contains_column_dispatch { - template - struct contains_fn { - bool __device__ operator()(size_type const idx) const - { - if (needles_have_nulls && needles.is_null_nocheck(idx)) { - // Exit early. The value doesn't matter, and will be masked as a null element. - return true; - } - - return haystack.contains(needles.template element(idx)); - } - - Haystack const haystack; - column_device_view const needles; - bool const needles_have_nulls; - }; - - template ())> - std::unique_ptr operator()(column_view const& haystack, - column_view const& needles, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const - { - auto result = make_numeric_column(data_type{type_to_id()}, - needles.size(), - copy_bitmask(needles, stream, mr), - needles.null_count(), - stream, - mr); - if (needles.is_empty()) { return result; } - - auto const out_begin = result->mutable_view().template begin(); - if (haystack.is_empty()) { - thrust::uninitialized_fill( - rmm::exec_policy(stream), out_begin, out_begin + needles.size(), false); - return result; - } - - auto const haystack_set = cudf::detail::unordered_multiset::create(haystack, stream); - auto const haystack_set_dv = haystack_set.to_device(); - auto const needles_cdv_ptr = column_device_view::create(needles, stream); - - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(needles.size()), - out_begin, - contains_fn{ - haystack_set_dv, *needles_cdv_ptr, needles.has_nulls()}); - - result->set_null_count(needles.null_count()); - - return result; - } - - template ())> + template std::unique_ptr operator()(column_view const& haystack, column_view const& needles, rmm::cuda_stream_view stream, @@ -144,8 +81,6 @@ std::unique_ptr 
contains(column_view const& haystack, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - CUDF_EXPECTS(haystack.type() == needles.type(), "DTYPE mismatch"); - return cudf::type_dispatcher( haystack.type(), contains_column_dispatch{}, haystack, needles, stream, mr); } From 5d311ea76ddc8bdbb357b6afdf64dfce6ece39a7 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 5 Oct 2023 13:21:31 -0400 Subject: [PATCH 07/12] Fix strings replace for adjacent, identical multi-byte UTF-8 character targets (#14235) Fixes bug that can occur when replacing all occurrences in a string using a multi-byte UTF-8 target when the target matches sequentially in the same string -- some characters were missed. Specialized gtest is also added. Found while working on #13891 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/14235 --- cpp/src/strings/replace/replace.cu | 2 +- cpp/tests/strings/replace_tests.cpp | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/cpp/src/strings/replace/replace.cu b/cpp/src/strings/replace/replace.cu index a622d1a742d..acc1502f4d6 100644 --- a/cpp/src/strings/replace/replace.cu +++ b/cpp/src/strings/replace/replace.cu @@ -97,7 +97,7 @@ struct replace_row_parallel_fn { } else { bytes += d_repl.size_bytes() - d_target.size_bytes(); } - position = d_str.find(d_target, position + d_target.size_bytes()); + position = d_str.find(d_target, position + d_target.length()); --max_n; } if (out_ptr) // copy whats left (or right depending on your point of view) diff --git a/cpp/tests/strings/replace_tests.cpp b/cpp/tests/strings/replace_tests.cpp index f143983aded..f04bb832f09 100644 --- a/cpp/tests/strings/replace_tests.cpp +++ b/cpp/tests/strings/replace_tests.cpp @@ -246,6 +246,28 @@ TEST_F(StringsReplaceTest, ReplaceEndOfString) 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } +TEST_F(StringsReplaceTest, ReplaceAdjacentMultiByteTarget) +{ + auto input = cudf::test::strings_column_wrapper({"ééééééé", "eéeéeée", "eeeeeee"}); + auto strings_view = cudf::strings_column_view(input); + // replace all occurrences of 'é' with 'e' + cudf::test::strings_column_wrapper expected({"eeeeeee", "eeeeeee", "eeeeeee"}); + + auto stream = cudf::get_default_stream(); + auto mr = rmm::mr::get_current_device_resource(); + + auto target = cudf::string_scalar("é", true, stream); + auto repl = cudf::string_scalar("e", true, stream); + auto results = cudf::strings::replace(strings_view, target, repl); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + results = cudf::strings::detail::replace( + strings_view, target, repl, -1, stream, mr); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + results = cudf::strings::detail::replace( + strings_view, target, repl, -1, stream, mr); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + TEST_F(StringsReplaceTest, ReplaceSlice) { std::vector h_strings{"Héllo", "thesé", nullptr, "ARE THE", "tést strings", ""}; From 04e2cd6ff4d525390d4a416651cefa16e11c2a50 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Fri, 6 Oct 2023 09:33:16 -0400 Subject: [PATCH 08/12] cudf::detail::pinned_allocator doesn't throw from `deallocate` (#14251) Fixes #14165 The deallocate function is called by the `pinned_host_vector`. Throwing from destructors is bad since they can't be caught, and generally get converted into runtime sig aborts. 
Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - David Wendt (https://github.com/davidwendt) - Divye Gala (https://github.com/divyegala) - Mike Wilson (https://github.com/hyperbolic2346) URL: https://github.com/rapidsai/cudf/pull/14251 --- cpp/include/cudf/detail/utilities/pinned_host_vector.hpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cpp/include/cudf/detail/utilities/pinned_host_vector.hpp b/cpp/include/cudf/detail/utilities/pinned_host_vector.hpp index 9e2b85ea129..eee974c8399 100644 --- a/cpp/include/cudf/detail/utilities/pinned_host_vector.hpp +++ b/cpp/include/cudf/detail/utilities/pinned_host_vector.hpp @@ -169,7 +169,12 @@ class pinned_allocator { * It is the responsibility of the caller to destroy * the objects stored at \p p. */ - __host__ inline void deallocate(pointer p, size_type /*cnt*/) { CUDF_CUDA_TRY(cudaFreeHost(p)); } + __host__ inline void deallocate(pointer p, size_type /*cnt*/) + { + auto dealloc_worked = cudaFreeHost(p); + (void)dealloc_worked; + assert(dealloc_worked == cudaSuccess); + } /** * @brief This method returns the maximum size of the \c cnt parameter From fc3694730334971c6c7bd916bf36b71302cfcd42 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Fri, 6 Oct 2023 14:03:32 -0400 Subject: [PATCH 09/12] Fixing parquet list of struct interpretation (#13715) This change alters how we interpret non-annotated data in a parquet file. Most modern parquet writers would produce something like: ``` message spark_schema { required int32 id; optional group phoneNumbers (LIST) { repeated group phone { required int64 number; optional binary kind (STRING); } } } ``` But the list annotation isn't required. If it didn't exist, we would incorrectly interpret this schema as a struct of struct and not a list of struct. This change alters the code to look at the child and see if it is repeated. If it is, this indicates a list. 
closes #13664 Authors: - Mike Wilson (https://github.com/hyperbolic2346) - Vukasin Milovanovic (https://github.com/vuule) - Mark Harris (https://github.com/harrism) Approvers: - Mark Harris (https://github.com/harrism) - Nghia Truong (https://github.com/ttnghia) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/13715 --- cpp/src/io/parquet/page_decode.cuh | 2 +- cpp/src/io/parquet/parquet.hpp | 2 +- cpp/src/io/parquet/reader_impl_helpers.cpp | 86 ++++++++++++++++++++-- cpp/src/io/parquet/reader_impl_helpers.hpp | 1 + cpp/tests/io/parquet_test.cpp | 78 ++++++++++++++++++++ 5 files changed, 162 insertions(+), 7 deletions(-) diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index cdc29197eb3..d70cabdd35f 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -753,7 +753,7 @@ __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value // for nested schemas, it's more complicated. This warp will visit 32 incoming values, // however not all of them will necessarily represent a value at this nesting level. so // the validity bit for thread t might actually represent output value t-6. the correct - // position for thread t's bit is cur_value_count. for cuda 11 we could use + // position for thread t's bit is thread_value_count. for cuda 11 we could use // __reduce_or_sync(), but until then we have to do a warp reduce. WarpReduceOr32(is_valid << thread_value_count); diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index c2affc774c2..1df49262e87 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -206,7 +206,7 @@ struct SchemaElement { { return type == UNDEFINED_TYPE && // this assumption might be a little weak. 
-           ((repetition_type != REPEATED) || (repetition_type == REPEATED && num_children == 2));
+           ((repetition_type != REPEATED) || (repetition_type == REPEATED && num_children > 1));
   }
 };
 
diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp
index fcaa610fbb7..9778cfc47d2 100644
--- a/cpp/src/io/parquet/reader_impl_helpers.cpp
+++ b/cpp/src/io/parquet/reader_impl_helpers.cpp
@@ -175,6 +175,81 @@ type_id to_type_id(SchemaElement const& schema,
   return type_id::EMPTY;
 }
 
+void metadata::sanitize_schema()
+{
+  // Parquet isn't very strict about incoming metadata. Lots of things can and should be inferred.
+  // There are also a lot of rules that simply aren't followed and are expected to be worked around.
+  // This step sanitizes the metadata to something that isn't ambiguous.
+  //
+  // Take, for example, the following schema:
+  //
+  //  required group field_id=-1 user {
+  //    required int32 field_id=-1 id;
+  //    optional group field_id=-1 phoneNumbers {
+  //      repeated group field_id=-1 phone {
+  //        required int64 field_id=-1 number;
+  //        optional binary field_id=-1 kind (String);
+  //      }
+  //    }
+  //  }
+  //
+  // This real-world example has no annotations telling us what is a list or a struct. On the
+  // surface this looks like a column of id's and a column of list<struct<int64, string>>, but this
+  // actually should be interpreted as a struct<list<struct<int64, string>>>. The phoneNumbers field
+  // has to be a struct because it is a group with no repeated tag and we have no annotation. The
+  // repeated group is actually BOTH a struct due to the multiple children and a list due to
+  // repeated.
+  //
+  // This code attempts to make this less messy for the code that follows.
+
+  std::function<void(size_t)> process = [&](size_t schema_idx) -> void {
+    auto& schema_elem = schema[schema_idx];
+    if (schema_idx != 0 && schema_elem.type == UNDEFINED_TYPE) {
+      auto const parent_type = schema[schema_elem.parent_idx].converted_type;
+      if (schema_elem.repetition_type == REPEATED && schema_elem.num_children > 1 &&
+          parent_type != LIST && parent_type != MAP) {
+        // This is a list of structs, so we need to mark this as a list, but also
+        // add a struct child and move this element's children to the struct
+        schema_elem.converted_type  = LIST;
+        schema_elem.repetition_type = OPTIONAL;
+        auto const struct_node_idx  = schema.size();
+
+        SchemaElement struct_elem;
+        struct_elem.name            = "struct_node";
+        struct_elem.repetition_type = REQUIRED;
+        struct_elem.num_children    = schema_elem.num_children;
+        struct_elem.type            = UNDEFINED_TYPE;
+        struct_elem.converted_type  = UNKNOWN;
+
+        // swap children
+        struct_elem.children_idx = std::move(schema_elem.children_idx);
+        schema_elem.children_idx = {struct_node_idx};
+        schema_elem.num_children = 1;
+
+        struct_elem.max_definition_level = schema_elem.max_definition_level;
+        struct_elem.max_repetition_level = schema_elem.max_repetition_level;
+        schema_elem.max_definition_level--;
+        schema_elem.max_repetition_level = schema[schema_elem.parent_idx].max_repetition_level;
+
+        // change parent index on new node and on children
+        struct_elem.parent_idx = schema_idx;
+        for (auto& child_idx : struct_elem.children_idx) {
+          schema[child_idx].parent_idx = struct_node_idx;
+        }
+        // add our struct (may reallocate `schema`; don't touch `schema_elem` afterwards)
+        schema.push_back(struct_elem);
+      }
+    }
+
+    auto const children = schema[schema_idx].children_idx;  // copy: `schema` may reallocate
+    for (auto const child_idx : children) {
+      process(child_idx);
+    }
+  };
+
+  process(0);
+}
+
 metadata::metadata(datasource* source)
 {
   constexpr auto header_len = sizeof(file_header_s);
@@ -195,6 +270,7 @@ metadata::metadata(datasource* source)
   CompactProtocolReader cp(buffer->data(), ender->footer_len);
   CUDF_EXPECTS(cp.read(this), "Cannot parse metadata");
CUDF_EXPECTS(cp.InitSchema(this), "Cannot initialize schema"); + sanitize_schema(); } std::vector aggregate_reader_metadata::metadatas_from_sources( @@ -445,8 +521,10 @@ aggregate_reader_metadata::select_columns(std::optional child_col_name_info, schema_elem.children_idx[0], out_col_array, has_list_parent); } + auto const one_level_list = schema_elem.is_one_level_list(get_schema(schema_elem.parent_idx)); + // if we're at the root, this is a new output column - auto const col_type = schema_elem.is_one_level_list(get_schema(schema_elem.parent_idx)) + auto const col_type = one_level_list ? type_id::LIST : to_type_id(schema_elem, strings_to_categorical, timestamp_type_id); auto const dtype = to_data_type(col_type, schema_elem); @@ -485,7 +563,7 @@ aggregate_reader_metadata::select_columns(std::optional input_column_info{schema_idx, schema_elem.name, schema_elem.max_repetition_level > 0}); // set up child output column for one-level encoding list - if (schema_elem.is_one_level_list(get_schema(schema_elem.parent_idx))) { + if (one_level_list) { // determine the element data type auto const element_type = to_type_id(schema_elem, strings_to_categorical, timestamp_type_id); @@ -506,9 +584,7 @@ aggregate_reader_metadata::select_columns(std::optional std::copy(nesting.cbegin(), nesting.cend(), std::back_inserter(input_col.nesting)); // pop off the extra nesting element. 
- if (schema_elem.is_one_level_list(get_schema(schema_elem.parent_idx))) { - nesting.pop_back(); - } + if (one_level_list) { nesting.pop_back(); } path_is_valid = true; // If we're able to reach leaf then path is valid } diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp index 61e4f94df0f..9ee17f26a10 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.hpp +++ b/cpp/src/io/parquet/reader_impl_helpers.hpp @@ -58,6 +58,7 @@ using namespace cudf::io::parquet; */ struct metadata : public FileMetaData { explicit metadata(datasource* source); + void sanitize_schema(); }; class aggregate_reader_metadata { diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 81e0e12eeb9..73c946a5feb 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -6732,4 +6732,82 @@ TEST_P(ParquetV2Test, CheckEncodings) } } +TEST_F(ParquetReaderTest, RepeatedNoAnnotations) +{ + constexpr unsigned char repeated_bytes[] = { + 0x50, 0x41, 0x52, 0x31, 0x15, 0x04, 0x15, 0x30, 0x15, 0x30, 0x4c, 0x15, 0x0c, 0x15, 0x00, 0x12, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x15, 0x00, 0x15, 0x0a, 0x15, 0x0a, + 0x2c, 0x15, 0x0c, 0x15, 0x10, 0x15, 0x06, 0x15, 0x06, 0x00, 0x00, 0x03, 0x03, 0x88, 0xc6, 0x02, + 0x26, 0x80, 0x01, 0x1c, 0x15, 0x02, 0x19, 0x25, 0x00, 0x10, 0x19, 0x18, 0x02, 0x69, 0x64, 0x15, + 0x00, 0x16, 0x0c, 0x16, 0x78, 0x16, 0x78, 0x26, 0x54, 0x26, 0x08, 0x00, 0x00, 0x15, 0x04, 0x15, + 0x40, 0x15, 0x40, 0x4c, 0x15, 0x08, 0x15, 0x00, 0x12, 0x00, 0x00, 0xe3, 0x0c, 0x23, 0x4b, 0x01, + 0x00, 0x00, 0x00, 0xc7, 0x35, 0x3a, 0x42, 0x00, 0x00, 0x00, 0x00, 0x8e, 0x6b, 0x74, 0x84, 0x00, + 0x00, 0x00, 0x00, 0x55, 0xa1, 0xae, 0xc6, 0x00, 0x00, 0x00, 0x00, 0x15, 0x00, 0x15, 0x22, 0x15, + 0x22, 0x2c, 0x15, 0x10, 0x15, 0x10, 0x15, 0x06, 0x15, 0x06, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x03, 0xc0, 
0x03, 0x00, 0x00, 0x00, 0x03, 0x90, 0xaa, 0x02, 0x03, 0x94, 0x03, 0x26, 0xda, 0x02, + 0x1c, 0x15, 0x04, 0x19, 0x25, 0x00, 0x10, 0x19, 0x38, 0x0c, 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x4e, + 0x75, 0x6d, 0x62, 0x65, 0x72, 0x73, 0x05, 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x06, 0x6e, 0x75, 0x6d, + 0x62, 0x65, 0x72, 0x15, 0x00, 0x16, 0x10, 0x16, 0xa0, 0x01, 0x16, 0xa0, 0x01, 0x26, 0x96, 0x02, + 0x26, 0xba, 0x01, 0x00, 0x00, 0x15, 0x04, 0x15, 0x24, 0x15, 0x24, 0x4c, 0x15, 0x04, 0x15, 0x00, + 0x12, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x68, 0x6f, 0x6d, 0x65, 0x06, 0x00, 0x00, 0x00, 0x6d, + 0x6f, 0x62, 0x69, 0x6c, 0x65, 0x15, 0x00, 0x15, 0x20, 0x15, 0x20, 0x2c, 0x15, 0x10, 0x15, 0x10, + 0x15, 0x06, 0x15, 0x06, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0xc0, 0x03, 0x00, 0x00, 0x00, + 0x03, 0x90, 0xef, 0x01, 0x03, 0x04, 0x26, 0xcc, 0x04, 0x1c, 0x15, 0x0c, 0x19, 0x25, 0x00, 0x10, + 0x19, 0x38, 0x0c, 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x4e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x73, 0x05, + 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x04, 0x6b, 0x69, 0x6e, 0x64, 0x15, 0x00, 0x16, 0x10, 0x16, 0x82, + 0x01, 0x16, 0x82, 0x01, 0x26, 0x8a, 0x04, 0x26, 0xca, 0x03, 0x00, 0x00, 0x15, 0x02, 0x19, 0x6c, + 0x48, 0x04, 0x75, 0x73, 0x65, 0x72, 0x15, 0x04, 0x00, 0x15, 0x02, 0x25, 0x00, 0x18, 0x02, 0x69, + 0x64, 0x00, 0x35, 0x02, 0x18, 0x0c, 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x4e, 0x75, 0x6d, 0x62, 0x65, + 0x72, 0x73, 0x15, 0x02, 0x00, 0x35, 0x04, 0x18, 0x05, 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x15, 0x04, + 0x00, 0x15, 0x04, 0x25, 0x00, 0x18, 0x06, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x00, 0x15, 0x0c, + 0x25, 0x02, 0x18, 0x04, 0x6b, 0x69, 0x6e, 0x64, 0x25, 0x00, 0x00, 0x16, 0x00, 0x19, 0x1c, 0x19, + 0x3c, 0x26, 0x80, 0x01, 0x1c, 0x15, 0x02, 0x19, 0x25, 0x00, 0x10, 0x19, 0x18, 0x02, 0x69, 0x64, + 0x15, 0x00, 0x16, 0x0c, 0x16, 0x78, 0x16, 0x78, 0x26, 0x54, 0x26, 0x08, 0x00, 0x00, 0x26, 0xda, + 0x02, 0x1c, 0x15, 0x04, 0x19, 0x25, 0x00, 0x10, 0x19, 0x38, 0x0c, 0x70, 0x68, 0x6f, 0x6e, 0x65, + 0x4e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x73, 0x05, 
0x70, 0x68, 0x6f, 0x6e, 0x65, 0x06, 0x6e, 0x75, + 0x6d, 0x62, 0x65, 0x72, 0x15, 0x00, 0x16, 0x10, 0x16, 0xa0, 0x01, 0x16, 0xa0, 0x01, 0x26, 0x96, + 0x02, 0x26, 0xba, 0x01, 0x00, 0x00, 0x26, 0xcc, 0x04, 0x1c, 0x15, 0x0c, 0x19, 0x25, 0x00, 0x10, + 0x19, 0x38, 0x0c, 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x4e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x73, 0x05, + 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x04, 0x6b, 0x69, 0x6e, 0x64, 0x15, 0x00, 0x16, 0x10, 0x16, 0x82, + 0x01, 0x16, 0x82, 0x01, 0x26, 0x8a, 0x04, 0x26, 0xca, 0x03, 0x00, 0x00, 0x16, 0x9a, 0x03, 0x16, + 0x0c, 0x00, 0x28, 0x49, 0x70, 0x61, 0x72, 0x71, 0x75, 0x65, 0x74, 0x2d, 0x72, 0x73, 0x20, 0x76, + 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x20, 0x30, 0x2e, 0x33, 0x2e, 0x30, 0x20, 0x28, 0x62, 0x75, + 0x69, 0x6c, 0x64, 0x20, 0x62, 0x34, 0x35, 0x63, 0x65, 0x37, 0x63, 0x62, 0x61, 0x32, 0x31, 0x39, + 0x39, 0x66, 0x32, 0x32, 0x64, 0x39, 0x33, 0x32, 0x36, 0x39, 0x63, 0x31, 0x35, 0x30, 0x64, 0x38, + 0x61, 0x38, 0x33, 0x39, 0x31, 0x36, 0x63, 0x36, 0x39, 0x62, 0x35, 0x65, 0x29, 0x00, 0x32, 0x01, + 0x00, 0x00, 0x50, 0x41, 0x52, 0x31}; + + auto read_opts = cudf::io::parquet_reader_options::builder( + cudf::io::source_info{reinterpret_cast(repeated_bytes), sizeof(repeated_bytes)}); + auto result = cudf::io::read_parquet(read_opts); + + EXPECT_EQ(result.tbl->view().column(0).size(), 6); + EXPECT_EQ(result.tbl->view().num_columns(), 2); + + column_wrapper col0{1, 2, 3, 4, 5, 6}; + column_wrapper child0{{5555555555l, 1111111111l, 1111111111l, 2222222222l, 3333333333l}}; + cudf::test::strings_column_wrapper child1{{"-", "home", "home", "-", "mobile"}, {0, 1, 1, 0, 1}}; + auto struct_col = cudf::test::structs_column_wrapper{{child0, child1}}; + + auto list_offsets_column = + cudf::test::fixed_width_column_wrapper{0, 0, 0, 0, 1, 2, 5}.release(); + auto num_list_rows = list_offsets_column->size() - 1; + + auto mask = cudf::create_null_mask(6, cudf::mask_state::ALL_VALID); + cudf::set_null_mask(static_cast(mask.data()), 0, 2, false); + + auto list_col = 
cudf::make_lists_column( + num_list_rows, std::move(list_offsets_column), struct_col.release(), 2, std::move(mask)); + + std::vector> struct_children; + struct_children.push_back(std::move(list_col)); + + auto outer_struct = + cudf::test::structs_column_wrapper{{std::move(struct_children)}, {0, 0, 1, 1, 1, 1}}; + table_view expected{{col0, outer_struct}}; + + CUDF_TEST_EXPECT_TABLES_EQUAL(result.tbl->view(), expected); +} + CUDF_TEST_PROGRAM_MAIN() From 96664ec7436033f59aa5b9740e6f54aec707e3cf Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 6 Oct 2023 15:09:11 -0700 Subject: [PATCH 10/12] Add pylibcudf.Scalar that interoperates with Arrow scalars (#14133) This PR adds a new Scalar object to pylibcudf that will function as the pylibcudf equivalent of cudf::scalar. Unlike columns, which are typically operated on in the form of views rather than owning types by libcudf, owning scalars are accepted by (const) ref by libcudf APIs and no corresponding view type exists. Therefore, pylibcudf.Scalar differs from pylibcudf.Column by actually owning an instance of the underlying libcudf type (cudf::scalar). Construction of pylibcudf Scalars is expected to be done from an Arrow scalar. This PR relies on #14124 and should not be merged until after that one. 
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/14133 --- python/cudf/cudf/_lib/CMakeLists.txt | 8 +- python/cudf/cudf/_lib/datetime.pyx | 6 +- python/cudf/cudf/_lib/interop.pyx | 95 +------------ python/cudf/cudf/_lib/nvtext/CMakeLists.txt | 8 ++ .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt | 25 +++- python/cudf/cudf/_lib/pylibcudf/__init__.pxd | 5 +- python/cudf/cudf/_lib/pylibcudf/__init__.py | 5 +- python/cudf/cudf/_lib/pylibcudf/interop.pxd | 9 ++ python/cudf/cudf/_lib/pylibcudf/interop.pyx | 23 +++ python/cudf/cudf/_lib/pylibcudf/scalar.pxd | 32 +++++ python/cudf/cudf/_lib/pylibcudf/scalar.pyx | 133 ++++++++++++++++++ python/cudf/cudf/_lib/pylibcudf/table.pxd | 3 + python/cudf/cudf/_lib/pylibcudf/table.pyx | 33 ++++- python/cudf/cudf/_lib/scalar.pxd | 13 +- python/cudf/cudf/_lib/scalar.pyx | 88 ++++++++---- python/cudf/cudf/_lib/strings/CMakeLists.txt | 10 +- .../cudf/_lib/strings/convert/CMakeLists.txt | 10 +- .../cudf/_lib/strings/split/CMakeLists.txt | 10 +- 18 files changed, 378 insertions(+), 138 deletions(-) create mode 100644 python/cudf/cudf/_lib/pylibcudf/interop.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/interop.pyx create mode 100644 python/cudf/cudf/_lib/pylibcudf/scalar.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/scalar.pyx diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index 947659c290a..1b543b94589 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -107,8 +107,12 @@ if(${PYARROW_RESULT}) message(FATAL_ERROR "Error while trying to obtain pyarrow include directory:\n${PYARROW_ERROR}") endif() -set(targets_using_arrow_headers interop avro csv orc json parquet) -foreach(target IN LISTS targets_using_arrow_headers) +# TODO: Due to cudf's scalar.pyx needing to cimport pylibcudf's scalar.pyx (because there are parts +# of cudf Cython 
that need to directly access the c_obj underlying the pylibcudf Scalar) the +# requirement for arrow headers infects all of cudf. That in turn requires including numpy headers. +# These requirements will go away once all scalar-related Cython code is removed from cudf. +foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) + target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}") target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}") endforeach() diff --git a/python/cudf/cudf/_lib/datetime.pyx b/python/cudf/cudf/_lib/datetime.pyx index 81949dbaa20..3d96f59c4d6 100644 --- a/python/cudf/cudf/_lib/datetime.pyx +++ b/python/cudf/cudf/_lib/datetime.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from cudf.core.buffer import acquire_spill_lock @@ -10,6 +10,7 @@ from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.filling cimport calendrical_month_sequence +from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.types cimport size_type from cudf._lib.scalar cimport DeviceScalar @@ -166,10 +167,11 @@ def date_range(DeviceScalar start, size_type n, offset): + offset.kwds.get("months", 0) ) + cdef const scalar* c_start = start.c_value.get() with nogil: c_result = move(calendrical_month_sequence( n, - start.c_value.get()[0], + c_start[0], months )) return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/interop.pyx b/python/cudf/cudf/_lib/interop.pyx index 639754fc54f..8fd2a409d90 100644 --- a/python/cudf/cudf/_lib/interop.pyx +++ b/python/cudf/cudf/_lib/interop.pyx @@ -4,14 +4,7 @@ from cpython cimport pycapsule from libcpp.memory cimport shared_ptr, unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector -from pyarrow.lib cimport ( - CScalar, - CTable, - pyarrow_unwrap_scalar, - pyarrow_unwrap_table, - 
pyarrow_wrap_scalar, - pyarrow_wrap_table, -) +from pyarrow.lib cimport CTable, pyarrow_unwrap_table, pyarrow_wrap_table from cudf._lib.cpp.interop cimport ( DLManagedTensor, @@ -21,22 +14,12 @@ from cudf._lib.cpp.interop cimport ( to_arrow as cpp_to_arrow, to_dlpack as cpp_to_dlpack, ) -from cudf._lib.cpp.scalar.scalar cimport fixed_point_scalar, scalar from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view -from cudf._lib.cpp.types cimport type_id -from cudf._lib.cpp.wrappers.decimals cimport ( - decimal32, - decimal64, - decimal128, - scale_type, -) -from cudf._lib.scalar cimport DeviceScalar from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns from cudf.api.types import is_list_dtype, is_struct_dtype from cudf.core.buffer import acquire_spill_lock -from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype def from_dlpack(dlpack_capsule): @@ -199,79 +182,3 @@ def from_arrow(object input_table): c_result = move(cpp_from_arrow(cpp_arrow_table.get()[0])) return columns_from_unique_ptr(move(c_result)) - - -@acquire_spill_lock() -def to_arrow_scalar(DeviceScalar source_scalar): - """Convert a scalar to a PyArrow scalar. - - Parameters - ---------- - source_scalar : the scalar to convert - - Returns - ------- - pyarrow.lib.Scalar - """ - cdef vector[column_metadata] cpp_metadata = gather_metadata( - [("", source_scalar.dtype)] - ) - cdef const scalar* source_scalar_ptr = source_scalar.get_raw_ptr() - - cdef shared_ptr[CScalar] cpp_arrow_scalar - with nogil: - cpp_arrow_scalar = cpp_to_arrow( - source_scalar_ptr[0], cpp_metadata[0] - ) - - return pyarrow_wrap_scalar(cpp_arrow_scalar) - - -@acquire_spill_lock() -def from_arrow_scalar(object input_scalar, output_dtype=None): - """Convert from PyArrow scalar to a cudf scalar. 
- - Parameters - ---------- - input_scalar : PyArrow scalar - output_dtype : output type to cast to, ignored except for decimals - - Returns - ------- - cudf._lib.DeviceScalar - """ - cdef shared_ptr[CScalar] cpp_arrow_scalar = ( - pyarrow_unwrap_scalar(input_scalar) - ) - cdef unique_ptr[scalar] c_result - - with nogil: - c_result = move(cpp_from_arrow(cpp_arrow_scalar.get()[0])) - - cdef type_id ctype = c_result.get().type().id() - if ctype == type_id.DECIMAL128: - if output_dtype is None: - # Decimals must be cast to the cudf dtype of the right width - raise ValueError( - "Decimal scalars must be constructed with a dtype" - ) - - if isinstance(output_dtype, Decimal32Dtype): - c_result.reset( - new fixed_point_scalar[decimal32]( - ( c_result.get()).value(), - scale_type(-input_scalar.type.scale), - c_result.get().is_valid() - ) - ) - elif isinstance(output_dtype, Decimal64Dtype): - c_result.reset( - new fixed_point_scalar[decimal64]( - ( c_result.get()).value(), - scale_type(-input_scalar.type.scale), - c_result.get().is_valid() - ) - ) - # Decimal128Dtype is a no-op, no conversion needed. - - return DeviceScalar.from_unique_ptr(move(c_result), output_dtype) diff --git a/python/cudf/cudf/_lib/nvtext/CMakeLists.txt b/python/cudf/cudf/_lib/nvtext/CMakeLists.txt index 515b9c1d6e4..d4e2392ee04 100644 --- a/python/cudf/cudf/_lib/nvtext/CMakeLists.txt +++ b/python/cudf/cudf/_lib/nvtext/CMakeLists.txt @@ -22,3 +22,11 @@ rapids_cython_create_modules( SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX nvtext_ ASSOCIATED_TARGETS cudf ) +# TODO: Due to cudf's scalar.pyx needing to cimport pylibcudf's scalar.pyx (because there are parts +# of cudf Cython that need to directly access the c_obj underlying the pylibcudf Scalar) the +# requirement for arrow headers infects all of cudf. That in turn requires including numpy headers. +# These requirements will go away once all scalar-related Cython code is removed from cudf. 
+foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) + target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}") + target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}") +endforeach() diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index 0ce42dc43ff..5185b2d4bb5 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -12,10 +12,33 @@ # the License. # ============================================================================= -set(cython_sources column.pyx copying.pyx gpumemoryview.pyx table.pyx types.pyx utils.pyx) +set(cython_sources column.pyx copying.pyx gpumemoryview.pyx interop.pyx scalar.pyx table.pyx + types.pyx utils.pyx +) set(linked_libraries cudf::cudf) rapids_cython_create_modules( CXX SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_ ASSOCIATED_TARGETS cudf ) + +find_package(Python 3.9 REQUIRED COMPONENTS Interpreter) + +execute_process( + COMMAND "${Python_EXECUTABLE}" -c "import pyarrow; print(pyarrow.get_include())" + OUTPUT_VARIABLE PYARROW_INCLUDE_DIR + OUTPUT_STRIP_TRAILING_WHITESPACE +) + +foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) + target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}") +endforeach() + +# TODO: Clean up this include when switching to scikit-build-core. See cudf/_lib/CMakeLists.txt for +# more info +find_package(NumPy REQUIRED) +foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) + target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}") + # Switch to the line below when we switch back to FindPython.cmake in CMake 3.24. 
+ # target_include_directories(${target} PRIVATE "${Python_NumPy_INCLUDE_DIRS}") +endforeach() diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index ba7822b0a54..7a35854392c 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -1,9 +1,10 @@ # Copyright (c) 2023, NVIDIA CORPORATION. # TODO: Verify consistent usage of relative/absolute imports in pylibcudf. -from . cimport copying +from . cimport copying, interop from .column cimport Column from .gpumemoryview cimport gpumemoryview +from .scalar cimport Scalar from .table cimport Table # TODO: cimport type_id once # https://github.com/cython/cython/issues/5609 is resolved @@ -12,7 +13,9 @@ from .types cimport DataType __all__ = [ "Column", "DataType", + "Scalar", "Table", "copying", "gpumemoryview", + "interop", ] diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py index 3edff9a53e8..72b74a57b87 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py @@ -1,16 +1,19 @@ # Copyright (c) 2023, NVIDIA CORPORATION. -from . import copying +from . import copying, interop from .column import Column from .gpumemoryview import gpumemoryview +from .scalar import Scalar from .table import Table from .types import DataType, TypeId __all__ = [ "Column", "DataType", + "Scalar", "Table", "TypeId", "copying", "gpumemoryview", + "interop", ] diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pxd b/python/cudf/cudf/_lib/pylibcudf/interop.pxd new file mode 100644 index 00000000000..3a79e5425d4 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/interop.pxd @@ -0,0 +1,9 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. 
+
+from cudf._lib.cpp.interop cimport column_metadata
+
+
+cdef class ColumnMetadata:
+    cdef public object name
+    cdef public object children_meta
+    cdef column_metadata to_libcudf(self)
diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pyx b/python/cudf/cudf/_lib/pylibcudf/interop.pyx
new file mode 100644
index 00000000000..0cdca275027
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/interop.pyx
@@ -0,0 +1,23 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+
+from cudf._lib.cpp.interop cimport column_metadata
+
+
+cdef class ColumnMetadata:
+    def __init__(self, name):
+        self.name = name
+        self.children_meta = []
+
+    cdef column_metadata to_libcudf(self):
+        """Convert to C++ column_metadata.
+
+        Since this class is mutable and cheap, it is easier to create the C++
+        object on the fly rather than have it directly backing the storage for
+        the Cython class.
+        """
+        cdef column_metadata c_metadata
+        cdef ColumnMetadata child_meta
+        c_metadata.name = self.name.encode()
+        for child_meta in self.children_meta:
+            c_metadata.children_meta.push_back(child_meta.to_libcudf())
+        return c_metadata
diff --git a/python/cudf/cudf/_lib/pylibcudf/scalar.pxd b/python/cudf/cudf/_lib/pylibcudf/scalar.pxd
new file mode 100644
index 00000000000..09d853d832f
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/scalar.pxd
@@ -0,0 +1,32 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+
+from libcpp cimport bool
+from libcpp.memory cimport unique_ptr
+from pyarrow cimport lib as pa
+
+from rmm._lib.memory_resource cimport DeviceMemoryResource
+
+from cudf._lib.cpp.scalar.scalar cimport scalar
+
+from .interop cimport ColumnMetadata
+from .types cimport DataType
+
+
+cdef class Scalar:
+    cdef unique_ptr[scalar] c_obj
+    cdef DataType _data_type
+
+    # Holds a reference to the DeviceMemoryResource used for allocation.
+    # Ensures the MR does not get destroyed before this Scalar. `mr` is
+    # needed for deallocation
+    cdef DeviceMemoryResource mr
+
+    cdef const scalar* get(self) except *
+
+    cpdef DataType type(self)
+    cpdef bool is_valid(self)
+
+    @staticmethod
+    cdef Scalar from_libcudf(unique_ptr[scalar] libcudf_scalar, dtype=*)
+
+    cpdef pa.Scalar to_arrow(self, ColumnMetadata metadata)
diff --git a/python/cudf/cudf/_lib/pylibcudf/scalar.pyx b/python/cudf/cudf/_lib/pylibcudf/scalar.pyx
new file mode 100644
index 00000000000..04f588bd3e6
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/scalar.pyx
@@ -0,0 +1,144 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+
+from cython cimport no_gc_clear
+from cython.operator cimport dereference
+from libcpp.memory cimport shared_ptr, unique_ptr
+from libcpp.utility cimport move
+from pyarrow cimport lib as pa
+
+from rmm._lib.memory_resource cimport get_current_device_resource
+
+from cudf._lib.cpp.interop cimport (
+    column_metadata,
+    from_arrow as cpp_from_arrow,
+    to_arrow as cpp_to_arrow,
+)
+from cudf._lib.cpp.scalar.scalar cimport fixed_point_scalar, scalar
+from cudf._lib.cpp.wrappers.decimals cimport (
+    decimal32,
+    decimal64,
+    decimal128,
+    scale_type,
+)
+
+from .interop cimport ColumnMetadata
+from .types cimport DataType, type_id
+
+
+# The DeviceMemoryResource attribute could be released prematurely
+# by the gc if the Scalar is in a reference cycle. Removing the tp_clear
+# function with the no_gc_clear decoration prevents that. See
+# https://github.com/rapidsai/rmm/pull/931 for details.
+@no_gc_clear
+cdef class Scalar:
+    """A scalar value in device memory."""
+    # Unlike for columns, libcudf does not support scalar views. All APIs that
+    # accept scalar values accept references to the owning object rather than a
+    # special view type. As a result, pylibcudf.Scalar has a simpler structure
+    # than pylibcudf.Column because it can be a true wrapper around a libcudf
+    # scalar
+
+    def __cinit__(self, *args, **kwargs):
+        self.mr = get_current_device_resource()
+
+    def __init__(self, pa.Scalar value=None):
+        # TODO: This case is not something we really want to
+        # support, but it is here for now to ease the transition of
+        # DeviceScalar.
+        if value is not None:
+            raise ValueError("Scalar should be constructed with a factory")
+
+    @staticmethod
+    def from_arrow(pa.Scalar value, DataType data_type=None):
+        """Create a Scalar from a pyarrow Scalar.
+
+        ``data_type`` must be given when the converted scalar is DECIMAL128
+        (it selects the decimal width) and must be omitted otherwise.
+        """
+        # Allow passing a dtype, but only for the purpose of decimals for now
+
+        cdef shared_ptr[pa.CScalar] cscalar = (
+            pa.pyarrow_unwrap_scalar(value)
+        )
+        cdef unique_ptr[scalar] c_result
+
+        with nogil:
+            c_result = move(cpp_from_arrow(cscalar.get()[0]))
+
+        cdef Scalar s = Scalar.from_libcudf(move(c_result))
+
+        if s.type().id() != type_id.DECIMAL128:
+            if data_type is not None:
+                raise ValueError(
+                    "dtype may not be passed for non-decimal types"
+                )
+            return s
+
+        if data_type is None:
+            raise ValueError(
+                "Decimal scalars must be constructed with a dtype"
+            )
+
+        cdef type_id tid = data_type.id()
+
+        if tid == type_id.DECIMAL32:
+            s.c_obj.reset(
+                new fixed_point_scalar[decimal32](
+                    (<fixed_point_scalar[decimal128]*> s.c_obj.get()).value(),
+                    scale_type(-value.type.scale),
+                    s.c_obj.get().is_valid()
+                )
+            )
+        elif tid == type_id.DECIMAL64:
+            s.c_obj.reset(
+                new fixed_point_scalar[decimal64](
+                    (<fixed_point_scalar[decimal128]*> s.c_obj.get()).value(),
+                    scale_type(-value.type.scale),
+                    s.c_obj.get().is_valid()
+                )
+            )
+        elif tid != type_id.DECIMAL128:
+            raise ValueError(
+                "Decimal scalars may only be cast to decimals"
+            )
+
+        # The DECIMAL32/DECIMAL64 branches replace the underlying libcudf
+        # scalar, so refresh the cached type to match what c_obj now holds.
+        s._data_type = DataType.from_libcudf(s.get().type())
+
+        return s
+
+    cpdef pa.Scalar to_arrow(self, ColumnMetadata metadata):
+        """Convert this scalar to a pyarrow Scalar using ``metadata``."""
+        cdef shared_ptr[pa.CScalar] c_result
+        cdef column_metadata c_metadata = metadata.to_libcudf()
+
+        with nogil:
+            c_result = move(cpp_to_arrow(dereference(self.c_obj.get()), c_metadata))
+
+        return pa.pyarrow_wrap_scalar(c_result)
+
+    cdef const scalar* get(self) except *:
+        """Return a non-owning pointer to the underlying libcudf scalar."""
+        return self.c_obj.get()
+
+    cpdef DataType type(self):
+        """The type of data in the scalar."""
+        return self._data_type
+
+    cpdef bool is_valid(self):
+        """True if the scalar is valid, false if not"""
+        return self.get().is_valid()
+
+    @staticmethod
+    cdef Scalar from_libcudf(unique_ptr[scalar] libcudf_scalar, dtype=None):
+        """Construct a Scalar object from a libcudf scalar.
+
+        This method is for pylibcudf's functions to use to ingest outputs of
+        calling libcudf algorithms, and should generally not be needed by users
+        (even direct pylibcudf Cython users).
+        """
+        cdef Scalar s = Scalar.__new__(Scalar)
+        s.c_obj.swap(libcudf_scalar)
+        s._data_type = DataType.from_libcudf(s.get().type())
+        return s
diff --git a/python/cudf/cudf/_lib/pylibcudf/table.pxd b/python/cudf/cudf/_lib/pylibcudf/table.pxd
index 95f197b13eb..a9e2874232a 100644
--- a/python/cudf/cudf/_lib/pylibcudf/table.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/table.pxd
@@ -1,6 +1,7 @@
 # Copyright (c) 2023, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
+from pyarrow cimport lib as pa
 
 from cudf._lib.cpp.table.table cimport table
 from cudf._lib.cpp.table.table_view cimport table_view
@@ -16,3 +17,5 @@ cdef class Table:
     cdef Table from_libcudf(unique_ptr[table] libcudf_tbl)
 
     cpdef list columns(self)
+
+    cpdef pa.Table to_arrow(self, list metadata)
diff --git a/python/cudf/cudf/_lib/pylibcudf/table.pyx b/python/cudf/cudf/_lib/pylibcudf/table.pyx
index 720f9815bd6..c41eb82e4a1 100644
--- a/python/cudf/cudf/_lib/pylibcudf/table.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/table.pyx
@@ -1,15 +1,22 @@
 # Copyright (c) 2023, NVIDIA CORPORATION.
from cython.operator cimport dereference -from libcpp.memory cimport unique_ptr +from libcpp.memory cimport shared_ptr, unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector +from pyarrow cimport lib as pa from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.interop cimport ( + column_metadata, + from_arrow as cpp_from_arrow, + to_arrow as cpp_to_arrow, +) from cudf._lib.cpp.table.table cimport table from .column cimport Column +from .interop cimport ColumnMetadata cdef class Table: @@ -60,3 +67,27 @@ cdef class Table: cpdef list columns(self): return self._columns + + @staticmethod + def from_arrow(pa.Table pyarrow_table): + cdef shared_ptr[pa.CTable] ctable = ( + pa.pyarrow_unwrap_table(pyarrow_table) + ) + cdef unique_ptr[table] c_result + + with nogil: + c_result = move(cpp_from_arrow(ctable.get()[0])) + + return Table.from_libcudf(move(c_result)) + + cpdef pa.Table to_arrow(self, list metadata): + cdef shared_ptr[pa.CTable] c_result + cdef vector[column_metadata] c_metadata + cdef ColumnMetadata meta + for meta in metadata: + c_metadata.push_back(meta.to_libcudf()) + + with nogil: + c_result = move(cpp_to_arrow(self.view(), c_metadata)) + + return pa.pyarrow_wrap_table(c_result) diff --git a/python/cudf/cudf/_lib/scalar.pxd b/python/cudf/cudf/_lib/scalar.pxd index 1deed60d67d..77733f59c3d 100644 --- a/python/cudf/cudf/_lib/scalar.pxd +++ b/python/cudf/cudf/_lib/scalar.pxd @@ -1,20 +1,19 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.memory cimport unique_ptr from rmm._lib.memory_resource cimport DeviceMemoryResource +# TODO: Would like to remove this cimport, but it will require some more work +# to excise all C code in scalar.pyx that relies on using the C API of the +# pylibcudf Scalar underlying the DeviceScalar. 
+from cudf._lib cimport pylibcudf from cudf._lib.cpp.scalar.scalar cimport scalar cdef class DeviceScalar: - cdef unique_ptr[scalar] c_value - - # Holds a reference to the DeviceMemoryResource used for allocation. - # Ensures the MR does not get destroyed before this DeviceBuffer. `mr` is - # needed for deallocation - cdef DeviceMemoryResource mr + cdef pylibcudf.Scalar c_value cdef object _dtype diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 5ab286c5701..0b64c75f7b6 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -1,7 +1,5 @@ # Copyright (c) 2020-2023, NVIDIA CORPORATION. -cimport cython - import copy import numpy as np @@ -13,17 +11,17 @@ from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.utility cimport move -from rmm._lib.memory_resource cimport get_current_device_resource - import cudf +from cudf._lib import pylibcudf from cudf._lib.types import LIBCUDF_TO_SUPPORTED_NUMPY_TYPES -from cudf.core.dtypes import ListDtype, StructDtype +from cudf.core.dtypes import ( + ListDtype, + StructDtype, + is_list_dtype, + is_struct_dtype, +) from cudf.core.missing import NA, NaT -from cudf._lib.types cimport dtype_from_column_view, underlying_type_t_type_id - -from cudf._lib.interop import from_arrow_scalar, to_arrow_scalar - cimport cudf._lib.cpp.types as libcudf_types from cudf._lib.cpp.scalar.scalar cimport ( duration_scalar, @@ -44,6 +42,7 @@ from cudf._lib.cpp.wrappers.timestamps cimport ( timestamp_s, timestamp_us, ) +from cudf._lib.types cimport dtype_from_column_view, underlying_type_t_type_id def _replace_nested(obj, check, replacement): @@ -61,15 +60,44 @@ def _replace_nested(obj, check, replacement): _replace_nested(v, check, replacement) -# The DeviceMemoryResource attribute could be released prematurely -# by the gc if the DeviceScalar is in a reference cycle. Removing -# the tp_clear function with the no_gc_clear decoration prevents that. 
-# See https://github.com/rapidsai/rmm/pull/931 for details. -@cython.no_gc_clear +def gather_metadata(dtypes): + """Convert a dict of dtypes to a list of ColumnMetadata objects. + + The metadata is constructed recursively so that nested types are + represented as nested ColumnMetadata objects. + + Parameters + ---------- + dtypes : dict + A dict mapping column names to dtypes. + + Returns + ------- + List[ColumnMetadata] + A list of ColumnMetadata objects. + """ + out = [] + for name, dtype in dtypes.items(): + v = pylibcudf.interop.ColumnMetadata(name) + if is_struct_dtype(dtype): + v.children_meta = gather_metadata(dtype.fields) + elif is_list_dtype(dtype): + # Offsets column is unnamed and has no children + v.children_meta.append(pylibcudf.interop.ColumnMetadata("")) + v.children_meta.extend( + gather_metadata({"": dtype.element_type}) + ) + out.append(v) + return out + + cdef class DeviceScalar: + # TODO: I think this should be removable, except that currently the way + # that from_unique_ptr is implemented is probably dereferencing this in an + # invalid state. See what the best way to fix that is. def __cinit__(self, *args, **kwargs): - self.mr = get_current_device_resource() + self.c_value = pylibcudf.Scalar() def __init__(self, value, dtype): """ @@ -85,7 +113,7 @@ cdef class DeviceScalar: dtype : dtype A NumPy dtype. """ - self._dtype = dtype if dtype.kind != 'U' else cudf.dtype('object') + dtype = dtype if dtype.kind != 'U' else cudf.dtype('object') if cudf.utils.utils.is_na_like(value): value = None @@ -108,10 +136,17 @@ cdef class DeviceScalar: pa_scalar = pa.scalar(value, type=pa_type) - # Note: This factory-like behavior in __init__ will be removed when - # migrating to pylibcudf. 
- cdef DeviceScalar obj = from_arrow_scalar(pa_scalar, self._dtype) - self.c_value.swap(obj.c_value) + data_type = None + if isinstance(dtype, cudf.core.dtypes.DecimalDtype): + tid = pylibcudf.TypeId.DECIMAL128 + if isinstance(dtype, cudf.core.dtypes.Decimal32Dtype): + tid = pylibcudf.TypeId.DECIMAL32 + elif isinstance(dtype, cudf.core.dtypes.Decimal64Dtype): + tid = pylibcudf.TypeId.DECIMAL64 + data_type = pylibcudf.DataType(tid, -dtype.scale) + + self.c_value = pylibcudf.Scalar.from_arrow(pa_scalar, data_type) + self._dtype = dtype def _to_host_scalar(self): is_datetime = self.dtype.kind == "M" @@ -119,7 +154,8 @@ cdef class DeviceScalar: null_type = NaT if is_datetime or is_timedelta else NA - ps = to_arrow_scalar(self) + metadata = gather_metadata({"": self.dtype})[0] + ps = self.c_value.to_arrow(metadata) if not ps.is_valid: return null_type @@ -158,13 +194,13 @@ cdef class DeviceScalar: return self._to_host_scalar() cdef const scalar* get_raw_ptr(self) except *: - return self.c_value.get() + return self.c_value.c_obj.get() cpdef bool is_valid(self): """ Returns if the Scalar is valid or not(i.e., ). 
""" - return self.get_raw_ptr()[0].is_valid() + return self.c_value.is_valid() def __repr__(self): if cudf.utils.utils.is_na_like(self.value): @@ -183,7 +219,7 @@ cdef class DeviceScalar: cdef DeviceScalar s = DeviceScalar.__new__(DeviceScalar) cdef libcudf_types.data_type cdtype - s.c_value = move(ptr) + s.c_value = pylibcudf.Scalar.from_libcudf(move(ptr)) cdtype = s.get_raw_ptr()[0].type() if dtype is not None: @@ -310,9 +346,9 @@ def _create_proxy_nat_scalar(dtype): if dtype.char in 'mM': nat = dtype.type('NaT').astype(dtype) if dtype.type == np.datetime64: - _set_datetime64_from_np_scalar(result.c_value, nat, dtype, True) + _set_datetime64_from_np_scalar(result.c_value.c_obj, nat, dtype, True) elif dtype.type == np.timedelta64: - _set_timedelta64_from_np_scalar(result.c_value, nat, dtype, True) + _set_timedelta64_from_np_scalar(result.c_value.c_obj, nat, dtype, True) return result else: raise TypeError('NAT only valid for datetime and timedelta') diff --git a/python/cudf/cudf/_lib/strings/CMakeLists.txt b/python/cudf/cudf/_lib/strings/CMakeLists.txt index a5e87a456cb..fc11f047ab4 100644 --- a/python/cudf/cudf/_lib/strings/CMakeLists.txt +++ b/python/cudf/cudf/_lib/strings/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -40,6 +40,14 @@ rapids_cython_create_modules( SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX strings_ ASSOCIATED_TARGETS cudf ) +# TODO: Due to cudf's scalar.pyx needing to cimport pylibcudf's scalar.pyx (because there are parts +# of cudf Cython that need to directly access the c_obj underlying the pylibcudf Scalar) the +# requirement for arrow headers infects all of cudf. 
That requirement will go away once all +# scalar-related Cython code is removed from cudf. +foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) + target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}") + target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}") +endforeach() add_subdirectory(convert) add_subdirectory(split) diff --git a/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt b/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt index 434f79d3b5f..f55bb1fb780 100644 --- a/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt +++ b/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -22,3 +22,11 @@ rapids_cython_create_modules( SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX strings_ ASSOCIATED_TARGETS cudf ) +# TODO: Due to cudf's scalar.pyx needing to cimport pylibcudf's scalar.pyx (because there are parts +# of cudf Cython that need to directly access the c_obj underlying the pylibcudf Scalar) the +# requirement for arrow headers infects all of cudf. That requirement will go away once all +# scalar-related Cython code is removed from cudf. 
+foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) + target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}") + target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}") +endforeach() diff --git a/python/cudf/cudf/_lib/strings/split/CMakeLists.txt b/python/cudf/cudf/_lib/strings/split/CMakeLists.txt index 59a22c06e85..2f2063482af 100644 --- a/python/cudf/cudf/_lib/strings/split/CMakeLists.txt +++ b/python/cudf/cudf/_lib/strings/split/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -20,3 +20,11 @@ rapids_cython_create_modules( SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX strings_ ASSOCIATED_TARGETS cudf ) +# TODO: Due to cudf's scalar.pyx needing to cimport pylibcudf's scalar.pyx (because there are parts +# of cudf Cython that need to directly access the c_obj underlying the pylibcudf Scalar) the +# requirement for arrow headers infects all of cudf. That requirement will go away once all +# scalar-related Cython code is removed from cudf. +foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) + target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}") + target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}") +endforeach() From e28017cc17d2feb050d2effd4ebafb84600fd607 Mon Sep 17 00:00:00 2001 From: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com> Date: Mon, 9 Oct 2023 10:05:12 -0500 Subject: [PATCH 11/12] Cleanup of namespaces in parquet code. (#14259) Cleans up several issues in the parquet code: - We were using the namespace `cudf::io::detail::parquet`, when `cudf::io::parquet::detail` makes more sense. 
- Converts the `cudf::io::parquet::gpu` namespace to also just use `cudf::io::parquet::detail` - Several detail-style headers and source files were using `cudf::io::parquet` when they should probably have been in the detail namespace. Authors: - https://github.com/nvdbaranec Approvers: - Bradley Dice (https://github.com/bdice) - Yunsong Wang (https://github.com/PointKernel) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/14259 --- cpp/include/cudf/io/detail/parquet.hpp | 8 +- cpp/include/cudf/io/parquet.hpp | 4 +- cpp/src/io/functions.cpp | 4 +- cpp/src/io/parquet/chunk_dict.cu | 19 +- .../io/parquet/compact_protocol_reader.cpp | 8 +- .../io/parquet/compact_protocol_reader.hpp | 9 +- .../io/parquet/compact_protocol_writer.cpp | 8 +- .../io/parquet/compact_protocol_writer.hpp | 8 +- cpp/src/io/parquet/decode_preprocess.cu | 10 +- cpp/src/io/parquet/delta_binary.cuh | 4 +- cpp/src/io/parquet/page_data.cu | 12 +- cpp/src/io/parquet/page_decode.cuh | 4 +- cpp/src/io/parquet/page_delta_decode.cu | 6 +- cpp/src/io/parquet/page_enc.cu | 22 +- cpp/src/io/parquet/page_hdr.cu | 14 +- cpp/src/io/parquet/page_string_decode.cu | 14 +- cpp/src/io/parquet/page_string_utils.cuh | 4 +- cpp/src/io/parquet/parquet.hpp | 9 +- cpp/src/io/parquet/parquet_common.hpp | 9 +- cpp/src/io/parquet/parquet_gpu.cuh | 4 +- cpp/src/io/parquet/parquet_gpu.hpp | 27 +- cpp/src/io/parquet/predicate_pushdown.cpp | 14 +- cpp/src/io/parquet/reader.cpp | 4 +- cpp/src/io/parquet/reader_impl.cpp | 36 +-- cpp/src/io/parquet/reader_impl.hpp | 12 +- cpp/src/io/parquet/reader_impl_helpers.cpp | 121 ++++---- cpp/src/io/parquet/reader_impl_helpers.hpp | 21 +- cpp/src/io/parquet/reader_impl_preprocess.cu | 259 +++++++++--------- cpp/src/io/parquet/rle_stream.cuh | 4 +- cpp/src/io/parquet/writer_impl.cu | 223 ++++++++------- cpp/src/io/parquet/writer_impl.hpp | 28 +- cpp/tests/io/parquet_test.cpp | 207 +++++++------- 32 files changed, 531 insertions(+), 605 deletions(-) 
diff --git a/cpp/include/cudf/io/detail/parquet.hpp b/cpp/include/cudf/io/detail/parquet.hpp index 074f690d2c7..0b8ee9676de 100644 --- a/cpp/include/cudf/io/detail/parquet.hpp +++ b/cpp/include/cudf/io/detail/parquet.hpp @@ -38,7 +38,7 @@ class parquet_reader_options; class parquet_writer_options; class chunked_parquet_writer_options; -namespace detail::parquet { +namespace parquet::detail { /** * @brief Class to read Parquet dataset data into columns. @@ -186,7 +186,7 @@ class writer { */ explicit writer(std::vector> sinks, parquet_writer_options const& options, - single_write_mode mode, + cudf::io::detail::single_write_mode mode, rmm::cuda_stream_view stream); /** @@ -201,7 +201,7 @@ class writer { */ explicit writer(std::vector> sinks, chunked_parquet_writer_options const& options, - single_write_mode mode, + cudf::io::detail::single_write_mode mode, rmm::cuda_stream_view stream); /** @@ -250,5 +250,5 @@ class writer { * metadata. */ parquet_metadata read_parquet_metadata(host_span const> sources); -} // namespace detail::parquet +} // namespace parquet::detail } // namespace cudf::io diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index deaf23d405a..6283099e700 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -499,7 +499,7 @@ class chunked_parquet_reader { [[nodiscard]] table_with_metadata read_chunk() const; private: - std::unique_ptr reader; + std::unique_ptr reader; }; /** @} */ // end of group @@ -1750,7 +1750,7 @@ class parquet_chunked_writer { std::vector const& column_chunks_file_paths = {}); /// Unique pointer to impl writer class - std::unique_ptr writer; + std::unique_ptr writer; }; /** @} */ // end of group diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 392a7850886..726442d752e 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -470,8 +470,8 @@ void orc_chunked_writer::close() writer->close(); } -using namespace cudf::io::detail::parquet; 
-namespace detail_parquet = cudf::io::detail::parquet; +using namespace cudf::io::parquet::detail; +namespace detail_parquet = cudf::io::parquet::detail; table_with_metadata read_parquet(parquet_reader_options const& options, rmm::mr::device_memory_resource* mr) diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index 9ff1869edde..53ff31ab0a7 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -24,10 +24,8 @@ #include -namespace cudf { -namespace io { -namespace parquet { -namespace gpu { +namespace cudf::io::parquet::detail { + namespace { constexpr int DEFAULT_BLOCK_SIZE = 256; } @@ -101,7 +99,7 @@ struct map_find_fn { template __global__ void __launch_bounds__(block_size) - populate_chunk_hash_maps_kernel(cudf::detail::device_2dspan frags) + populate_chunk_hash_maps_kernel(cudf::detail::device_2dspan frags) { auto col_idx = blockIdx.y; auto block_x = blockIdx.x; @@ -226,7 +224,7 @@ __global__ void __launch_bounds__(block_size) template __global__ void __launch_bounds__(block_size) - get_dictionary_indices_kernel(cudf::detail::device_2dspan frags) + get_dictionary_indices_kernel(cudf::detail::device_2dspan frags) { auto col_idx = blockIdx.y; auto block_x = blockIdx.x; @@ -276,7 +274,7 @@ void initialize_chunk_hash_maps(device_span chunks, rmm::cuda_st <<>>(chunks); } -void populate_chunk_hash_maps(cudf::detail::device_2dspan frags, +void populate_chunk_hash_maps(cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream) { dim3 const dim_grid(frags.size().second, frags.size().first); @@ -290,14 +288,11 @@ void collect_map_entries(device_span chunks, rmm::cuda_stream_vi collect_map_entries_kernel<<>>(chunks); } -void get_dictionary_indices(cudf::detail::device_2dspan frags, +void get_dictionary_indices(cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream) { dim3 const dim_grid(frags.size().second, frags.size().first); get_dictionary_indices_kernel <<>>(frags); } -} // namespace gpu 
-} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index 5c7b8ca3f8c..81d1be64a45 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -21,9 +21,7 @@ #include #include -namespace cudf { -namespace io { -namespace parquet { +namespace cudf::io::parquet::detail { /** * @brief Base class for parquet field functors. @@ -870,6 +868,4 @@ int CompactProtocolReader::WalkSchema( } } -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/compact_protocol_reader.hpp b/cpp/src/io/parquet/compact_protocol_reader.hpp index 619815db503..cbb4161b138 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.hpp +++ b/cpp/src/io/parquet/compact_protocol_reader.hpp @@ -25,9 +25,8 @@ #include #include -namespace cudf { -namespace io { -namespace parquet { +namespace cudf::io::parquet::detail { + /** * @brief Class for parsing Parquet's Thrift Compact Protocol encoded metadata * @@ -147,6 +146,4 @@ class CompactProtocolReader { friend class parquet_field_struct_blob; }; -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/compact_protocol_writer.cpp b/cpp/src/io/parquet/compact_protocol_writer.cpp index 60bc8984d81..9adc8767880 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.cpp +++ b/cpp/src/io/parquet/compact_protocol_writer.cpp @@ -16,9 +16,7 @@ #include "compact_protocol_writer.hpp" -namespace cudf { -namespace io { -namespace parquet { +namespace cudf::io::parquet::detail { /** * @brief Parquet CompactProtocolWriter class @@ -391,6 +389,4 @@ inline void CompactProtocolFieldWriter::set_current_field(int const& field) current_field_value = field; } -} // namespace parquet -} // namespace 
io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/compact_protocol_writer.hpp b/cpp/src/io/parquet/compact_protocol_writer.hpp index 26d66527aa5..4849a814b14 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.hpp +++ b/cpp/src/io/parquet/compact_protocol_writer.hpp @@ -25,9 +25,7 @@ #include #include -namespace cudf { -namespace io { -namespace parquet { +namespace cudf::io::parquet::detail { /** * @brief Class for parsing Parquet's Thrift Compact Protocol encoded metadata @@ -115,6 +113,4 @@ class CompactProtocolFieldWriter { inline void set_current_field(int const& field); }; -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/decode_preprocess.cu b/cpp/src/io/parquet/decode_preprocess.cu index 8de3702bc2e..544c93ee616 100644 --- a/cpp/src/io/parquet/decode_preprocess.cu +++ b/cpp/src/io/parquet/decode_preprocess.cu @@ -23,10 +23,7 @@ #include #include -namespace cudf { -namespace io { -namespace parquet { -namespace gpu { +namespace cudf::io::parquet::detail { namespace { @@ -411,7 +408,4 @@ void ComputePageSizes(cudf::detail::hostdevice_vector& pages, } } -} // namespace gpu -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/delta_binary.cuh b/cpp/src/io/parquet/delta_binary.cuh index 2382e4aafdf..a513e6674b4 100644 --- a/cpp/src/io/parquet/delta_binary.cuh +++ b/cpp/src/io/parquet/delta_binary.cuh @@ -18,7 +18,7 @@ #include "page_decode.cuh" -namespace cudf::io::parquet::gpu { +namespace cudf::io::parquet::detail { // DELTA_XXX encoding support // @@ -291,4 +291,4 @@ struct delta_binary_decoder { } }; -} // namespace cudf::io::parquet::gpu +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 230834632dd..cce3659b902 100644 --- a/cpp/src/io/parquet/page_data.cu +++ 
b/cpp/src/io/parquet/page_data.cu @@ -23,10 +23,7 @@ #include #include -namespace cudf { -namespace io { -namespace parquet { -namespace gpu { +namespace cudf::io::parquet::detail { namespace { @@ -624,7 +621,7 @@ uint32_t GetAggregatedDecodeKernelMask(cudf::detail::hostdevice_vector } /** - * @copydoc cudf::io::parquet::gpu::DecodePageData + * @copydoc cudf::io::parquet::detail::DecodePageData */ void __host__ DecodePageData(cudf::detail::hostdevice_vector& pages, cudf::detail::hostdevice_vector const& chunks, @@ -648,7 +645,4 @@ void __host__ DecodePageData(cudf::detail::hostdevice_vector& pages, } } -} // namespace gpu -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index d70cabdd35f..7c866fd8b9e 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -24,7 +24,7 @@ #include #include -namespace cudf::io::parquet::gpu { +namespace cudf::io::parquet::detail { struct page_state_s { constexpr page_state_s() noexcept {} @@ -1384,4 +1384,4 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, return true; } -} // namespace cudf::io::parquet::gpu +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_delta_decode.cu b/cpp/src/io/parquet/page_delta_decode.cu index 2b78dead205..d25684a59f3 100644 --- a/cpp/src/io/parquet/page_delta_decode.cu +++ b/cpp/src/io/parquet/page_delta_decode.cu @@ -23,7 +23,7 @@ #include #include -namespace cudf::io::parquet::gpu { +namespace cudf::io::parquet::detail { namespace { @@ -160,7 +160,7 @@ __global__ void __launch_bounds__(96) } // anonymous namespace /** - * @copydoc cudf::io::parquet::gpu::DecodeDeltaBinary + * @copydoc cudf::io::parquet::detail::DecodeDeltaBinary */ void __host__ DecodeDeltaBinary(cudf::detail::hostdevice_vector& pages, cudf::detail::hostdevice_vector const& chunks, @@ -184,4 +184,4 @@ void __host__ 
DecodeDeltaBinary(cudf::detail::hostdevice_vector& pages } } -} // namespace cudf::io::parquet::gpu +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index fe0dbb85124..78873d5e8ca 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -41,10 +41,7 @@ #include #include -namespace cudf { -namespace io { -namespace parquet { -namespace gpu { +namespace cudf::io::parquet::detail { namespace { @@ -329,7 +326,7 @@ __global__ void __launch_bounds__(128) // blockDim {128,1,1} __global__ void __launch_bounds__(128) gpuInitPages(device_2dspan chunks, - device_span pages, + device_span pages, device_span page_sizes, device_span comp_page_sizes, device_span col_desc, @@ -998,7 +995,7 @@ __device__ auto julian_days_with_time(int64_t v) // blockDim(128, 1, 1) template __global__ void __launch_bounds__(128, 8) - gpuEncodePages(device_span pages, + gpuEncodePages(device_span pages, device_span> comp_in, device_span> comp_out, device_span comp_results, @@ -1988,7 +1985,7 @@ __global__ void __launch_bounds__(128) // blockDim(1024, 1, 1) __global__ void __launch_bounds__(1024) - gpuGatherPages(device_span chunks, device_span pages) + gpuGatherPages(device_span chunks, device_span pages) { __shared__ __align__(8) EncColumnChunk ck_g; __shared__ __align__(8) EncPage page_g; @@ -2265,7 +2262,7 @@ void InitFragmentStatistics(device_span groups, } void InitEncoderPages(device_2dspan chunks, - device_span pages, + device_span pages, device_span page_sizes, device_span comp_page_sizes, device_span col_desc, @@ -2294,7 +2291,7 @@ void InitEncoderPages(device_2dspan chunks, write_v2_headers); } -void EncodePages(device_span pages, +void EncodePages(device_span pages, bool write_v2_headers, device_span> comp_in, device_span> comp_out, @@ -2328,7 +2325,7 @@ void EncodePageHeaders(device_span pages, } void GatherPages(device_span chunks, - device_span pages, + device_span pages, 
rmm::cuda_stream_view stream) { gpuGatherPages<<>>(chunks, pages); @@ -2343,7 +2340,4 @@ void EncodeColumnIndexes(device_span chunks, chunks, column_stats, column_index_truncate_length); } -} // namespace gpu -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index 6f8b2f50443..eae8e05e61e 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -20,10 +20,8 @@ #include -namespace cudf { -namespace io { -namespace parquet { -namespace gpu { +namespace cudf::io::parquet::detail { + // Minimal thrift implementation for parsing page headers // https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md @@ -161,8 +159,7 @@ __device__ void skip_struct_field(byte_stream_s* bs, int field_type) * @param chunk Column chunk the page belongs to * @return `kernel_mask_bits` value for the given page */ -__device__ uint32_t kernel_mask_for_page(gpu::PageInfo const& page, - gpu::ColumnChunkDesc const& chunk) +__device__ uint32_t kernel_mask_for_page(PageInfo const& page, ColumnChunkDesc const& chunk) { if (page.flags & PAGEINFO_FLAGS_DICTIONARY) { return 0; } @@ -528,7 +525,4 @@ void __host__ BuildStringDictionaryIndex(ColumnChunkDesc* chunks, gpuBuildStringDictionaryIndex<<>>(chunks, num_chunks); } -} // namespace gpu -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index d79abe4a6d2..4d79770ec34 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -20,10 +20,7 @@ #include #include -namespace cudf { -namespace io { -namespace parquet { -namespace gpu { +namespace cudf::io::parquet::detail { namespace { @@ -757,7 +754,7 @@ __global__ void __launch_bounds__(decode_block_size) } // anonymous namespace 
/** - * @copydoc cudf::io::parquet::gpu::ComputePageStringSizes + * @copydoc cudf::io::parquet::detail::ComputePageStringSizes */ void ComputePageStringSizes(cudf::detail::hostdevice_vector& pages, cudf::detail::hostdevice_vector const& chunks, @@ -778,7 +775,7 @@ void ComputePageStringSizes(cudf::detail::hostdevice_vector& pages, } /** - * @copydoc cudf::io::parquet::gpu::DecodeStringPageData + * @copydoc cudf::io::parquet::detail::DecodeStringPageData */ void __host__ DecodeStringPageData(cudf::detail::hostdevice_vector& pages, cudf::detail::hostdevice_vector const& chunks, @@ -802,7 +799,4 @@ void __host__ DecodeStringPageData(cudf::detail::hostdevice_vector& pa } } -} // namespace gpu -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_string_utils.cuh b/cpp/src/io/parquet/page_string_utils.cuh index 9395599b3ff..a81d0a64466 100644 --- a/cpp/src/io/parquet/page_string_utils.cuh +++ b/cpp/src/io/parquet/page_string_utils.cuh @@ -18,7 +18,7 @@ #include -namespace cudf::io::parquet::gpu { +namespace cudf::io::parquet::detail { // stole this from cudf/strings/detail/gather.cuh. modified to run on a single string on one warp. // copies from src to dst in 16B chunks per thread. 
@@ -107,4 +107,4 @@ __device__ void block_excl_sum(size_type* arr, size_type length, size_type initi } } -} // namespace cudf::io::parquet::gpu +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index 1df49262e87..c5993d73dec 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -25,9 +25,8 @@ #include #include -namespace cudf { -namespace io { -namespace parquet { +namespace cudf::io::parquet::detail { + constexpr uint32_t parquet_magic = (('P' << 0) | ('A' << 8) | ('R' << 16) | ('1' << 24)); /** @@ -405,6 +404,4 @@ static inline int CountLeadingZeros32(uint32_t value) #endif } -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/parquet_common.hpp b/cpp/src/io/parquet/parquet_common.hpp index 5a1716bb547..50736197eb9 100644 --- a/cpp/src/io/parquet/parquet_common.hpp +++ b/cpp/src/io/parquet/parquet_common.hpp @@ -18,9 +18,8 @@ #include -namespace cudf { -namespace io { -namespace parquet { +namespace cudf::io::parquet::detail { + // Max decimal precisions according to the parquet spec: // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#decimal auto constexpr MAX_DECIMAL32_PRECISION = 9; @@ -156,6 +155,4 @@ enum FieldType { ST_FLD_STRUCT = 12, }; -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/parquet_gpu.cuh b/cpp/src/io/parquet/parquet_gpu.cuh index dc74bee1536..10e12ebb782 100644 --- a/cpp/src/io/parquet/parquet_gpu.cuh +++ b/cpp/src/io/parquet/parquet_gpu.cuh @@ -23,7 +23,7 @@ #include -namespace cudf::io::parquet::gpu { +namespace cudf::io::parquet::detail { auto constexpr KEY_SENTINEL = size_type{-1}; auto constexpr VALUE_SENTINEL = size_type{-1}; @@ -81,4 +81,4 @@ inline size_type __device__ row_to_value_idx(size_type idx, return idx; } -} // namespace 
cudf::io::parquet::gpu +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 51c862b376b..767668cc65e 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -35,7 +35,7 @@ #include -namespace cudf::io::parquet { +namespace cudf::io::parquet::detail { using cudf::io::detail::string_index_pair; @@ -88,8 +88,6 @@ struct input_column_info { auto nesting_depth() const { return nesting.size(); } }; -namespace gpu { - /** * @brief Enums for the flags in the page header */ @@ -347,7 +345,7 @@ struct file_intermediate_data { // all chunks from the selected row groups. We may end up reading these chunks progressively // instead of all at once - std::vector chunks{}; + std::vector chunks{}; // skip_rows/num_rows values for the entire file. these need to be adjusted per-pass because we // may not be visiting every row group that contains these bounds @@ -372,16 +370,16 @@ struct pass_intermediate_data { // rowgroup, chunk and page information for the current pass. 
std::vector row_groups{}; - cudf::detail::hostdevice_vector chunks{}; - cudf::detail::hostdevice_vector pages_info{}; - cudf::detail::hostdevice_vector page_nesting_info{}; - cudf::detail::hostdevice_vector page_nesting_decode_info{}; + cudf::detail::hostdevice_vector chunks{}; + cudf::detail::hostdevice_vector pages_info{}; + cudf::detail::hostdevice_vector page_nesting_info{}; + cudf::detail::hostdevice_vector page_nesting_decode_info{}; rmm::device_uvector page_keys{0, rmm::cuda_stream_default}; rmm::device_uvector page_index{0, rmm::cuda_stream_default}; rmm::device_uvector str_dict_index{0, rmm::cuda_stream_default}; - std::vector output_chunk_read_info; + std::vector output_chunk_read_info; std::size_t current_output_chunk{0}; rmm::device_buffer level_decode_data{}; @@ -739,7 +737,7 @@ void initialize_chunk_hash_maps(device_span chunks, rmm::cuda_st * @param frags Column fragments * @param stream CUDA stream to use */ -void populate_chunk_hash_maps(cudf::detail::device_2dspan frags, +void populate_chunk_hash_maps(cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream); /** @@ -762,7 +760,7 @@ void collect_map_entries(device_span chunks, rmm::cuda_stream_vi * @param frags Column fragments * @param stream CUDA stream to use */ -void get_dictionary_indices(cudf::detail::device_2dspan frags, +void get_dictionary_indices(cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream); /** @@ -781,7 +779,7 @@ void get_dictionary_indices(cudf::detail::device_2dspan * @param[in] stream CUDA stream to use */ void InitEncoderPages(cudf::detail::device_2dspan chunks, - device_span pages, + device_span pages, device_span page_sizes, device_span comp_page_sizes, device_span col_desc, @@ -847,7 +845,7 @@ void EncodePageHeaders(device_span pages, * @param[in] stream CUDA stream to use */ void GatherPages(device_span chunks, - device_span pages, + device_span pages, rmm::cuda_stream_view stream); /** @@ -863,5 +861,4 @@ void EncodeColumnIndexes(device_span chunks, 
int32_t column_index_truncate_length, rmm::cuda_stream_view stream); -} // namespace gpu -} // namespace cudf::io::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp index 805d082c71e..9083be1c2dd 100644 --- a/cpp/src/io/parquet/predicate_pushdown.cpp +++ b/cpp/src/io/parquet/predicate_pushdown.cpp @@ -35,7 +35,7 @@ #include #include -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { namespace { /** @@ -62,13 +62,13 @@ struct stats_caster { // uses storage type as T template () or cudf::is_nested())> - static T convert(uint8_t const* stats_val, size_t stats_size, cudf::io::parquet::Type const type) + static T convert(uint8_t const* stats_val, size_t stats_size, Type const type) { CUDF_FAIL("unsupported type for stats casting"); } template ())> - static T convert(uint8_t const* stats_val, size_t stats_size, cudf::io::parquet::Type const type) + static T convert(uint8_t const* stats_val, size_t stats_size, Type const type) { CUDF_EXPECTS(type == BOOLEAN, "Invalid type and stats combination"); return targetType(*reinterpret_cast(stats_val)); @@ -78,7 +78,7 @@ struct stats_caster { template () and !cudf::is_boolean()) or cudf::is_fixed_point() or cudf::is_chrono())> - static T convert(uint8_t const* stats_val, size_t stats_size, cudf::io::parquet::Type const type) + static T convert(uint8_t const* stats_val, size_t stats_size, Type const type) { switch (type) { case INT32: return targetType(*reinterpret_cast(stats_val)); @@ -103,7 +103,7 @@ struct stats_caster { } template ())> - static T convert(uint8_t const* stats_val, size_t stats_size, cudf::io::parquet::Type const type) + static T convert(uint8_t const* stats_val, size_t stats_size, Type const type) { switch (type) { case FLOAT: return targetType(*reinterpret_cast(stats_val)); @@ -113,7 +113,7 @@ struct stats_caster { } template )> - static T convert(uint8_t const* stats_val, size_t 
stats_size, cudf::io::parquet::Type const type) + static T convert(uint8_t const* stats_val, size_t stats_size, Type const type) { switch (type) { case BYTE_ARRAY: [[fallthrough]]; @@ -527,4 +527,4 @@ named_to_reference_converter::visit_operands( return transformed_operands; } -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader.cpp b/cpp/src/io/parquet/reader.cpp index 1e87447006d..17d7c07bc91 100644 --- a/cpp/src/io/parquet/reader.cpp +++ b/cpp/src/io/parquet/reader.cpp @@ -16,7 +16,7 @@ #include "reader_impl.hpp" -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { reader::reader() = default; @@ -59,4 +59,4 @@ bool chunked_reader::has_next() const { return _impl->has_next(); } table_with_metadata chunked_reader::read_chunk() const { return _impl->read_chunk(); } -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index ea40f29a070..26ec83d5946 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -25,7 +25,7 @@ #include #include -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) { @@ -38,7 +38,7 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) CUDF_EXPECTS(pages.size() > 0, "There is no page to decode"); size_t const sum_max_depths = std::accumulate( - chunks.begin(), chunks.end(), 0, [&](size_t cursum, gpu::ColumnChunkDesc const& chunk) { + chunks.begin(), chunks.end(), 0, [&](size_t cursum, ColumnChunkDesc const& chunk) { return cursum + _metadata->get_output_nesting_depth(chunk.src_col_schema); }); @@ -51,10 +51,10 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) // doing a gather operation later on. 
// TODO: This step is somewhat redundant if size info has already been calculated (nested schema, // chunked reader). - auto const has_strings = (kernel_mask & gpu::KERNEL_MASK_STRING) != 0; + auto const has_strings = (kernel_mask & KERNEL_MASK_STRING) != 0; std::vector col_sizes(_input_columns.size(), 0L); if (has_strings) { - gpu::ComputePageStringSizes( + ComputePageStringSizes( pages, chunks, skip_rows, num_rows, _pass_itm_data->level_type_size, _stream); col_sizes = calculate_page_string_offsets(); @@ -176,19 +176,19 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) if (has_strings) { auto& stream = streams[s_idx++]; chunk_nested_str_data.host_to_device_async(stream); - gpu::DecodeStringPageData( + DecodeStringPageData( pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), stream); } // launch delta binary decoder - if ((kernel_mask & gpu::KERNEL_MASK_DELTA_BINARY) != 0) { - gpu::DecodeDeltaBinary( + if ((kernel_mask & KERNEL_MASK_DELTA_BINARY) != 0) { + DecodeDeltaBinary( pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), streams[s_idx++]); } // launch the catch-all page decoder - if ((kernel_mask & gpu::KERNEL_MASK_GENERAL) != 0) { - gpu::DecodePageData( + if ((kernel_mask & KERNEL_MASK_GENERAL) != 0) { + DecodePageData( pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), streams[s_idx++]); } @@ -248,13 +248,13 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) // update null counts in the final column buffers for (size_t idx = 0; idx < pages.size(); idx++) { - gpu::PageInfo* pi = &pages[idx]; - if (pi->flags & gpu::PAGEINFO_FLAGS_DICTIONARY) { continue; } - gpu::ColumnChunkDesc* col = &chunks[pi->chunk_idx]; + PageInfo* pi = &pages[idx]; + if (pi->flags & PAGEINFO_FLAGS_DICTIONARY) { continue; } + ColumnChunkDesc* col = &chunks[pi->chunk_idx]; input_column_info const& input_col = _input_columns[col->src_col_index]; - int index = pi->nesting_decode - 
page_nesting_decode.device_ptr(); - gpu::PageNestingDecodeInfo* pndi = &page_nesting_decode[index]; + int index = pi->nesting_decode - page_nesting_decode.device_ptr(); + PageNestingDecodeInfo* pndi = &page_nesting_decode[index]; auto* cols = &_output_buffers; for (size_t l_idx = 0; l_idx < input_col.nesting_depth(); l_idx++) { @@ -320,7 +320,7 @@ reader::impl::impl(std::size_t chunk_read_limit, // Save the states of the output buffers for reuse in `chunk_read()`. for (auto const& buff : _output_buffers) { - _output_buffers_template.emplace_back(inline_column_buffer::empty_like(buff)); + _output_buffers_template.emplace_back(cudf::io::detail::inline_column_buffer::empty_like(buff)); } } @@ -368,7 +368,7 @@ void reader::impl::prepare_data(int64_t skip_rows, // always create the pass struct, even if we end up with no passes. // this will also cause the previous pass information to be deleted - _pass_itm_data = std::make_unique(); + _pass_itm_data = std::make_unique(); if (_file_itm_data.global_num_rows > 0 && not _file_itm_data.row_groups.empty() && not _input_columns.empty() && _current_input_pass < num_passes) { @@ -521,7 +521,7 @@ table_with_metadata reader::impl::read_chunk() if (_chunk_count > 0) { _output_buffers.resize(0); for (auto const& buff : _output_buffers_template) { - _output_buffers.emplace_back(inline_column_buffer::empty_like(buff)); + _output_buffers.emplace_back(cudf::io::detail::inline_column_buffer::empty_like(buff)); } } @@ -571,4 +571,4 @@ parquet_metadata read_parquet_metadata(host_span con metadata.get_key_value_metadata()[0]}; } -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index 9445e4d1648..6003b931b04 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -35,7 +35,7 @@ #include #include -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { /** * @brief 
Implementation for Parquet reader @@ -261,10 +261,10 @@ class reader::impl { std::vector _input_columns; // Buffers for generating output columns - std::vector _output_buffers; + std::vector _output_buffers; // Buffers copied from `_output_buffers` after construction for reuse - std::vector _output_buffers_template; + std::vector _output_buffers_template; // _output_buffers associated schema indices std::vector _output_column_schemas; @@ -285,8 +285,8 @@ class reader::impl { // Within a pass, we produce one or more chunks of output, whose maximum total // byte size is controlled by _output_chunk_read_limit. - cudf::io::parquet::gpu::file_intermediate_data _file_itm_data; - std::unique_ptr _pass_itm_data; + file_intermediate_data _file_itm_data; + std::unique_ptr _pass_itm_data; // an array of offsets into _file_itm_data::global_chunks. Each pair of offsets represents // the start/end of the chunks to be loaded for a given pass. @@ -301,4 +301,4 @@ class reader::impl { bool _file_preprocessed{false}; }; -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index 9778cfc47d2..171cf07da3e 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -21,34 +21,34 @@ #include #include -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { namespace { ConvertedType logical_type_to_converted_type(LogicalType const& logical) { if (logical.isset.STRING) { - return parquet::UTF8; + return UTF8; } else if (logical.isset.MAP) { - return parquet::MAP; + return MAP; } else if (logical.isset.LIST) { - return parquet::LIST; + return LIST; } else if (logical.isset.ENUM) { - return parquet::ENUM; + return ENUM; } else if (logical.isset.DECIMAL) { - return parquet::DECIMAL; // TODO set decimal values + return DECIMAL; // TODO set decimal values } else if (logical.isset.DATE) { - return 
parquet::DATE; + return DATE; } else if (logical.isset.TIME) { if (logical.TIME.unit.isset.MILLIS) - return parquet::TIME_MILLIS; + return TIME_MILLIS; else if (logical.TIME.unit.isset.MICROS) - return parquet::TIME_MICROS; + return TIME_MICROS; } else if (logical.isset.TIMESTAMP) { if (logical.TIMESTAMP.unit.isset.MILLIS) - return parquet::TIMESTAMP_MILLIS; + return TIMESTAMP_MILLIS; else if (logical.TIMESTAMP.unit.isset.MICROS) - return parquet::TIMESTAMP_MICROS; + return TIMESTAMP_MICROS; } else if (logical.isset.INTEGER) { switch (logical.INTEGER.bitWidth) { case 8: return logical.INTEGER.isSigned ? INT_8 : UINT_8; @@ -58,13 +58,13 @@ ConvertedType logical_type_to_converted_type(LogicalType const& logical) default: break; } } else if (logical.isset.UNKNOWN) { - return parquet::NA; + return NA; } else if (logical.isset.JSON) { - return parquet::JSON; + return JSON; } else if (logical.isset.BSON) { - return parquet::BSON; + return BSON; } - return parquet::UNKNOWN; + return UNKNOWN; } } // namespace @@ -76,39 +76,39 @@ type_id to_type_id(SchemaElement const& schema, bool strings_to_categorical, type_id timestamp_type_id) { - parquet::Type const physical = schema.type; - parquet::LogicalType const logical_type = schema.logical_type; - parquet::ConvertedType converted_type = schema.converted_type; - int32_t decimal_precision = schema.decimal_precision; + Type const physical = schema.type; + LogicalType const logical_type = schema.logical_type; + ConvertedType converted_type = schema.converted_type; + int32_t decimal_precision = schema.decimal_precision; // Logical type used for actual data interpretation; the legacy converted type // is superseded by 'logical' type whenever available. 
auto const inferred_converted_type = logical_type_to_converted_type(logical_type); - if (inferred_converted_type != parquet::UNKNOWN) { converted_type = inferred_converted_type; } - if (inferred_converted_type == parquet::DECIMAL) { + if (inferred_converted_type != UNKNOWN) { converted_type = inferred_converted_type; } + if (inferred_converted_type == DECIMAL) { decimal_precision = schema.logical_type.DECIMAL.precision; } switch (converted_type) { - case parquet::UINT_8: return type_id::UINT8; - case parquet::INT_8: return type_id::INT8; - case parquet::UINT_16: return type_id::UINT16; - case parquet::INT_16: return type_id::INT16; - case parquet::UINT_32: return type_id::UINT32; - case parquet::UINT_64: return type_id::UINT64; - case parquet::DATE: return type_id::TIMESTAMP_DAYS; - case parquet::TIME_MILLIS: return type_id::DURATION_MILLISECONDS; - case parquet::TIME_MICROS: return type_id::DURATION_MICROSECONDS; - case parquet::TIMESTAMP_MILLIS: + case UINT_8: return type_id::UINT8; + case INT_8: return type_id::INT8; + case UINT_16: return type_id::UINT16; + case INT_16: return type_id::INT16; + case UINT_32: return type_id::UINT32; + case UINT_64: return type_id::UINT64; + case DATE: return type_id::TIMESTAMP_DAYS; + case TIME_MILLIS: return type_id::DURATION_MILLISECONDS; + case TIME_MICROS: return type_id::DURATION_MICROSECONDS; + case TIMESTAMP_MILLIS: return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id : type_id::TIMESTAMP_MILLISECONDS; - case parquet::TIMESTAMP_MICROS: + case TIMESTAMP_MICROS: return (timestamp_type_id != type_id::EMPTY) ? 
timestamp_type_id : type_id::TIMESTAMP_MICROSECONDS; - case parquet::DECIMAL: - if (physical == parquet::INT32) { return type_id::DECIMAL32; } - if (physical == parquet::INT64) { return type_id::DECIMAL64; } - if (physical == parquet::FIXED_LEN_BYTE_ARRAY) { + case DECIMAL: + if (physical == INT32) { return type_id::DECIMAL32; } + if (physical == INT64) { return type_id::DECIMAL64; } + if (physical == FIXED_LEN_BYTE_ARRAY) { if (schema.type_length <= static_cast(sizeof(int32_t))) { return type_id::DECIMAL32; } @@ -119,7 +119,7 @@ type_id to_type_id(SchemaElement const& schema, return type_id::DECIMAL128; } } - if (physical == parquet::BYTE_ARRAY) { + if (physical == BYTE_ARRAY) { CUDF_EXPECTS(decimal_precision <= MAX_DECIMAL128_PRECISION, "Invalid decimal precision"); if (decimal_precision <= MAX_DECIMAL32_PRECISION) { return type_id::DECIMAL32; @@ -133,20 +133,20 @@ type_id to_type_id(SchemaElement const& schema, break; // maps are just List>. - case parquet::MAP: - case parquet::LIST: return type_id::LIST; - case parquet::NA: return type_id::STRING; + case MAP: + case LIST: return type_id::LIST; + case NA: return type_id::STRING; // return type_id::EMPTY; //TODO(kn): enable after Null/Empty column support default: break; } - if (inferred_converted_type == parquet::UNKNOWN and physical == parquet::INT64 and + if (inferred_converted_type == UNKNOWN and physical == INT64 and logical_type.TIMESTAMP.unit.isset.NANOS) { return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id : type_id::TIMESTAMP_NANOSECONDS; } - if (inferred_converted_type == parquet::UNKNOWN and physical == parquet::INT64 and + if (inferred_converted_type == UNKNOWN and physical == INT64 and logical_type.TIME.unit.isset.NANOS) { return type_id::DURATION_NANOSECONDS; } @@ -157,16 +157,16 @@ type_id to_type_id(SchemaElement const& schema, // Physical storage type supported by Parquet; controls the on-disk storage // format in combination with the encoding type. 
switch (physical) { - case parquet::BOOLEAN: return type_id::BOOL8; - case parquet::INT32: return type_id::INT32; - case parquet::INT64: return type_id::INT64; - case parquet::FLOAT: return type_id::FLOAT32; - case parquet::DOUBLE: return type_id::FLOAT64; - case parquet::BYTE_ARRAY: - case parquet::FIXED_LEN_BYTE_ARRAY: + case BOOLEAN: return type_id::BOOL8; + case INT32: return type_id::INT32; + case INT64: return type_id::INT64; + case FLOAT: return type_id::FLOAT32; + case DOUBLE: return type_id::FLOAT64; + case BYTE_ARRAY: + case FIXED_LEN_BYTE_ARRAY: // Can be mapped to INT32 (32-bit hash) or STRING return strings_to_categorical ? type_id::INT32 : type_id::STRING; - case parquet::INT96: + case INT96: return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id : type_id::TIMESTAMP_NANOSECONDS; default: break; @@ -420,7 +420,7 @@ std::vector aggregate_reader_metadata::get_pandas_index_names() con return names; } -std::tuple> +std::tuple> aggregate_reader_metadata::select_row_groups( host_span const> row_group_indices, int64_t skip_rows_opt, @@ -438,7 +438,7 @@ aggregate_reader_metadata::select_row_groups( host_span const>(filtered_row_group_indices.value()); } } - std::vector selection; + std::vector selection; auto [rows_to_skip, rows_to_read] = [&]() { if (not row_group_indices.empty()) { return std::pair{}; } auto const from_opts = cudf::io::detail::skip_rows_num_rows_from_options( @@ -478,7 +478,7 @@ aggregate_reader_metadata::select_row_groups( } std::tuple, - std::vector, + std::vector, std::vector> aggregate_reader_metadata::select_columns(std::optional> const& use_names, bool include_index, @@ -496,17 +496,18 @@ aggregate_reader_metadata::select_columns(std::optional : -1; }; - std::vector output_columns; + std::vector output_columns; std::vector input_columns; std::vector nesting; // Return true if column path is valid. e.g. if the path is {"struct1", "child1"}, then it is // valid if "struct1.child1" exists in this file's schema. 
If "struct1" exists but "child1" is // not a child of "struct1" then the function will return false for "struct1" - std::function&, bool)> + std::function&, bool)> build_column = [&](column_name_info const* col_name_info, int schema_idx, - std::vector& out_col_array, + std::vector& out_col_array, bool has_list_parent) { if (schema_idx < 0) { return false; } auto const& schema_elem = get_schema(schema_idx); @@ -529,7 +530,8 @@ aggregate_reader_metadata::select_columns(std::optional : to_type_id(schema_elem, strings_to_categorical, timestamp_type_id); auto const dtype = to_data_type(col_type, schema_elem); - inline_column_buffer output_col(dtype, schema_elem.repetition_type == OPTIONAL); + cudf::io::detail::inline_column_buffer output_col(dtype, + schema_elem.repetition_type == OPTIONAL); if (has_list_parent) { output_col.user_data |= PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT; } // store the index of this element if inserted in out_col_array nesting.push_back(static_cast(out_col_array.size())); @@ -569,7 +571,8 @@ aggregate_reader_metadata::select_columns(std::optional to_type_id(schema_elem, strings_to_categorical, timestamp_type_id); auto const element_dtype = to_data_type(element_type, schema_elem); - inline_column_buffer element_col(element_dtype, schema_elem.repetition_type == OPTIONAL); + cudf::io::detail::inline_column_buffer element_col( + element_dtype, schema_elem.repetition_type == OPTIONAL); if (has_list_parent || col_type == type_id::LIST) { element_col.user_data |= PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT; } @@ -732,4 +735,4 @@ aggregate_reader_metadata::select_columns(std::optional std::move(input_columns), std::move(output_columns), std::move(output_column_schemas)); } -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp index 9ee17f26a10..1a73e2f55ac 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.hpp +++ 
b/cpp/src/io/parquet/reader_impl_helpers.hpp @@ -32,9 +32,7 @@ #include #include -namespace cudf::io::detail::parquet { - -using namespace cudf::io::parquet; +namespace cudf::io::parquet::detail { /** * @brief Function that translates Parquet datatype to cuDF type enum @@ -182,7 +180,7 @@ class aggregate_reader_metadata { * @return A tuple of corrected row_start, row_count and list of row group indexes and its * starting row */ - [[nodiscard]] std::tuple> select_row_groups( + [[nodiscard]] std::tuple> select_row_groups( host_span const> row_group_indices, int64_t row_start, std::optional const& row_count, @@ -202,12 +200,13 @@ class aggregate_reader_metadata { * @return input column information, output column information, list of output column schema * indices */ - [[nodiscard]] std:: - tuple, std::vector, std::vector> - select_columns(std::optional> const& use_names, - bool include_index, - bool strings_to_categorical, - type_id timestamp_type_id) const; + [[nodiscard]] std::tuple, + std::vector, + std::vector> + select_columns(std::optional> const& use_names, + bool include_index, + bool strings_to_categorical, + type_id timestamp_type_id) const; }; /** @@ -276,4 +275,4 @@ class named_to_reference_converter : public ast::detail::expression_transformer std::list _operators; }; -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index c731c467f2c..4bc6bb6f43b 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -43,7 +43,8 @@ #include -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { + namespace { /** @@ -185,11 +186,11 @@ template */ [[nodiscard]] std::tuple conversion_info(type_id column_type_id, type_id timestamp_type_id, - parquet::Type physical, + Type physical, int8_t converted, int32_t length) { - int32_t type_width = (physical == 
parquet::FIXED_LEN_BYTE_ARRAY) ? length : 0; + int32_t type_width = (physical == FIXED_LEN_BYTE_ARRAY) ? length : 0; int32_t clock_rate = 0; if (column_type_id == type_id::INT8 or column_type_id == type_id::UINT8) { type_width = 1; // I32 -> I8 @@ -202,9 +203,9 @@ template } int8_t converted_type = converted; - if (converted_type == parquet::DECIMAL && column_type_id != type_id::FLOAT64 && + if (converted_type == DECIMAL && column_type_id != type_id::FLOAT64 && not cudf::is_fixed_point(data_type{column_type_id})) { - converted_type = parquet::UNKNOWN; // Not converting to float64 or decimal + converted_type = UNKNOWN; // Not converting to float64 or decimal } return std::make_tuple(type_width, clock_rate, converted_type); } @@ -226,7 +227,7 @@ template [[nodiscard]] std::future read_column_chunks_async( std::vector> const& sources, std::vector>& page_data, - cudf::detail::hostdevice_vector& chunks, + cudf::detail::hostdevice_vector& chunks, size_t begin_chunk, size_t end_chunk, std::vector const& column_chunk_offsets, @@ -239,11 +240,10 @@ template size_t const io_offset = column_chunk_offsets[chunk]; size_t io_size = chunks[chunk].compressed_size; size_t next_chunk = chunk + 1; - bool const is_compressed = (chunks[chunk].codec != parquet::Compression::UNCOMPRESSED); + bool const is_compressed = (chunks[chunk].codec != Compression::UNCOMPRESSED); while (next_chunk < end_chunk) { - size_t const next_offset = column_chunk_offsets[next_chunk]; - bool const is_next_compressed = - (chunks[next_chunk].codec != parquet::Compression::UNCOMPRESSED); + size_t const next_offset = column_chunk_offsets[next_chunk]; + bool const is_next_compressed = (chunks[next_chunk].codec != Compression::UNCOMPRESSED); if (next_offset != io_offset + io_size || is_next_compressed != is_compressed || chunk_source_map[chunk] != chunk_source_map[next_chunk]) { // Can't merge if not contiguous or mixing compressed and uncompressed @@ -300,13 +300,13 @@ template * * @return The total number of 
pages */ -[[nodiscard]] size_t count_page_headers( - cudf::detail::hostdevice_vector& chunks, rmm::cuda_stream_view stream) +[[nodiscard]] size_t count_page_headers(cudf::detail::hostdevice_vector& chunks, + rmm::cuda_stream_view stream) { size_t total_pages = 0; chunks.host_to_device_async(stream); - gpu::DecodePageHeaders(chunks.device_ptr(), chunks.size(), stream); + DecodePageHeaders(chunks.device_ptr(), chunks.size(), stream); chunks.device_to_host_sync(stream); for (size_t c = 0; c < chunks.size(); c++) { @@ -337,8 +337,8 @@ constexpr bool is_supported_encoding(Encoding enc) * @param stream CUDA stream used for device memory operations and kernel launches * @returns The size in bytes of level type data required */ -int decode_page_headers(cudf::detail::hostdevice_vector& chunks, - cudf::detail::hostdevice_vector& pages, +int decode_page_headers(cudf::detail::hostdevice_vector& chunks, + cudf::detail::hostdevice_vector& pages, rmm::cuda_stream_view stream) { // IMPORTANT : if you change how pages are stored within a chunk (dist pages, then data pages), @@ -350,14 +350,14 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c } chunks.host_to_device_async(stream); - gpu::DecodePageHeaders(chunks.device_ptr(), chunks.size(), stream); + DecodePageHeaders(chunks.device_ptr(), chunks.size(), stream); // compute max bytes needed for level data auto level_bit_size = cudf::detail::make_counting_transform_iterator(0, [chunks = chunks.begin()] __device__(int i) { auto c = chunks[i]; return static_cast( - max(c.level_bits[gpu::level_type::REPETITION], c.level_bits[gpu::level_type::DEFINITION])); + max(c.level_bits[level_type::REPETITION], c.level_bits[level_type::DEFINITION])); }); // max level data bit size. 
int const max_level_bits = thrust::reduce(rmm::exec_policy(stream), @@ -388,11 +388,11 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c * @return Device buffer to decompressed page data */ [[nodiscard]] rmm::device_buffer decompress_page_data( - cudf::detail::hostdevice_vector& chunks, - cudf::detail::hostdevice_vector& pages, + cudf::detail::hostdevice_vector& chunks, + cudf::detail::hostdevice_vector& pages, rmm::cuda_stream_view stream) { - auto for_each_codec_page = [&](parquet::Compression codec, std::function const& f) { + auto for_each_codec_page = [&](Compression codec, std::function const& f) { for (size_t c = 0, page_count = 0; c < chunks.size(); c++) { const auto page_stride = chunks[c].max_num_pages; if (chunks[c].codec == codec) { @@ -412,19 +412,16 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c size_t total_decomp_size = 0; struct codec_stats { - parquet::Compression compression_type = UNCOMPRESSED; - size_t num_pages = 0; - int32_t max_decompressed_size = 0; - size_t total_decomp_size = 0; + Compression compression_type = UNCOMPRESSED; + size_t num_pages = 0; + int32_t max_decompressed_size = 0; + size_t total_decomp_size = 0; }; - std::array codecs{codec_stats{parquet::GZIP}, - codec_stats{parquet::SNAPPY}, - codec_stats{parquet::BROTLI}, - codec_stats{parquet::ZSTD}}; + std::array codecs{codec_stats{GZIP}, codec_stats{SNAPPY}, codec_stats{BROTLI}, codec_stats{ZSTD}}; auto is_codec_supported = [&codecs](int8_t codec) { - if (codec == parquet::UNCOMPRESSED) return true; + if (codec == UNCOMPRESSED) return true; return std::find_if(codecs.begin(), codecs.end(), [codec](auto& cstats) { return codec == cstats.compression_type; }) != codecs.end(); @@ -445,7 +442,7 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c codec.num_pages++; num_comp_pages++; }); - if (codec.compression_type == parquet::BROTLI && codec.num_pages > 0) { + if (codec.compression_type == BROTLI && codec.num_pages > 0) { 
debrotli_scratch.resize(get_gpu_debrotli_scratch_size(codec.num_pages), stream); } } @@ -482,7 +479,7 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c auto& page = pages[page_idx]; // offset will only be non-zero for V2 pages auto const offset = - page.lvl_bytes[gpu::level_type::DEFINITION] + page.lvl_bytes[gpu::level_type::REPETITION]; + page.lvl_bytes[level_type::DEFINITION] + page.lvl_bytes[level_type::REPETITION]; // for V2 need to copy def and rep level info into place, and then offset the // input and output buffers. otherwise we'd have to keep both the compressed // and decompressed data. @@ -509,11 +506,11 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c device_span d_comp_res_view(comp_res.data() + start_pos, codec.num_pages); switch (codec.compression_type) { - case parquet::GZIP: + case GZIP: gpuinflate(d_comp_in, d_comp_out, d_comp_res_view, gzip_header_included::YES, stream); break; - case parquet::SNAPPY: - if (nvcomp_integration::is_stable_enabled()) { + case SNAPPY: + if (cudf::io::detail::nvcomp_integration::is_stable_enabled()) { nvcomp::batched_decompress(nvcomp::compression_type::SNAPPY, d_comp_in, d_comp_out, @@ -525,7 +522,7 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c gpu_unsnap(d_comp_in, d_comp_out, d_comp_res_view, stream); } break; - case parquet::ZSTD: + case ZSTD: nvcomp::batched_decompress(nvcomp::compression_type::ZSTD, d_comp_in, d_comp_out, @@ -534,7 +531,7 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c codec.total_decomp_size, stream); break; - case parquet::BROTLI: + case BROTLI: gpu_debrotli(d_comp_in, d_comp_out, d_comp_res_view, @@ -594,9 +591,9 @@ void reader::impl::allocate_nesting_info() }); page_nesting_info = - cudf::detail::hostdevice_vector{total_page_nesting_infos, _stream}; + cudf::detail::hostdevice_vector{total_page_nesting_infos, _stream}; page_nesting_decode_info = - cudf::detail::hostdevice_vector{total_page_nesting_infos, _stream}; + 
cudf::detail::hostdevice_vector{total_page_nesting_infos, _stream}; // update pointers in the PageInfos int target_page_index = 0; @@ -653,10 +650,10 @@ void reader::impl::allocate_nesting_info() if (!cur_schema.is_stub()) { // initialize each page within the chunk for (int p_idx = 0; p_idx < chunks[idx].num_data_pages; p_idx++) { - gpu::PageNestingInfo* pni = + PageNestingInfo* pni = &page_nesting_info[nesting_info_index + (p_idx * per_page_nesting_info_size)]; - gpu::PageNestingDecodeInfo* nesting_info = + PageNestingDecodeInfo* nesting_info = &page_nesting_decode_info[nesting_info_index + (p_idx * per_page_nesting_info_size)]; // if we have lists, set our start and end depth remappings @@ -717,9 +714,9 @@ void reader::impl::allocate_level_decode_space() for (size_t idx = 0; idx < pages.size(); idx++) { auto& p = pages[idx]; - p.lvl_decode_buf[gpu::level_type::DEFINITION] = buf; + p.lvl_decode_buf[level_type::DEFINITION] = buf; buf += (LEVEL_DECODE_BUF_SIZE * _pass_itm_data->level_type_size); - p.lvl_decode_buf[gpu::level_type::REPETITION] = buf; + p.lvl_decode_buf[level_type::REPETITION] = buf; buf += (LEVEL_DECODE_BUF_SIZE * _pass_itm_data->level_type_size); } } @@ -824,25 +821,25 @@ void reader::impl::load_global_chunk_info() schema.converted_type, schema.type_length); - chunks.push_back(gpu::ColumnChunkDesc(col_meta.total_compressed_size, - nullptr, - col_meta.num_values, - schema.type, - type_width, - row_group_start, - row_group_rows, - schema.max_definition_level, - schema.max_repetition_level, - _metadata->get_output_nesting_depth(col.schema_idx), - required_bits(schema.max_definition_level), - required_bits(schema.max_repetition_level), - col_meta.codec, - converted_type, - schema.logical_type, - schema.decimal_precision, - clock_rate, - i, - col.schema_idx)); + chunks.push_back(ColumnChunkDesc(col_meta.total_compressed_size, + nullptr, + col_meta.num_values, + schema.type, + type_width, + row_group_start, + row_group_rows, + schema.max_definition_level, 
+ schema.max_repetition_level, + _metadata->get_output_nesting_depth(col.schema_idx), + required_bits(schema.max_definition_level), + required_bits(schema.max_repetition_level), + col_meta.codec, + converted_type, + schema.logical_type, + schema.decimal_precision, + clock_rate, + i, + col.schema_idx)); } remaining_rows -= row_group_rows; @@ -909,7 +906,7 @@ void reader::impl::compute_input_pass_row_group_info() void reader::impl::setup_pass() { // this will also cause the previous pass information to be deleted - _pass_itm_data = std::make_unique(); + _pass_itm_data = std::make_unique(); // setup row groups to be loaded for this pass auto const row_group_start = _input_pass_row_group_offsets[_current_input_pass]; @@ -929,8 +926,7 @@ void reader::impl::setup_pass() auto chunk_start = _file_itm_data.chunks.begin() + (row_group_start * chunks_per_rowgroup); auto chunk_end = _file_itm_data.chunks.begin() + (row_group_end * chunks_per_rowgroup); - _pass_itm_data->chunks = - cudf::detail::hostdevice_vector(num_chunks, _stream); + _pass_itm_data->chunks = cudf::detail::hostdevice_vector(num_chunks, _stream); std::copy(chunk_start, chunk_end, _pass_itm_data->chunks.begin()); // adjust skip_rows and num_rows by what's available in the row groups we are processing @@ -970,7 +966,7 @@ void reader::impl::load_and_decompress_data() // Process dataset chunk pages into output columns auto const total_pages = count_page_headers(chunks, _stream); if (total_pages <= 0) { return; } - pages = cudf::detail::hostdevice_vector(total_pages, total_pages, _stream); + pages = cudf::detail::hostdevice_vector(total_pages, total_pages, _stream); // decoding of column/page information _pass_itm_data->level_type_size = decode_page_headers(chunks, pages, _stream); @@ -978,7 +974,7 @@ void reader::impl::load_and_decompress_data() decomp_page_data = decompress_page_data(chunks, pages, _stream); // Free compressed data for (size_t c = 0; c < chunks.size(); c++) { - if (chunks[c].codec != 
parquet::Compression::UNCOMPRESSED) { raw_page_data[c].reset(); } + if (chunks[c].codec != Compression::UNCOMPRESSED) { raw_page_data[c].reset(); } } } @@ -1019,14 +1015,13 @@ struct cumulative_row_info { }; #if defined(PREPROCESS_DEBUG) -void print_pages(cudf::detail::hostdevice_vector& pages, - rmm::cuda_stream_view _stream) +void print_pages(cudf::detail::hostdevice_vector& pages, rmm::cuda_stream_view _stream) { pages.device_to_host_sync(_stream); for (size_t idx = 0; idx < pages.size(); idx++) { auto const& p = pages[idx]; // skip dictionary pages - if (p.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) { continue; } + if (p.flags & PAGEINFO_FLAGS_DICTIONARY) { continue; } printf( "P(%lu, s:%d): chunk_row(%d), num_rows(%d), skipped_values(%d), skipped_leaf_values(%d), " "str_bytes(%d)\n", @@ -1040,7 +1035,7 @@ void print_pages(cudf::detail::hostdevice_vector& pages, } } -void print_cumulative_page_info(cudf::detail::hostdevice_vector& pages, +void print_cumulative_page_info(cudf::detail::hostdevice_vector& pages, rmm::device_uvector const& page_index, rmm::device_uvector const& c_info, rmm::cuda_stream_view stream) @@ -1067,7 +1062,7 @@ void print_cumulative_page_info(cudf::detail::hostdevice_vector& printf("Schema %d\n", schemas[idx]); for (size_t pidx = 0; pidx < pages.size(); pidx++) { auto const& page = pages[h_page_index[pidx]]; - if (page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY || page.src_col_schema != schemas[idx]) { + if (page.flags & PAGEINFO_FLAGS_DICTIONARY || page.src_col_schema != schemas[idx]) { continue; } printf("\tP: {%lu, %lu}\n", h_cinfo[pidx].row_count, h_cinfo[pidx].size_bytes); @@ -1075,10 +1070,9 @@ void print_cumulative_page_info(cudf::detail::hostdevice_vector& } } -void print_cumulative_row_info( - host_span sizes, - std::string const& label, - std::optional> splits = std::nullopt) +void print_cumulative_row_info(host_span sizes, + std::string const& label, + std::optional> splits = std::nullopt) { if (splits.has_value()) { 
printf("------------\nSplits\n"); @@ -1093,7 +1087,7 @@ void print_cumulative_row_info( if (splits.has_value()) { // if we have a split at this row count and this is the last instance of this row count auto start = thrust::make_transform_iterator( - splits->begin(), [](gpu::chunk_read_info const& i) { return i.skip_rows; }); + splits->begin(), [](chunk_read_info const& i) { return i.skip_rows; }); auto end = start + splits->size(); auto split = std::find(start, end, sizes[idx].row_count); auto const split_index = [&]() -> int { @@ -1180,12 +1174,12 @@ __device__ size_t row_size_functor::operator()(size_t num_rows, boo * Sums across all nesting levels. */ struct get_cumulative_row_info { - gpu::PageInfo const* const pages; + PageInfo const* const pages; __device__ cumulative_row_info operator()(size_type index) { auto const& page = pages[index]; - if (page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) { + if (page.flags & PAGEINFO_FLAGS_DICTIONARY) { return cumulative_row_info{0, 0, page.src_col_schema}; } @@ -1250,15 +1244,15 @@ struct row_total_size { * @param num_rows Total number of rows to read * @param chunk_read_limit Limit on total number of bytes to be returned per read, for all columns */ -std::vector find_splits(std::vector const& sizes, - size_t num_rows, - size_t chunk_read_limit) +std::vector find_splits(std::vector const& sizes, + size_t num_rows, + size_t chunk_read_limit) { // now we have an array of {row_count, real output bytes}. just walk through it and generate // splits. // TODO: come up with a clever way to do this entirely in parallel. 
For now, as long as batch // sizes are reasonably large, this shouldn't iterate too many times - std::vector splits; + std::vector splits; { size_t cur_pos = 0; size_t cur_cumulative_size = 0; @@ -1290,7 +1284,7 @@ std::vector find_splits(std::vector c auto const start_row = cur_row_count; cur_row_count = sizes[split_pos].row_count; - splits.push_back(gpu::chunk_read_info{start_row, cur_row_count - start_row}); + splits.push_back(chunk_read_info{start_row, cur_row_count - start_row}); cur_pos = split_pos; cur_cumulative_size = sizes[split_pos].size_bytes; } @@ -1311,12 +1305,11 @@ std::vector find_splits(std::vector c * @param chunk_read_limit Limit on total number of bytes to be returned per read, for all columns * @param stream CUDA stream to use */ -std::vector compute_splits( - cudf::detail::hostdevice_vector& pages, - gpu::pass_intermediate_data const& id, - size_t num_rows, - size_t chunk_read_limit, - rmm::cuda_stream_view stream) +std::vector compute_splits(cudf::detail::hostdevice_vector& pages, + pass_intermediate_data const& id, + size_t num_rows, + size_t chunk_read_limit, + rmm::cuda_stream_view stream) { auto const& page_keys = id.page_keys; auto const& page_index = id.page_index; @@ -1395,16 +1388,16 @@ std::vector compute_splits( } struct get_page_chunk_idx { - __device__ size_type operator()(gpu::PageInfo const& page) { return page.chunk_idx; } + __device__ size_type operator()(PageInfo const& page) { return page.chunk_idx; } }; struct get_page_num_rows { - __device__ size_type operator()(gpu::PageInfo const& page) { return page.num_rows; } + __device__ size_type operator()(PageInfo const& page) { return page.num_rows; } }; struct get_page_column_index { - gpu::ColumnChunkDesc const* chunks; - __device__ size_type operator()(gpu::PageInfo const& page) + ColumnChunkDesc const* chunks; + __device__ size_type operator()(PageInfo const& page) { return chunks[page.chunk_idx].src_col_index; } @@ -1441,7 +1434,7 @@ struct get_page_nesting_size { 
input_col_info const* const input_cols; size_type const max_depth; size_t const num_pages; - gpu::PageInfo const* const pages; + PageInfo const* const pages; int const* page_indices; __device__ size_type operator()(size_t index) const @@ -1450,7 +1443,7 @@ struct get_page_nesting_size { auto const& page = pages[page_indices[indices.page_idx]]; if (page.src_col_schema != input_cols[indices.col_idx].schema_idx || - page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY || + page.flags & PAGEINFO_FLAGS_DICTIONARY || indices.depth_idx >= input_cols[indices.col_idx].nesting_depth) { return 0; } @@ -1468,7 +1461,7 @@ struct get_reduction_key { * @brief Writes to the chunk_row field of the PageInfo struct. */ struct chunk_row_output_iter { - gpu::PageInfo* p; + PageInfo* p; using value_type = size_type; using difference_type = size_type; using pointer = size_type*; @@ -1490,7 +1483,7 @@ struct chunk_row_output_iter { * @brief Writes to the page_start_value field of the PageNestingInfo struct, keyed by schema. 
*/ struct start_offset_output_iterator { - gpu::PageInfo const* pages; + PageInfo const* pages; int const* page_indices; size_t cur_index; input_col_info const* input_cols; @@ -1529,9 +1522,9 @@ struct start_offset_output_iterator { { auto const indices = reduction_indices{index, max_depth, num_pages}; - gpu::PageInfo const& p = pages[page_indices[indices.page_idx]]; + PageInfo const& p = pages[page_indices[indices.page_idx]]; if (p.src_col_schema != input_cols[indices.col_idx].schema_idx || - p.flags & gpu::PAGEINFO_FLAGS_DICTIONARY || + p.flags & PAGEINFO_FLAGS_DICTIONARY || indices.depth_idx >= input_cols[indices.col_idx].nesting_depth) { return empty; } @@ -1540,15 +1533,15 @@ struct start_offset_output_iterator { }; struct flat_column_num_rows { - gpu::PageInfo const* pages; - gpu::ColumnChunkDesc const* chunks; + PageInfo const* pages; + ColumnChunkDesc const* chunks; __device__ size_type operator()(size_type pindex) const { - gpu::PageInfo const& page = pages[pindex]; + PageInfo const& page = pages[pindex]; // ignore dictionary pages and pages belonging to any column containing repetition (lists) - if ((page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) || - (chunks[page.chunk_idx].max_level[gpu::level_type::REPETITION] > 0)) { + if ((page.flags & PAGEINFO_FLAGS_DICTIONARY) || + (chunks[page.chunk_idx].max_level[level_type::REPETITION] > 0)) { return 0; } return page.num_rows; @@ -1581,8 +1574,8 @@ struct row_counts_different { * @param expected_row_count Expected row count, if applicable * @param stream CUDA stream used for device memory operations and kernel launches */ -void detect_malformed_pages(cudf::detail::hostdevice_vector& pages, - cudf::detail::hostdevice_vector const& chunks, +void detect_malformed_pages(cudf::detail::hostdevice_vector& pages, + cudf::detail::hostdevice_vector const& chunks, device_span page_keys, device_span page_index, std::optional expected_row_count, @@ -1631,23 +1624,21 @@ void 
detect_malformed_pages(cudf::detail::hostdevice_vector& page } struct page_to_string_size { - gpu::PageInfo* pages; - gpu::ColumnChunkDesc const* chunks; + PageInfo* pages; + ColumnChunkDesc const* chunks; __device__ size_t operator()(size_type page_idx) const { auto const page = pages[page_idx]; auto const chunk = chunks[page.chunk_idx]; - if (not is_string_col(chunk) || (page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) != 0) { - return 0; - } + if (not is_string_col(chunk) || (page.flags & PAGEINFO_FLAGS_DICTIONARY) != 0) { return 0; } return pages[page_idx].str_bytes; } }; struct page_offset_output_iter { - gpu::PageInfo* p; + PageInfo* p; size_type const* index; using value_type = size_type; @@ -1738,7 +1729,7 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re cols = &out_buf.children; // if this has a list parent, we have to get column sizes from the - // data computed during gpu::ComputePageSizes + // data computed during ComputePageSizes if (out_buf.user_data & PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT) { has_lists = true; break; @@ -1749,7 +1740,7 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re // generate string dict indices if necessary { - auto is_dict_chunk = [](gpu::ColumnChunkDesc const& chunk) { + auto is_dict_chunk = [](ColumnChunkDesc const& chunk) { return (chunk.data_type & 0x7) == BYTE_ARRAY && chunk.num_dict_pages > 0; }; @@ -1785,7 +1776,7 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re if (total_str_dict_indexes > 0) { chunks.host_to_device_async(_stream); - gpu::BuildStringDictionaryIndex(chunks.device_ptr(), chunks.size(), _stream); + BuildStringDictionaryIndex(chunks.device_ptr(), chunks.size(), _stream); } } @@ -1800,14 +1791,14 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re // if: // - user has passed custom row bounds // - we will be doing a chunked read - gpu::ComputePageSizes(pages, - chunks, - 0, 
// 0-max size_t. process all possible rows - std::numeric_limits::max(), - true, // compute num_rows - chunk_read_limit > 0, // compute string sizes - _pass_itm_data->level_type_size, - _stream); + ComputePageSizes(pages, + chunks, + 0, // 0-max size_t. process all possible rows + std::numeric_limits::max(), + true, // compute num_rows + chunk_read_limit > 0, // compute string sizes + _pass_itm_data->level_type_size, + _stream); // computes: // PageInfo::chunk_row (the absolute start row index) for all pages @@ -1836,7 +1827,7 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re _pass_itm_data->output_chunk_read_info = _output_chunk_read_limit > 0 ? compute_splits(pages, *_pass_itm_data, num_rows, chunk_read_limit, _stream) - : std::vector{{skip_rows, num_rows}}; + : std::vector{{skip_rows, num_rows}}; } void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses_custom_row_bounds) @@ -1853,14 +1844,14 @@ void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses // respect the user bounds. It is only necessary to do this second pass if uses_custom_row_bounds // is set (if the user has specified artificial bounds). 
if (uses_custom_row_bounds) { - gpu::ComputePageSizes(pages, - chunks, - skip_rows, - num_rows, - false, // num_rows is already computed - false, // no need to compute string sizes - _pass_itm_data->level_type_size, - _stream); + ComputePageSizes(pages, + chunks, + skip_rows, + num_rows, + false, // num_rows is already computed + false, // no need to compute string sizes + _pass_itm_data->level_type_size, + _stream); // print_pages(pages, _stream); } @@ -1879,7 +1870,7 @@ void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses cols = &out_buf.children; // if this has a list parent, we have to get column sizes from the - // data computed during gpu::ComputePageSizes + // data computed during ComputePageSizes if (out_buf.user_data & PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT) { has_lists = true; } @@ -2014,4 +2005,4 @@ std::vector reader::impl::calculate_page_string_offsets() return col_sizes; } -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/rle_stream.cuh b/cpp/src/io/parquet/rle_stream.cuh index 2545a074a38..799d6d9fd64 100644 --- a/cpp/src/io/parquet/rle_stream.cuh +++ b/cpp/src/io/parquet/rle_stream.cuh @@ -20,7 +20,7 @@ #include #include -namespace cudf::io::parquet::gpu { +namespace cudf::io::parquet::detail { template constexpr int rle_stream_required_run_buffer_size() @@ -362,4 +362,4 @@ struct rle_stream { } }; -} // namespace cudf::io::parquet::gpu +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index a124f352ee4..50589f23626 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -54,12 +54,9 @@ #include #include -namespace cudf { -namespace io { -namespace detail { -namespace parquet { -using namespace cudf::io::parquet; -using namespace cudf::io; +namespace cudf::io::parquet::detail { + +using namespace cudf::io::detail; struct aggregate_writer_metadata 
{ aggregate_writer_metadata(host_span partitions, @@ -185,13 +182,13 @@ namespace { * @param compression The compression type * @return The supported Parquet compression */ -parquet::Compression to_parquet_compression(compression_type compression) +Compression to_parquet_compression(compression_type compression) { switch (compression) { case compression_type::AUTO: - case compression_type::SNAPPY: return parquet::Compression::SNAPPY; - case compression_type::ZSTD: return parquet::Compression::ZSTD; - case compression_type::NONE: return parquet::Compression::UNCOMPRESSED; + case compression_type::SNAPPY: return Compression::SNAPPY; + case compression_type::ZSTD: return Compression::ZSTD; + case compression_type::NONE: return Compression::UNCOMPRESSED; default: CUDF_FAIL("Unsupported compression type"); } } @@ -206,7 +203,7 @@ void update_chunk_encodings(std::vector& encodings, uint32_t enc_mask) { for (uint8_t enc = 0; enc < static_cast(Encoding::NUM_ENCODINGS); enc++) { auto const enc_enum = static_cast(enc); - if ((enc_mask & gpu::encoding_to_mask(enc_enum)) != 0) { encodings.push_back(enc_enum); } + if ((enc_mask & encoding_to_mask(enc_enum)) != 0) { encodings.push_back(enc_enum); } } } @@ -761,11 +758,11 @@ struct parquet_column_view { std::vector const& schema_tree, rmm::cuda_stream_view stream); - [[nodiscard]] gpu::parquet_column_device_view get_device_view(rmm::cuda_stream_view stream) const; + [[nodiscard]] parquet_column_device_view get_device_view(rmm::cuda_stream_view stream) const; [[nodiscard]] column_view cudf_column_view() const { return cudf_col; } - [[nodiscard]] parquet::Type physical_type() const { return schema_node.type; } - [[nodiscard]] parquet::ConvertedType converted_type() const { return schema_node.converted_type; } + [[nodiscard]] Type physical_type() const { return schema_node.type; } + [[nodiscard]] ConvertedType converted_type() const { return schema_node.converted_type; } std::vector const& get_path_in_schema() { return 
path_in_schema; } @@ -846,11 +843,11 @@ parquet_column_view::parquet_column_view(schema_tree_node const& schema_node, uint16_t max_rep_level = 0; curr_schema_node = schema_node; while (curr_schema_node.parent_idx != -1) { - if (curr_schema_node.repetition_type == parquet::REPEATED or - curr_schema_node.repetition_type == parquet::OPTIONAL) { + if (curr_schema_node.repetition_type == REPEATED or + curr_schema_node.repetition_type == OPTIONAL) { ++max_def_level; } - if (curr_schema_node.repetition_type == parquet::REPEATED) { ++max_rep_level; } + if (curr_schema_node.repetition_type == REPEATED) { ++max_rep_level; } curr_schema_node = schema_tree[curr_schema_node.parent_idx]; } CUDF_EXPECTS(max_def_level < 256, "Definition levels above 255 are not supported"); @@ -897,9 +894,9 @@ parquet_column_view::parquet_column_view(schema_tree_node const& schema_node, } } -gpu::parquet_column_device_view parquet_column_view::get_device_view(rmm::cuda_stream_view) const +parquet_column_device_view parquet_column_view::get_device_view(rmm::cuda_stream_view) const { - auto desc = gpu::parquet_column_device_view{}; // Zero out all fields + auto desc = parquet_column_device_view{}; // Zero out all fields desc.stats_dtype = schema_node.stats_dtype; desc.ts_scale = schema_node.ts_scale; @@ -931,8 +928,8 @@ gpu::parquet_column_device_view parquet_column_view::get_device_view(rmm::cuda_s * @param fragment_size Number of rows per fragment * @param stream CUDA stream used for device memory operations and kernel launches */ -void init_row_group_fragments(cudf::detail::hostdevice_2dvector& frag, - device_span col_desc, +void init_row_group_fragments(cudf::detail::hostdevice_2dvector& frag, + device_span col_desc, host_span partitions, device_span part_frag_offset, uint32_t fragment_size, @@ -940,7 +937,7 @@ void init_row_group_fragments(cudf::detail::hostdevice_2dvector frag, +void calculate_page_fragments(device_span frag, host_span frag_sizes, rmm::cuda_stream_view stream) { auto d_frag_sz 
= cudf::detail::make_device_uvector_async( frag_sizes, stream, rmm::mr::get_current_device_resource()); - gpu::CalculatePageFragments(frag, d_frag_sz, stream); + CalculatePageFragments(frag, d_frag_sz, stream); } /** @@ -972,13 +969,13 @@ void calculate_page_fragments(device_span frag, * @param stream CUDA stream used for device memory operations and kernel launches */ void gather_fragment_statistics(device_span frag_stats, - device_span frags, + device_span frags, bool int96_timestamps, rmm::cuda_stream_view stream) { rmm::device_uvector frag_stats_group(frag_stats.size(), stream); - gpu::InitFragmentStatistics(frag_stats_group, frags, stream); + InitFragmentStatistics(frag_stats_group, frags, stream); detail::calculate_group_statistics( frag_stats.data(), frag_stats_group.data(), frag_stats.size(), stream, int96_timestamps); stream.synchronize(); @@ -1008,8 +1005,8 @@ size_t max_compression_output_size(Compression codec, uint32_t compression_block return compress_max_output_chunk_size(to_nvcomp_compression_type(codec), compression_blocksize); } -auto init_page_sizes(hostdevice_2dvector& chunks, - device_span col_desc, +auto init_page_sizes(hostdevice_2dvector& chunks, + device_span col_desc, uint32_t num_columns, size_t max_page_size_bytes, size_type max_page_size_rows, @@ -1021,19 +1018,19 @@ auto init_page_sizes(hostdevice_2dvector& chunks, chunks.host_to_device_async(stream); // Calculate number of pages and store in respective chunks - gpu::InitEncoderPages(chunks, - {}, - {}, - {}, - col_desc, - num_columns, - max_page_size_bytes, - max_page_size_rows, - page_alignment(compression_codec), - write_v2_headers, - nullptr, - nullptr, - stream); + InitEncoderPages(chunks, + {}, + {}, + {}, + col_desc, + num_columns, + max_page_size_bytes, + max_page_size_rows, + page_alignment(compression_codec), + write_v2_headers, + nullptr, + nullptr, + stream); chunks.device_to_host_sync(stream); int num_pages = 0; @@ -1046,19 +1043,19 @@ auto 
init_page_sizes(hostdevice_2dvector& chunks, // Now that we know the number of pages, allocate an array to hold per page size and get it // populated cudf::detail::hostdevice_vector page_sizes(num_pages, stream); - gpu::InitEncoderPages(chunks, - {}, - page_sizes, - {}, - col_desc, - num_columns, - max_page_size_bytes, - max_page_size_rows, - page_alignment(compression_codec), - write_v2_headers, - nullptr, - nullptr, - stream); + InitEncoderPages(chunks, + {}, + page_sizes, + {}, + col_desc, + num_columns, + max_page_size_bytes, + max_page_size_rows, + page_alignment(compression_codec), + write_v2_headers, + nullptr, + nullptr, + stream); page_sizes.device_to_host_sync(stream); // Get per-page max compressed size @@ -1072,26 +1069,26 @@ auto init_page_sizes(hostdevice_2dvector& chunks, comp_page_sizes.host_to_device_async(stream); // Use per-page max compressed size to calculate chunk.compressed_size - gpu::InitEncoderPages(chunks, - {}, - {}, - comp_page_sizes, - col_desc, - num_columns, - max_page_size_bytes, - max_page_size_rows, - page_alignment(compression_codec), - write_v2_headers, - nullptr, - nullptr, - stream); + InitEncoderPages(chunks, + {}, + {}, + comp_page_sizes, + col_desc, + num_columns, + max_page_size_bytes, + max_page_size_rows, + page_alignment(compression_codec), + write_v2_headers, + nullptr, + nullptr, + stream); chunks.device_to_host_sync(stream); return comp_page_sizes; } size_t max_page_bytes(Compression compression, size_t max_page_size_bytes) { - if (compression == parquet::Compression::UNCOMPRESSED) { return max_page_size_bytes; } + if (compression == Compression::UNCOMPRESSED) { return max_page_size_bytes; } auto const ncomp_type = to_nvcomp_compression_type(compression); auto const nvcomp_limit = nvcomp::is_compression_disabled(ncomp_type) @@ -1104,9 +1101,9 @@ size_t max_page_bytes(Compression compression, size_t max_page_size_bytes) } std::pair>, std::vector>> -build_chunk_dictionaries(hostdevice_2dvector& chunks, - host_span 
col_desc, - device_2dspan frags, +build_chunk_dictionaries(hostdevice_2dvector& chunks, + host_span col_desc, + device_2dspan frags, Compression compression, dictionary_policy dict_policy, size_t max_dict_size, @@ -1130,7 +1127,7 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, } // Allocate slots for each chunk - std::vector> hash_maps_storage; + std::vector> hash_maps_storage; hash_maps_storage.reserve(h_chunks.size()); for (auto& chunk : h_chunks) { if (col_desc[chunk.col_desc_id].physical_type == Type::BOOLEAN || @@ -1149,8 +1146,8 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, chunks.host_to_device_async(stream); - gpu::initialize_chunk_hash_maps(chunks.device_view().flat_view(), stream); - gpu::populate_chunk_hash_maps(frags, stream); + initialize_chunk_hash_maps(chunks.device_view().flat_view(), stream); + populate_chunk_hash_maps(frags, stream); chunks.device_to_host_sync(stream); @@ -1197,8 +1194,8 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, chunk.dict_index = inserted_dict_index.data(); } chunks.host_to_device_async(stream); - gpu::collect_map_entries(chunks.device_view().flat_view(), stream); - gpu::get_dictionary_indices(frags, stream); + collect_map_entries(chunks.device_view().flat_view(), stream); + get_dictionary_indices(frags, stream); return std::pair(std::move(dict_data), std::move(dict_index)); } @@ -1221,9 +1218,9 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, * @param write_v2_headers True if version 2 page headers are to be written * @param stream CUDA stream used for device memory operations and kernel launches */ -void init_encoder_pages(hostdevice_2dvector& chunks, - device_span col_desc, - device_span pages, +void init_encoder_pages(hostdevice_2dvector& chunks, + device_span col_desc, + device_span pages, cudf::detail::hostdevice_vector& comp_page_sizes, statistics_chunk* page_stats, statistics_chunk* frag_stats, @@ -1286,8 +1283,8 @@ void init_encoder_pages(hostdevice_2dvector& chunks, * @param 
write_v2_headers True if V2 page headers should be written * @param stream CUDA stream used for device memory operations and kernel launches */ -void encode_pages(hostdevice_2dvector& chunks, - device_span pages, +void encode_pages(hostdevice_2dvector& chunks, + device_span pages, uint32_t pages_in_batch, uint32_t first_page_in_batch, uint32_t rowgroups_in_batch, @@ -1308,8 +1305,7 @@ void encode_pages(hostdevice_2dvector& chunks, ? device_span(page_stats + first_page_in_batch, pages_in_batch) : device_span(); - uint32_t max_comp_pages = - (compression != parquet::Compression::UNCOMPRESSED) ? pages_in_batch : 0; + uint32_t max_comp_pages = (compression != Compression::UNCOMPRESSED) ? pages_in_batch : 0; rmm::device_uvector> comp_in(max_comp_pages, stream); rmm::device_uvector> comp_out(max_comp_pages, stream); @@ -1319,9 +1315,9 @@ void encode_pages(hostdevice_2dvector& chunks, comp_res.end(), compression_result{0, compression_status::FAILURE}); - gpu::EncodePages(batch_pages, write_v2_headers, comp_in, comp_out, comp_res, stream); + EncodePages(batch_pages, write_v2_headers, comp_in, comp_out, comp_res, stream); switch (compression) { - case parquet::Compression::SNAPPY: + case Compression::SNAPPY: if (nvcomp::is_compression_disabled(nvcomp::compression_type::SNAPPY)) { gpu_snap(comp_in, comp_out, comp_res, stream); } else { @@ -1329,7 +1325,7 @@ void encode_pages(hostdevice_2dvector& chunks, nvcomp::compression_type::SNAPPY, comp_in, comp_out, comp_res, stream); } break; - case parquet::Compression::ZSTD: { + case Compression::ZSTD: { if (auto const reason = nvcomp::is_compression_disabled(nvcomp::compression_type::ZSTD); reason) { CUDF_FAIL("Compression error: " + reason.value()); @@ -1338,7 +1334,7 @@ void encode_pages(hostdevice_2dvector& chunks, break; } - case parquet::Compression::UNCOMPRESSED: break; + case Compression::UNCOMPRESSED: break; default: CUDF_FAIL("invalid compression type"); } @@ -1378,7 +1374,7 @@ void encode_pages(hostdevice_2dvector& 
chunks, * @param column_index_truncate_length maximum length of min or max values in column index, in bytes * @return Computed buffer size needed to encode the column index */ -size_t column_index_buffer_size(gpu::EncColumnChunk* ck, int32_t column_index_truncate_length) +size_t column_index_buffer_size(EncColumnChunk* ck, int32_t column_index_truncate_length) { // encoding the column index for a given chunk requires: // each list (4 of them) requires 6 bytes of overhead @@ -1499,8 +1495,8 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, std::vector this_table_schema(schema_tree.begin(), schema_tree.end()); // Initialize column description - cudf::detail::hostdevice_vector col_desc(parquet_columns.size(), - stream); + cudf::detail::hostdevice_vector col_desc(parquet_columns.size(), + stream); std::transform( parquet_columns.begin(), parquet_columns.end(), col_desc.host_ptr(), [&](auto const& pcol) { return pcol.get_device_view(stream); @@ -1576,7 +1572,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, auto d_part_frag_offset = cudf::detail::make_device_uvector_async( part_frag_offset, stream, rmm::mr::get_current_device_resource()); - cudf::detail::hostdevice_2dvector row_group_fragments( + cudf::detail::hostdevice_2dvector row_group_fragments( num_columns, num_fragments, stream); // Create table_device_view so that corresponding column_device_view data @@ -1588,7 +1584,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, if (num_fragments != 0) { // Move column info to device col_desc.host_to_device_async(stream); - leaf_column_views = create_leaf_column_device_views( + leaf_column_views = create_leaf_column_device_views( col_desc, *parent_column_table_device_view, stream); init_row_group_fragments(row_group_fragments, @@ -1662,7 +1658,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, // Initialize row groups and column chunks auto const num_chunks = num_rowgroups * 
num_columns; - hostdevice_2dvector chunks(num_rowgroups, num_columns, stream); + hostdevice_2dvector chunks(num_rowgroups, num_columns, stream); // total fragments per column (in case they are non-uniform) std::vector frags_per_column(num_columns, 0); @@ -1678,7 +1674,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, row_group.total_byte_size = 0; row_group.columns.resize(num_columns); for (int c = 0; c < num_columns; c++) { - gpu::EncColumnChunk& ck = chunks[r + first_rg_in_part[p]][c]; + EncColumnChunk& ck = chunks[r + first_rg_in_part[p]][c]; ck = {}; ck.col_desc = col_desc.device_ptr() + c; @@ -1700,7 +1696,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, return l + r.num_values; }); ck.plain_data_size = std::accumulate( - chunk_fragments.begin(), chunk_fragments.end(), 0, [](int sum, gpu::PageFragment frag) { + chunk_fragments.begin(), chunk_fragments.end(), 0, [](int sum, PageFragment frag) { return sum + frag.fragment_data_size; }); auto& column_chunk_meta = row_group.columns[c].meta_data; @@ -1731,7 +1727,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, frags_per_column.empty() ? 
0 : frag_offsets.back() + frags_per_column.back(); rmm::device_uvector frag_stats(0, stream); - cudf::detail::hostdevice_vector page_fragments(total_frags, stream); + cudf::detail::hostdevice_vector page_fragments(total_frags, stream); // update fragments and/or prepare for fragment statistics calculation if necessary if (total_frags != 0) { @@ -1749,9 +1745,9 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, auto const& row_group = agg_meta->file(p).row_groups[global_r]; uint32_t const fragments_in_chunk = util::div_rounding_up_unsafe(row_group.num_rows, frag_size); - gpu::EncColumnChunk& ck = chunks[r + first_rg_in_part[p]][c]; - ck.fragments = page_fragments.device_ptr(frag_offset); - ck.first_fragment = frag_offset; + EncColumnChunk& ck = chunks[r + first_rg_in_part[p]][c]; + ck.fragments = page_fragments.device_ptr(frag_offset); + ck.first_fragment = frag_offset; // update the chunk pointer here for each fragment in chunk.fragments for (uint32_t i = 0; i < fragments_in_chunk; i++) { @@ -1817,8 +1813,8 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, size_t comp_rowgroup_size = 0; if (r < num_rowgroups) { for (int i = 0; i < num_columns; i++) { - gpu::EncColumnChunk* ck = &chunks[r][i]; - ck->first_page = num_pages; + EncColumnChunk* ck = &chunks[r][i]; + ck->first_page = num_pages; num_pages += ck->num_pages; pages_in_batch += ck->num_pages; rowgroup_size += ck->bfr_size; @@ -1850,7 +1846,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, } // Clear compressed buffer size if compression has been turned off - if (compression == parquet::Compression::UNCOMPRESSED) { max_comp_bfr_size = 0; } + if (compression == Compression::UNCOMPRESSED) { max_comp_bfr_size = 0; } // Initialize data pointers in batch uint32_t const num_stats_bfr = @@ -1864,7 +1860,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, stream); rmm::device_buffer col_idx_bfr(column_index_bfr_size, stream); - 
rmm::device_uvector pages(num_pages, stream); + rmm::device_uvector pages(num_pages, stream); // This contains stats for both the pages and the rowgroups. TODO: make them separate. rmm::device_uvector page_stats(num_stats_bfr, stream); @@ -1874,10 +1870,10 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, auto bfr_c = static_cast(comp_bfr.data()); for (auto j = 0; j < batch_list[b]; j++, r++) { for (auto i = 0; i < num_columns; i++) { - gpu::EncColumnChunk& ck = chunks[r][i]; - ck.uncompressed_bfr = bfr; - ck.compressed_bfr = bfr_c; - ck.column_index_blob = bfr_i; + EncColumnChunk& ck = chunks[r][i]; + ck.uncompressed_bfr = bfr; + ck.compressed_bfr = bfr_c; + ck.column_index_blob = bfr_i; bfr += ck.bfr_size; bfr_c += ck.compressed_size; if (stats_granularity == statistics_freq::STATISTICS_COLUMN) { @@ -1960,7 +1956,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, if (ck.ck_stat_size != 0) { std::vector const stats_blob = cudf::detail::make_std_vector_sync( device_span(dev_bfr, ck.ck_stat_size), stream); - cudf::io::parquet::CompactProtocolReader cp(stats_blob.data(), stats_blob.size()); + CompactProtocolReader cp(stats_blob.data(), stats_blob.size()); cp.read(&column_chunk_meta.statistics); need_sync = true; } @@ -2142,8 +2138,8 @@ void writer::impl::write(table_view const& input, std::vector co void writer::impl::write_parquet_data_to_sink( std::unique_ptr& updated_agg_meta, - device_span pages, - host_2dspan chunks, + device_span pages, + host_2dspan chunks, host_span global_rowgroup_base, host_span first_rg_in_part, host_span batch_list, @@ -2209,7 +2205,7 @@ void writer::impl::write_parquet_data_to_sink( int const global_r = global_rowgroup_base[p] + r - first_rg_in_part[p]; auto const& row_group = _agg_meta->file(p).row_groups[global_r]; for (std::size_t i = 0; i < num_columns; i++) { - gpu::EncColumnChunk const& ck = chunks[r][i]; + EncColumnChunk const& ck = chunks[r][i]; auto const& column_chunk_meta = 
row_group.columns[i].meta_data; // start transfer of the column index @@ -2392,7 +2388,4 @@ std::unique_ptr> writer::merge_row_group_metadata( return std::make_unique>(std::move(output)); } -} // namespace parquet -} // namespace detail -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp index 89ef85ba2bd..1d27a8400c8 100644 --- a/cpp/src/io/parquet/writer_impl.hpp +++ b/cpp/src/io/parquet/writer_impl.hpp @@ -38,15 +38,11 @@ #include #include -namespace cudf { -namespace io { -namespace detail { -namespace parquet { +namespace cudf::io::parquet::detail { + // Forward internal classes struct aggregate_writer_metadata; -using namespace cudf::io::parquet; -using namespace cudf::io; using cudf::detail::device_2dspan; using cudf::detail::host_2dspan; using cudf::detail::hostdevice_2dvector; @@ -66,7 +62,7 @@ class writer::impl { */ explicit impl(std::vector> sinks, parquet_writer_options const& options, - single_write_mode mode, + cudf::io::detail::single_write_mode mode, rmm::cuda_stream_view stream); /** @@ -79,7 +75,7 @@ class writer::impl { */ explicit impl(std::vector> sinks, chunked_parquet_writer_options const& options, - single_write_mode mode, + cudf::io::detail::single_write_mode mode, rmm::cuda_stream_view stream); /** @@ -139,8 +135,8 @@ class writer::impl { * @param[out] bounce_buffer Temporary host output buffer */ void write_parquet_data_to_sink(std::unique_ptr& updated_agg_meta, - device_span pages, - host_2dspan chunks, + device_span pages, + host_2dspan chunks, host_span global_rowgroup_base, host_span first_rg_in_part, host_span batch_list, @@ -164,9 +160,10 @@ class writer::impl { bool const _write_v2_headers; int32_t const _column_index_truncate_length; std::vector> const _kv_meta; // Optional user metadata. 
- single_write_mode const _single_write_mode; // Special parameter only used by `write()` to - // indicate that we are guaranteeing a single table - // write. This enables some internal optimizations. + cudf::io::detail::single_write_mode const + _single_write_mode; // Special parameter only used by `write()` to + // indicate that we are guaranteeing a single table + // write. This enables some internal optimizations. std::vector> const _out_sink; // Internal states, filled during `write()` and written to sink during `write` and `close()`. @@ -180,7 +177,4 @@ class writer::impl { bool _closed = false; // To track if the output has been written to sink. }; -} // namespace parquet -} // namespace detail -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 73c946a5feb..3e5d7033e60 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -200,29 +200,30 @@ std::unique_ptr make_parquet_list_list_col( // of the file to populate the FileMetaData pointed to by file_meta_data. // throws cudf::logic_error if the file or metadata is invalid. 
void read_footer(std::unique_ptr const& source, - cudf::io::parquet::FileMetaData* file_meta_data) + cudf::io::parquet::detail::FileMetaData* file_meta_data) { - constexpr auto header_len = sizeof(cudf::io::parquet::file_header_s); - constexpr auto ender_len = sizeof(cudf::io::parquet::file_ender_s); + constexpr auto header_len = sizeof(cudf::io::parquet::detail::file_header_s); + constexpr auto ender_len = sizeof(cudf::io::parquet::detail::file_ender_s); auto const len = source->size(); auto const header_buffer = source->host_read(0, header_len); auto const header = - reinterpret_cast(header_buffer->data()); + reinterpret_cast(header_buffer->data()); auto const ender_buffer = source->host_read(len - ender_len, ender_len); - auto const ender = reinterpret_cast(ender_buffer->data()); + auto const ender = + reinterpret_cast(ender_buffer->data()); // checks for valid header, footer, and file length ASSERT_GT(len, header_len + ender_len); - ASSERT_TRUE(header->magic == cudf::io::parquet::parquet_magic && - ender->magic == cudf::io::parquet::parquet_magic); + ASSERT_TRUE(header->magic == cudf::io::parquet::detail::parquet_magic && + ender->magic == cudf::io::parquet::detail::parquet_magic); ASSERT_TRUE(ender->footer_len != 0 && ender->footer_len <= (len - header_len - ender_len)); // parquet files end with 4-byte footer_length and 4-byte magic == "PAR1" // seek backwards from the end of the file (footer_length + 8 bytes of ender) auto const footer_buffer = source->host_read(len - ender->footer_len - ender_len, ender->footer_len); - cudf::io::parquet::CompactProtocolReader cp(footer_buffer->data(), ender->footer_len); + cudf::io::parquet::detail::CompactProtocolReader cp(footer_buffer->data(), ender->footer_len); // returns true on success bool res = cp.read(file_meta_data); @@ -233,14 +234,14 @@ void read_footer(std::unique_ptr const& source, // this assumes the data is uncompressed. // throws cudf::logic_error if the page_loc data is invalid. 
int read_dict_bits(std::unique_ptr const& source, - cudf::io::parquet::PageLocation const& page_loc) + cudf::io::parquet::detail::PageLocation const& page_loc) { CUDF_EXPECTS(page_loc.offset > 0, "Cannot find page header"); CUDF_EXPECTS(page_loc.compressed_page_size > 0, "Invalid page header length"); - cudf::io::parquet::PageHeader page_hdr; + cudf::io::parquet::detail::PageHeader page_hdr; auto const page_buf = source->host_read(page_loc.offset, page_loc.compressed_page_size); - cudf::io::parquet::CompactProtocolReader cp(page_buf->data(), page_buf->size()); + cudf::io::parquet::detail::CompactProtocolReader cp(page_buf->data(), page_buf->size()); bool res = cp.read(&page_hdr); CUDF_EXPECTS(res, "Cannot parse page header"); @@ -252,15 +253,16 @@ int read_dict_bits(std::unique_ptr const& source, // read column index from datasource at location indicated by chunk, // parse and return as a ColumnIndex struct. // throws cudf::logic_error if the chunk data is invalid. -cudf::io::parquet::ColumnIndex read_column_index( - std::unique_ptr const& source, cudf::io::parquet::ColumnChunk const& chunk) +cudf::io::parquet::detail::ColumnIndex read_column_index( + std::unique_ptr const& source, + cudf::io::parquet::detail::ColumnChunk const& chunk) { CUDF_EXPECTS(chunk.column_index_offset > 0, "Cannot find column index"); CUDF_EXPECTS(chunk.column_index_length > 0, "Invalid column index length"); - cudf::io::parquet::ColumnIndex colidx; + cudf::io::parquet::detail::ColumnIndex colidx; auto const ci_buf = source->host_read(chunk.column_index_offset, chunk.column_index_length); - cudf::io::parquet::CompactProtocolReader cp(ci_buf->data(), ci_buf->size()); + cudf::io::parquet::detail::CompactProtocolReader cp(ci_buf->data(), ci_buf->size()); bool res = cp.read(&colidx); CUDF_EXPECTS(res, "Cannot parse column index"); return colidx; @@ -269,22 +271,24 @@ cudf::io::parquet::ColumnIndex read_column_index( // read offset index from datasource at location indicated by chunk, // parse 
and return as an OffsetIndex struct. // throws cudf::logic_error if the chunk data is invalid. -cudf::io::parquet::OffsetIndex read_offset_index( - std::unique_ptr const& source, cudf::io::parquet::ColumnChunk const& chunk) +cudf::io::parquet::detail::OffsetIndex read_offset_index( + std::unique_ptr const& source, + cudf::io::parquet::detail::ColumnChunk const& chunk) { CUDF_EXPECTS(chunk.offset_index_offset > 0, "Cannot find offset index"); CUDF_EXPECTS(chunk.offset_index_length > 0, "Invalid offset index length"); - cudf::io::parquet::OffsetIndex offidx; + cudf::io::parquet::detail::OffsetIndex offidx; auto const oi_buf = source->host_read(chunk.offset_index_offset, chunk.offset_index_length); - cudf::io::parquet::CompactProtocolReader cp(oi_buf->data(), oi_buf->size()); + cudf::io::parquet::detail::CompactProtocolReader cp(oi_buf->data(), oi_buf->size()); bool res = cp.read(&offidx); CUDF_EXPECTS(res, "Cannot parse offset index"); return offidx; } // Return as a Statistics from the column chunk -cudf::io::parquet::Statistics const& get_statistics(cudf::io::parquet::ColumnChunk const& chunk) +cudf::io::parquet::detail::Statistics const& get_statistics( + cudf::io::parquet::detail::ColumnChunk const& chunk) { return chunk.meta_data.statistics; } @@ -292,15 +296,16 @@ cudf::io::parquet::Statistics const& get_statistics(cudf::io::parquet::ColumnChu // read page header from datasource at location indicated by page_loc, // parse and return as a PageHeader struct. // throws cudf::logic_error if the page_loc data is invalid. 
-cudf::io::parquet::PageHeader read_page_header(std::unique_ptr const& source, - cudf::io::parquet::PageLocation const& page_loc) +cudf::io::parquet::detail::PageHeader read_page_header( + std::unique_ptr const& source, + cudf::io::parquet::detail::PageLocation const& page_loc) { CUDF_EXPECTS(page_loc.offset > 0, "Cannot find page header"); CUDF_EXPECTS(page_loc.compressed_page_size > 0, "Invalid page header length"); - cudf::io::parquet::PageHeader page_hdr; + cudf::io::parquet::detail::PageHeader page_hdr; auto const page_buf = source->host_read(page_loc.offset, page_loc.compressed_page_size); - cudf::io::parquet::CompactProtocolReader cp(page_buf->data(), page_buf->size()); + cudf::io::parquet::detail::CompactProtocolReader cp(page_buf->data(), page_buf->size()); bool res = cp.read(&page_hdr); CUDF_EXPECTS(res, "Cannot parse page header"); return page_hdr; @@ -3686,7 +3691,7 @@ TEST_F(ParquetWriterTest, CheckPageRows) // check first page header and make sure it has only page_rows values auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); ASSERT_GT(fmd.row_groups.size(), 0); @@ -3697,7 +3702,7 @@ TEST_F(ParquetWriterTest, CheckPageRows) // read first data page header. sizeof(PageHeader) is not exact, but the thrift encoded // version should be smaller than size of the struct. 
auto const ph = read_page_header( - source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::PageHeader), 0}); + source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::detail::PageHeader), 0}); EXPECT_EQ(ph.data_page_header.num_values, page_rows); } @@ -3722,7 +3727,7 @@ TEST_F(ParquetWriterTest, CheckPageRowsAdjusted) // check first page header and make sure it has only page_rows values auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); ASSERT_GT(fmd.row_groups.size(), 0); @@ -3733,7 +3738,7 @@ TEST_F(ParquetWriterTest, CheckPageRowsAdjusted) // read first data page header. sizeof(PageHeader) is not exact, but the thrift encoded // version should be smaller than size of the struct. auto const ph = read_page_header( - source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::PageHeader), 0}); + source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::detail::PageHeader), 0}); EXPECT_LE(ph.data_page_header.num_values, rows_per_page); } @@ -3759,7 +3764,7 @@ TEST_F(ParquetWriterTest, CheckPageRowsTooSmall) // check that file is written correctly when rows/page < fragment size auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); ASSERT_TRUE(fmd.row_groups.size() > 0); @@ -3770,7 +3775,7 @@ TEST_F(ParquetWriterTest, CheckPageRowsTooSmall) // read first data page header. sizeof(PageHeader) is not exact, but the thrift encoded // version should be smaller than size of the struct. 
auto const ph = read_page_header( - source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::PageHeader), 0}); + source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::detail::PageHeader), 0}); // there should be only one page since the fragment size is larger than rows_per_page EXPECT_EQ(ph.data_page_header.num_values, num_rows); @@ -3798,7 +3803,7 @@ TEST_F(ParquetWriterTest, Decimal128Stats) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4031,7 +4036,7 @@ TYPED_TEST(ParquetWriterComparableTypeTest, ThreeColumnSorted) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); ASSERT_GT(fmd.row_groups.size(), 0); @@ -4041,10 +4046,10 @@ TYPED_TEST(ParquetWriterComparableTypeTest, ThreeColumnSorted) // now check that the boundary order for chunk 1 is ascending, // chunk 2 is descending, and chunk 3 is unordered - cudf::io::parquet::BoundaryOrder expected_orders[] = { - cudf::io::parquet::BoundaryOrder::ASCENDING, - cudf::io::parquet::BoundaryOrder::DESCENDING, - cudf::io::parquet::BoundaryOrder::UNORDERED}; + cudf::io::parquet::detail::BoundaryOrder expected_orders[] = { + cudf::io::parquet::detail::BoundaryOrder::ASCENDING, + cudf::io::parquet::detail::BoundaryOrder::DESCENDING, + cudf::io::parquet::detail::BoundaryOrder::UNORDERED}; for (std::size_t i = 0; i < columns.size(); i++) { auto const ci = read_column_index(source, columns[i]); @@ -4067,15 +4072,15 @@ int32_t compare(T& v1, T& v2) // 1 if v1 > v2. 
int32_t compare_binary(std::vector const& v1, std::vector const& v2, - cudf::io::parquet::Type ptype, - cudf::io::parquet::ConvertedType ctype) + cudf::io::parquet::detail::Type ptype, + cudf::io::parquet::detail::ConvertedType ctype) { switch (ptype) { - case cudf::io::parquet::INT32: + case cudf::io::parquet::detail::INT32: switch (ctype) { - case cudf::io::parquet::UINT_8: - case cudf::io::parquet::UINT_16: - case cudf::io::parquet::UINT_32: + case cudf::io::parquet::detail::UINT_8: + case cudf::io::parquet::detail::UINT_16: + case cudf::io::parquet::detail::UINT_32: return compare(*(reinterpret_cast(v1.data())), *(reinterpret_cast(v2.data()))); default: @@ -4083,23 +4088,23 @@ int32_t compare_binary(std::vector const& v1, *(reinterpret_cast(v2.data()))); } - case cudf::io::parquet::INT64: - if (ctype == cudf::io::parquet::UINT_64) { + case cudf::io::parquet::detail::INT64: + if (ctype == cudf::io::parquet::detail::UINT_64) { return compare(*(reinterpret_cast(v1.data())), *(reinterpret_cast(v2.data()))); } return compare(*(reinterpret_cast(v1.data())), *(reinterpret_cast(v2.data()))); - case cudf::io::parquet::FLOAT: + case cudf::io::parquet::detail::FLOAT: return compare(*(reinterpret_cast(v1.data())), *(reinterpret_cast(v2.data()))); - case cudf::io::parquet::DOUBLE: + case cudf::io::parquet::detail::DOUBLE: return compare(*(reinterpret_cast(v1.data())), *(reinterpret_cast(v2.data()))); - case cudf::io::parquet::BYTE_ARRAY: { + case cudf::io::parquet::detail::BYTE_ARRAY: { int32_t v1sz = v1.size(); int32_t v2sz = v2.size(); int32_t ret = memcmp(v1.data(), v2.data(), std::min(v1sz, v2sz)); @@ -4142,7 +4147,7 @@ TEST_P(ParquetV2Test, LargeColumnIndex) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4164,10 +4169,10 @@ TEST_P(ParquetV2Test, LargeColumnIndex) TEST_P(ParquetV2Test, 
CheckColumnOffsetIndex) { - constexpr auto num_rows = 100000; - auto const is_v2 = GetParam(); - auto const expected_hdr_type = - is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + constexpr auto num_rows = 100000; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; // fixed length strings auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { @@ -4210,7 +4215,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndex) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4255,10 +4260,10 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndex) TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls) { - constexpr auto num_rows = 100000; - auto const is_v2 = GetParam(); - auto const expected_hdr_type = - is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + constexpr auto num_rows = 100000; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; // fixed length strings auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { @@ -4311,7 +4316,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4362,10 +4367,10 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls) TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn) { - constexpr auto num_rows = 100000; - auto const is_v2 = GetParam(); - auto const expected_hdr_type = - is_v2 ? 
cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + constexpr auto num_rows = 100000; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; // fixed length strings auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { @@ -4403,7 +4408,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4458,9 +4463,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn) TEST_P(ParquetV2Test, CheckColumnOffsetIndexStruct) { - auto const is_v2 = GetParam(); - auto const expected_hdr_type = - is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; auto c0 = testdata::ascending(); @@ -4495,7 +4500,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStruct) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4542,9 +4547,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStruct) TEST_P(ParquetV2Test, CheckColumnOffsetIndexStructNulls) { - auto const is_v2 = GetParam(); - auto const expected_hdr_type = - is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? 
cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; auto validity2 = cudf::detail::make_counting_transform_iterator(0, [](cudf::size_type i) { return i % 2; }); @@ -4586,7 +4591,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStructNulls) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4616,9 +4621,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStructNulls) TEST_P(ParquetV2Test, CheckColumnIndexListWithNulls) { - auto const is_v2 = GetParam(); - auto const expected_hdr_type = - is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; using cudf::test::iterators::null_at; using cudf::test::iterators::nulls_at; @@ -4711,7 +4716,7 @@ TEST_P(ParquetV2Test, CheckColumnIndexListWithNulls) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4812,7 +4817,7 @@ TEST_F(ParquetWriterTest, CheckColumnIndexTruncation) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4870,7 +4875,7 @@ TEST_F(ParquetWriterTest, BinaryColumnIndexTruncation) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -5030,10 +5035,10 @@ TEST_F(ParquetReaderTest, NestedByteArray) cudf::io::write_parquet(out_opts); 
auto source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); - EXPECT_EQ(fmd.schema[5].type, cudf::io::parquet::Type::BYTE_ARRAY); + EXPECT_EQ(fmd.schema[5].type, cudf::io::parquet::detail::Type::BYTE_ARRAY); std::vector md{ {}, @@ -5081,12 +5086,12 @@ TEST_F(ParquetWriterTest, ByteArrayStats) auto result = cudf::io::read_parquet(in_opts); auto source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); - EXPECT_EQ(fmd.schema[1].type, cudf::io::parquet::Type::BYTE_ARRAY); - EXPECT_EQ(fmd.schema[2].type, cudf::io::parquet::Type::BYTE_ARRAY); + EXPECT_EQ(fmd.schema[1].type, cudf::io::parquet::detail::Type::BYTE_ARRAY); + EXPECT_EQ(fmd.schema[2].type, cudf::io::parquet::detail::Type::BYTE_ARRAY); auto const stats0 = get_statistics(fmd.row_groups[0].columns[0]); auto const stats1 = get_statistics(fmd.row_groups[0].columns[1]); @@ -5137,9 +5142,9 @@ TEST_F(ParquetReaderTest, StructByteArray) TEST_F(ParquetReaderTest, NestingOptimizationTest) { - // test nesting levels > cudf::io::parquet::gpu::max_cacheable_nesting_decode_info deep. + // test nesting levels > cudf::io::parquet::detail::max_cacheable_nesting_decode_info deep. 
constexpr cudf::size_type num_nesting_levels = 16; - static_assert(num_nesting_levels > cudf::io::parquet::gpu::max_cacheable_nesting_decode_info); + static_assert(num_nesting_levels > cudf::io::parquet::detail::max_cacheable_nesting_decode_info); constexpr cudf::size_type rows_per_level = 2; constexpr cudf::size_type num_values = (1 << num_nesting_levels) * rows_per_level; @@ -5206,13 +5211,13 @@ TEST_F(ParquetWriterTest, SingleValueDictionaryTest) // make sure dictionary was used auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); auto used_dict = [&fmd]() { for (auto enc : fmd.row_groups[0].columns[0].meta_data.encodings) { - if (enc == cudf::io::parquet::Encoding::PLAIN_DICTIONARY or - enc == cudf::io::parquet::Encoding::RLE_DICTIONARY) { + if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or + enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) { return true; } } @@ -5252,13 +5257,13 @@ TEST_F(ParquetWriterTest, DictionaryNeverTest) // make sure dictionary was not used auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); auto used_dict = [&fmd]() { for (auto enc : fmd.row_groups[0].columns[0].meta_data.encodings) { - if (enc == cudf::io::parquet::Encoding::PLAIN_DICTIONARY or - enc == cudf::io::parquet::Encoding::RLE_DICTIONARY) { + if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or + enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) { return true; } } @@ -5303,13 +5308,13 @@ TEST_F(ParquetWriterTest, DictionaryAdaptiveTest) // make sure dictionary was used as expected. col0 should use one, // col1 should not. 
auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); auto used_dict = [&fmd](int col) { for (auto enc : fmd.row_groups[0].columns[col].meta_data.encodings) { - if (enc == cudf::io::parquet::Encoding::PLAIN_DICTIONARY or - enc == cudf::io::parquet::Encoding::RLE_DICTIONARY) { + if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or + enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) { return true; } } @@ -5354,13 +5359,13 @@ TEST_F(ParquetWriterTest, DictionaryAlwaysTest) // make sure dictionary was used for both columns auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); auto used_dict = [&fmd](int col) { for (auto enc : fmd.row_groups[0].columns[col].meta_data.encodings) { - if (enc == cudf::io::parquet::Encoding::PLAIN_DICTIONARY or - enc == cudf::io::parquet::Encoding::RLE_DICTIONARY) { + if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or + enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) { return true; } } @@ -5438,13 +5443,13 @@ TEST_P(ParquetSizedTest, DictionaryTest) // make sure dictionary was used auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); auto used_dict = [&fmd]() { for (auto enc : fmd.row_groups[0].columns[0].meta_data.encodings) { - if (enc == cudf::io::parquet::Encoding::PLAIN_DICTIONARY or - enc == cudf::io::parquet::Encoding::RLE_DICTIONARY) { + if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or + enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) { return true; } } @@ -6664,7 +6669,7 @@ TEST_F(ParquetWriterTest, PreserveNullability) TEST_P(ParquetV2Test, CheckEncodings) { - using cudf::io::parquet::Encoding; + 
using cudf::io::parquet::detail::Encoding; constexpr auto num_rows = 100'000; auto const is_v2 = GetParam(); @@ -6697,7 +6702,7 @@ TEST_P(ParquetV2Test, CheckEncodings) }; auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); auto const& chunk0_enc = fmd.row_groups[0].columns[0].meta_data.encodings; From e345620ddaf5d8ac87e2428a84508ecfec2ba4f8 Mon Sep 17 00:00:00 2001 From: Suraj Aralihalli Date: Mon, 9 Oct 2023 10:13:25 -0700 Subject: [PATCH 12/12] Add stream parameter to List Manipulation and Operations APIs (#14248) I have organized the public List APIs into **three** distinct categories based on their functionality, simplifying the PRs for easier and shorter reviews. This particular PR introduces the `stream` parameter only to the `List Manipulation and Operations APIs`, which fall under `Section 1`. See next comment for other sections. 1. List Manipulation and Operations (`combine.hpp`, `contains.hpp`, `count_elements.hpp`) ``` concatenate_rows concatenate_list_elements contains_nulls contains - search_keys contains - search_key index_of - search_keys index_of - search_key count_elements ``` This PR addresses issues in the following files: 1. **column_wrapper.hpp**: - Corrects the improper passing of the stream value in the `make_lists_column` function. - Enables the missing cast to `lists_column_view`. - Substitutes `copy_bitmask` with `cudf::detail::copy_bitmask` to include the stream parameter. 2. **concatenate.cu:** - Substitutes `create_null_mask` with `cudf::detail::create_null_mask` to include the stream parameter. 
Authors: - Suraj Aralihalli (https://github.com/SurajAralihalli) - Vyas Ramasubramani (https://github.com/vyasr) - Yunsong Wang (https://github.com/PointKernel) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/cudf/pull/14248 --- cpp/include/cudf/lists/combine.hpp | 4 + cpp/include/cudf/lists/contains.hpp | 14 ++- cpp/include/cudf/lists/count_elements.hpp | 2 + cpp/include/cudf_test/column_wrapper.hpp | 45 +++++++--- .../combine/concatenate_list_elements.cu | 3 +- cpp/src/lists/combine/concatenate_rows.cu | 3 +- cpp/src/lists/contains.cu | 37 ++++---- cpp/src/lists/copying/concatenate.cu | 5 +- cpp/src/lists/count_elements.cu | 3 +- cpp/tests/CMakeLists.txt | 1 + cpp/tests/streams/lists_test.cpp | 87 +++++++++++++++++++ 11 files changed, 169 insertions(+), 35 deletions(-) create mode 100644 cpp/tests/streams/lists_test.cpp diff --git a/cpp/include/cudf/lists/combine.hpp b/cpp/include/cudf/lists/combine.hpp index 0bc76828fc3..0d9c1c157eb 100644 --- a/cpp/include/cudf/lists/combine.hpp +++ b/cpp/include/cudf/lists/combine.hpp @@ -57,6 +57,7 @@ enum class concatenate_null_policy { IGNORE, NULLIFY_OUTPUT_ROW }; * @param input Table of lists to be concatenated. * @param null_policy The parameter to specify whether a null list element will be ignored from * concatenation, or any concatenation involving a null element will result in a null list. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * @return A new column in which each row is a list resulted from concatenating all list elements in * the corresponding row of the input table. 
@@ -64,6 +65,7 @@ enum class concatenate_null_policy { IGNORE, NULLIFY_OUTPUT_ROW }; std::unique_ptr concatenate_rows( table_view const& input, concatenate_null_policy null_policy = concatenate_null_policy::IGNORE, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -86,6 +88,7 @@ std::unique_ptr concatenate_rows( * @param input The lists column containing lists of list elements to concatenate. * @param null_policy The parameter to specify whether a null list element will be ignored from * concatenation, or any concatenation involving a null element will result in a null list. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * @return A new column in which each row is a list resulted from concatenating all list elements in * the corresponding row of the input lists column. @@ -93,6 +96,7 @@ std::unique_ptr concatenate_rows( std::unique_ptr concatenate_list_elements( column_view const& input, concatenate_null_policy null_policy = concatenate_null_policy::IGNORE, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/lists/contains.hpp b/cpp/include/cudf/lists/contains.hpp index 21c2ca1d64e..7cf67ec9205 100644 --- a/cpp/include/cudf/lists/contains.hpp +++ b/cpp/include/cudf/lists/contains.hpp @@ -42,12 +42,14 @@ namespace lists { * * @param lists Lists column whose `n` rows are to be searched * @param search_key The scalar key to be looked up in each list row + * @param stream CUDA stream used for device memory operations and kernel launches. 
* @param mr Device memory resource used to allocate the returned column's device memory * @return BOOL8 column of `n` rows with the result of the lookup */ std::unique_ptr contains( cudf::lists_column_view const& lists, cudf::scalar const& search_key, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -63,13 +65,15 @@ std::unique_ptr contains( * 2. The list row `lists[i]` is null * * @param lists Lists column whose `n` rows are to be searched - * @param search_keys Column of elements to be looked up in each list row + * @param search_keys Column of elements to be looked up in each list row. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory * @return BOOL8 column of `n` rows with the result of the lookup */ std::unique_ptr contains( cudf::lists_column_view const& lists, cudf::column_view const& search_keys, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -84,12 +88,14 @@ std::unique_ptr contains( * A row with an empty list will always return false. * Nulls inside non-null nested elements (such as lists or structs) are not considered. * - * @param lists Lists column whose `n` rows are to be searched + * @param lists Lists column whose `n` rows are to be searched. + * @param stream CUDA stream used for device memory operations and kernel launches. 
* @param mr Device memory resource used to allocate the returned column's device memory * @return BOOL8 column of `n` rows with the result of the lookup */ std::unique_ptr contains_nulls( cudf::lists_column_view const& lists, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -125,6 +131,7 @@ enum class duplicate_find_option : int32_t { * @param search_key The scalar key to be looked up in each list row * @param find_option Whether to return the position of the first match (`FIND_FIRST`) or * last (`FIND_LAST`) + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return column of `n` rows with the location of the `search_key` */ @@ -132,6 +139,7 @@ std::unique_ptr index_of( cudf::lists_column_view const& lists, cudf::scalar const& search_key, duplicate_find_option find_option = duplicate_find_option::FIND_FIRST, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -160,6 +168,7 @@ std::unique_ptr index_of( * `lists` * @param find_option Whether to return the position of the first match (`FIND_FIRST`) or * last (`FIND_LAST`) + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return column of `n` rows with the location of the `search_key` */ @@ -167,6 +176,7 @@ std::unique_ptr index_of( cudf::lists_column_view const& lists, cudf::column_view const& search_keys, duplicate_find_option find_option = duplicate_find_option::FIND_FIRST, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git 
a/cpp/include/cudf/lists/count_elements.hpp b/cpp/include/cudf/lists/count_elements.hpp index 552ba058b93..e4bd0dca9ae 100644 --- a/cpp/include/cudf/lists/count_elements.hpp +++ b/cpp/include/cudf/lists/count_elements.hpp @@ -45,11 +45,13 @@ namespace lists { * in the output column. * * @param input Input lists column + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New column with the number of elements for each row */ std::unique_ptr count_elements( lists_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of lists_elements group diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp index c0932b81dc3..e94dfea9dcf 100644 --- a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -1281,6 +1282,11 @@ class dictionary_column_wrapper : public detail::column_wrapper { template class lists_column_wrapper : public detail::column_wrapper { public: + /** + * @brief Cast to lists_column_view + */ + operator lists_column_view() const { return cudf::lists_column_view{wrapped->view()}; } + /** * @brief Construct a lists column containing a single list of fixed-width * type from an initializer list of values. 
@@ -1542,8 +1548,12 @@ class lists_column_wrapper : public detail::column_wrapper { rmm::device_buffer&& null_mask) { // construct the list column - wrapped = make_lists_column( - num_rows, std::move(offsets), std::move(values), null_count, std::move(null_mask)); + wrapped = make_lists_column(num_rows, + std::move(offsets), + std::move(values), + null_count, + std::move(null_mask), + cudf::test::get_default_stream()); } /** @@ -1618,8 +1628,12 @@ class lists_column_wrapper : public detail::column_wrapper { }(); // construct the list column - wrapped = make_lists_column( - cols.size(), std::move(offsets), std::move(data), null_count, std::move(null_mask)); + wrapped = make_lists_column(cols.size(), + std::move(offsets), + std::move(data), + null_count, + std::move(null_mask), + cudf::test::get_default_stream()); } /** @@ -1647,8 +1661,12 @@ class lists_column_wrapper : public detail::column_wrapper { depth = 0; size_type num_elements = offsets->size() == 0 ? 0 : offsets->size() - 1; - wrapped = - make_lists_column(num_elements, std::move(offsets), std::move(c), 0, rmm::device_buffer{}); + wrapped = make_lists_column(num_elements, + std::move(offsets), + std::move(c), + 0, + rmm::device_buffer{}, + cudf::test::get_default_stream()); } /** @@ -1697,12 +1715,15 @@ class lists_column_wrapper : public detail::column_wrapper { } lists_column_view lcv(col); - return make_lists_column(col.size(), - std::make_unique(lcv.offsets()), - normalize_column(lists_column_view(col).child(), - lists_column_view(expected_hierarchy).child()), - col.null_count(), - copy_bitmask(col)); + return make_lists_column( + col.size(), + std::make_unique(lcv.offsets()), + normalize_column(lists_column_view(col).child(), + lists_column_view(expected_hierarchy).child()), + col.null_count(), + cudf::detail::copy_bitmask( + col, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()), + cudf::test::get_default_stream()); } std::pair, std::vector>> preprocess_columns( diff --git 
a/cpp/src/lists/combine/concatenate_list_elements.cu b/cpp/src/lists/combine/concatenate_list_elements.cu index fbe297765f8..99dbd55678b 100644 --- a/cpp/src/lists/combine/concatenate_list_elements.cu +++ b/cpp/src/lists/combine/concatenate_list_elements.cu @@ -271,10 +271,11 @@ std::unique_ptr concatenate_list_elements(column_view const& input, */ std::unique_ptr concatenate_list_elements(column_view const& input, concatenate_null_policy null_policy, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::concatenate_list_elements(input, null_policy, cudf::get_default_stream(), mr); + return detail::concatenate_list_elements(input, null_policy, stream, mr); } } // namespace lists diff --git a/cpp/src/lists/combine/concatenate_rows.cu b/cpp/src/lists/combine/concatenate_rows.cu index 658538b0195..49be7b5ff17 100644 --- a/cpp/src/lists/combine/concatenate_rows.cu +++ b/cpp/src/lists/combine/concatenate_rows.cu @@ -305,10 +305,11 @@ std::unique_ptr concatenate_rows(table_view const& input, */ std::unique_ptr concatenate_rows(table_view const& input, concatenate_null_policy null_policy, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::concatenate_rows(input, null_policy, cudf::get_default_stream(), mr); + return detail::concatenate_rows(input, null_policy, stream, mr); } } // namespace lists diff --git a/cpp/src/lists/contains.cu b/cpp/src/lists/contains.cu index df1d043bdb6..4733a5d63a8 100644 --- a/cpp/src/lists/contains.cu +++ b/cpp/src/lists/contains.cu @@ -287,7 +287,7 @@ std::unique_ptr index_of(lists_column_view const& lists, } auto search_key_col = cudf::make_column_from_scalar(search_key, lists.size(), stream, mr); - return index_of(lists, search_key_col->view(), find_option, stream, mr); + return detail::index_of(lists, search_key_col->view(), find_option, stream, mr); } std::unique_ptr index_of(lists_column_view const& lists, @@ -306,11 +306,11 @@ 
std::unique_ptr contains(lists_column_view const& lists, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto key_indices = index_of(lists, - search_key, - duplicate_find_option::FIND_FIRST, - stream, - rmm::mr::get_current_device_resource()); + auto key_indices = detail::index_of(lists, + search_key, + duplicate_find_option::FIND_FIRST, + stream, + rmm::mr::get_current_device_resource()); return to_contains(std::move(key_indices), stream, mr); } @@ -322,11 +322,11 @@ std::unique_ptr contains(lists_column_view const& lists, CUDF_EXPECTS(search_keys.size() == lists.size(), "Number of search keys must match list column size."); - auto key_indices = index_of(lists, - search_keys, - duplicate_find_option::FIND_FIRST, - stream, - rmm::mr::get_current_device_resource()); + auto key_indices = detail::index_of(lists, + search_keys, + duplicate_find_option::FIND_FIRST, + stream, + rmm::mr::get_current_device_resource()); return to_contains(std::move(key_indices), stream, mr); } @@ -364,43 +364,48 @@ std::unique_ptr contains_nulls(lists_column_view const& lists, std::unique_ptr contains(lists_column_view const& lists, cudf::scalar const& search_key, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::contains(lists, search_key, cudf::get_default_stream(), mr); + return detail::contains(lists, search_key, stream, mr); } std::unique_ptr contains(lists_column_view const& lists, column_view const& search_keys, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::contains(lists, search_keys, cudf::get_default_stream(), mr); + return detail::contains(lists, search_keys, stream, mr); } std::unique_ptr contains_nulls(lists_column_view const& lists, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::contains_nulls(lists, cudf::get_default_stream(), mr); + return detail::contains_nulls(lists, stream, 
mr); } std::unique_ptr index_of(lists_column_view const& lists, cudf::scalar const& search_key, duplicate_find_option find_option, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::index_of(lists, search_key, find_option, cudf::get_default_stream(), mr); + return detail::index_of(lists, search_key, find_option, stream, mr); } std::unique_ptr index_of(lists_column_view const& lists, column_view const& search_keys, duplicate_find_option find_option, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::index_of(lists, search_keys, find_option, cudf::get_default_stream(), mr); + return detail::index_of(lists, search_keys, find_option, stream, mr); } } // namespace cudf::lists diff --git a/cpp/src/lists/copying/concatenate.cu b/cpp/src/lists/copying/concatenate.cu index ddd0dfbe084..5407b88236f 100644 --- a/cpp/src/lists/copying/concatenate.cu +++ b/cpp/src/lists/copying/concatenate.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -123,8 +124,8 @@ std::unique_ptr concatenate(host_span columns, // if any of the input columns have nulls, construct the output mask bool const has_nulls = std::any_of(columns.begin(), columns.end(), [](auto const& col) { return col.has_nulls(); }); - rmm::device_buffer null_mask = create_null_mask( - total_list_count, has_nulls ? mask_state::UNINITIALIZED : mask_state::UNALLOCATED); + rmm::device_buffer null_mask = cudf::detail::create_null_mask( + total_list_count, has_nulls ? mask_state::UNINITIALIZED : mask_state::UNALLOCATED, stream, mr); auto null_mask_data = static_cast(null_mask.data()); auto const null_count = has_nulls ? 
cudf::detail::concatenate_masks(columns, null_mask_data, stream) : size_type{0}; diff --git a/cpp/src/lists/count_elements.cu b/cpp/src/lists/count_elements.cu index 40a14d805e1..2fd0851067a 100644 --- a/cpp/src/lists/count_elements.cu +++ b/cpp/src/lists/count_elements.cu @@ -73,10 +73,11 @@ std::unique_ptr count_elements(lists_column_view const& input, // external APIS std::unique_ptr count_elements(lists_column_view const& input, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::count_elements(input, cudf::get_default_stream(), mr); + return detail::count_elements(input, stream, mr); } } // namespace lists diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index ac13c121530..ffaba7d6fa7 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -638,6 +638,7 @@ ConfigureTest( ) ConfigureTest(STREAM_SORTING_TEST streams/sorting_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_TEXT_TEST streams/text/ngrams_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_LISTS_TEST streams/lists_test.cpp STREAM_MODE testing) # ################################################################################################## # Install tests #################################################################################### diff --git a/cpp/tests/streams/lists_test.cpp b/cpp/tests/streams/lists_test.cpp new file mode 100644 index 00000000000..e292b551d83 --- /dev/null +++ b/cpp/tests/streams/lists_test.cpp @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include + +class ListTest : public cudf::test::BaseFixture {}; + +TEST_F(ListTest, ConcatenateRows) +{ + cudf::test::lists_column_wrapper list_col_1{{0, 1}, {2, 3}, {4, 5}}; + cudf::test::lists_column_wrapper list_col_2{{0, 1}, {2, 3}, {4, 5}}; + cudf::table_view lists_table({list_col_1, list_col_2}); + cudf::lists::concatenate_rows( + lists_table, cudf::lists::concatenate_null_policy::IGNORE, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, ConcatenateListElements) +{ + cudf::test::lists_column_wrapper ll_column{{{0, 1}, {2, 3}}, {{4, 5}, {6, 7}}}; + cudf::lists::concatenate_list_elements( + ll_column, cudf::lists::concatenate_null_policy::IGNORE, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, ContainsNulls) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3}, {4, 5}}; + cudf::lists::contains_nulls(list_col, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, ContainsSearchKey) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3}, {4, 5}}; + cudf::numeric_scalar search_key(2, true, cudf::test::get_default_stream()); + cudf::lists::contains(list_col, search_key, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, ContainsSearchKeys) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3}, {4, 5}}; + cudf::test::fixed_width_column_wrapper search_keys({1, 2, 3}); + cudf::lists::contains(list_col, search_keys, cudf::test::get_default_stream()); +} + +TEST_F(ListTest, IndexOfSearchKey) +{ + cudf::test::lists_column_wrapper list_col{{0, 
1}, {2, 3}, {4, 5}}; + cudf::numeric_scalar search_key(2, true, cudf::test::get_default_stream()); + cudf::lists::index_of(list_col, + search_key, + cudf::lists::duplicate_find_option::FIND_FIRST, + cudf::test::get_default_stream()); +} + +TEST_F(ListTest, IndexOfSearchKeys) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3}, {4, 5}}; + cudf::test::fixed_width_column_wrapper search_keys({1, 2, 3}); + cudf::lists::index_of(list_col, + search_keys, + cudf::lists::duplicate_find_option::FIND_FIRST, + cudf::test::get_default_stream()); +} + +TEST_F(ListTest, CountElements) +{ + cudf::test::lists_column_wrapper list_col{{0, 1}, {2, 3, 7}, {4, 5}}; + cudf::lists::count_elements(list_col, cudf::test::get_default_stream()); +}