diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index ab028eb89cc..dc2c81d1c77 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@cuda-120-arm with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@cuda-120-arm with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-upload-packages.yaml@cuda-120-arm with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -57,7 +57,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-120-arm with: arch: "amd64" branch: ${{ inputs.branch }} @@ -69,7 +69,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-cudf: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@cuda-120-arm with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -79,7 +79,7 @@ jobs: wheel-publish-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-publish.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-publish.yaml@cuda-120-arm with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -89,7 +89,7 @@ jobs: wheel-build-dask-cudf: needs: wheel-publish-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@cuda-120-arm with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1"))) build_type: ${{ inputs.build_type || 'branch' }} @@ -100,7 +100,7 @@ jobs: wheel-publish-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-publish.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-publish.yaml@cuda-120-arm with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 214f9c90b41..047b80f2e5c 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -26,34 +26,34 @@ jobs: - wheel-build-dask-cudf - wheel-tests-dask-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/pr-builder.yaml@cuda-120-arm checks: secrets: inherit - uses: 
rapidsai/shared-action-workflows/.github/workflows/checks.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@cuda-120-arm with: enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@cuda-120-arm with: build_type: pull-request conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@cuda-120-arm with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@cuda-120-arm with: build_type: pull-request conda-python-cudf-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@cuda-120-arm with: build_type: pull-request test_script: "ci/test_python_cudf.sh" @@ -61,14 +61,14 @@ jobs: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@cuda-120-arm with: build_type: pull-request test_script: "ci/test_python_other.sh" conda-java-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-120-arm with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -78,7 +78,7 @@ jobs: conda-notebook-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-120-arm with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -88,7 +88,7 @@ jobs: docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-120-arm with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -98,21 +98,21 @@ jobs: wheel-build-cudf: needs: checks secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@cuda-120-arm with: build_type: pull-request script: "ci/build_wheel_cudf.sh" wheel-tests-cudf: needs: wheel-build-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@cuda-120-arm with: build_type: pull-request script: ci/test_wheel_cudf.sh wheel-build-dask-cudf: needs: wheel-tests-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@branch-23.12 + uses: 
rapidsai/shared-action-workflows/.github/workflows/wheels-build.yaml@cuda-120-arm with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1"))) build_type: pull-request @@ -120,7 +120,7 @@ jobs: wheel-tests-dask-cudf: needs: wheel-build-dask-cudf secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@cuda-120-arm with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1"))) build_type: pull-request diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 9ca32bcfe03..e58227c30dc 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@cuda-120-arm with: build_type: nightly branch: ${{ inputs.branch }} @@ -24,7 +24,7 @@ jobs: sha: ${{ inputs.sha }} conda-cpp-memcheck-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-120-arm with: build_type: nightly branch: ${{ inputs.branch }} @@ -36,7 +36,7 @@ jobs: run_script: "ci/test_cpp_memcheck.sh" conda-python-cudf-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@cuda-120-arm with: build_type: nightly branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: conda-python-other-tests: # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@cuda-120-arm with: build_type: nightly branch: ${{ inputs.branch }} @@ -55,7 +55,7 @@ jobs: test_script: "ci/test_python_other.sh" conda-java-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-120-arm with: build_type: nightly branch: ${{ inputs.branch }} @@ -67,7 +67,7 @@ jobs: run_script: "ci/test_java.sh" conda-notebook-tests: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/custom-job.yaml@cuda-120-arm with: build_type: nightly branch: ${{ inputs.branch }} @@ -79,7 +79,7 @@ jobs: run_script: "ci/test_notebooks.sh" wheel-tests-cudf: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@cuda-120-arm with: build_type: nightly branch: ${{ inputs.branch }} @@ -88,7 +88,7 @@ jobs: script: ci/test_wheel_cudf.sh wheel-tests-dask-cudf: secrets: inherit - uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.12 + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@cuda-120-arm with: matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or 
.CUDA_VER == "12.0.1"))) build_type: nightly diff --git a/cpp/include/cudf/binaryop.hpp b/cpp/include/cudf/binaryop.hpp index 77d6a4d1e89..9df4b4eb00f 100644 --- a/cpp/include/cudf/binaryop.hpp +++ b/cpp/include/cudf/binaryop.hpp @@ -102,6 +102,7 @@ enum class binary_operator : int32_t { * @param rhs The right operand column * @param op The binary operator * @param output_type The desired data type of the output column + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return Output column of `output_type` type containing the result of * the binary operation @@ -115,6 +116,7 @@ std::unique_ptr binary_operation( column_view const& rhs, binary_operator op, data_type output_type, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -131,6 +133,7 @@ std::unique_ptr binary_operation( * @param rhs The right operand scalar * @param op The binary operator * @param output_type The desired data type of the output column + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return Output column of `output_type` type containing the result of * the binary operation @@ -144,6 +147,7 @@ std::unique_ptr binary_operation( scalar const& rhs, binary_operator op, data_type output_type, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -158,6 +162,7 @@ std::unique_ptr binary_operation( * @param rhs The right operand column * @param op The binary operator * @param output_type The desired data type of the output column + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return Output column of `output_type` type containing the result of * the binary operation @@ -172,6 +177,7 @@ std::unique_ptr binary_operation( column_view const& rhs, binary_operator op, data_type output_type, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -189,6 +195,7 @@ std::unique_ptr binary_operation( * @param output_type The desired data type of the output column. 
It is assumed * that output_type is compatible with the output data type * of the function in the PTX code + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return Output column of `output_type` type containing the result of * the binary operation @@ -201,6 +208,7 @@ std::unique_ptr binary_operation( column_view const& rhs, std::string const& ptx, data_type output_type, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** diff --git a/cpp/include/cudf/detail/utilities/pinned_host_vector.hpp b/cpp/include/cudf/detail/utilities/pinned_host_vector.hpp index 9e2b85ea129..eee974c8399 100644 --- a/cpp/include/cudf/detail/utilities/pinned_host_vector.hpp +++ b/cpp/include/cudf/detail/utilities/pinned_host_vector.hpp @@ -169,7 +169,12 @@ class pinned_allocator { * It is the responsibility of the caller to destroy * the objects stored at \p p. */ - __host__ inline void deallocate(pointer p, size_type /*cnt*/) { CUDF_CUDA_TRY(cudaFreeHost(p)); } + __host__ inline void deallocate(pointer p, size_type /*cnt*/) + { + auto dealloc_worked = cudaFreeHost(p); + (void)dealloc_worked; + assert(dealloc_worked == cudaSuccess); + } /** * @brief This method returns the maximum size of the \c cnt parameter diff --git a/cpp/include/cudf/io/detail/parquet.hpp b/cpp/include/cudf/io/detail/parquet.hpp index 074f690d2c7..0b8ee9676de 100644 --- a/cpp/include/cudf/io/detail/parquet.hpp +++ b/cpp/include/cudf/io/detail/parquet.hpp @@ -38,7 +38,7 @@ class parquet_reader_options; class parquet_writer_options; class chunked_parquet_writer_options; -namespace detail::parquet { +namespace parquet::detail { /** * @brief Class to read Parquet dataset data into columns. @@ -186,7 +186,7 @@ class writer { */ explicit writer(std::vector> sinks, parquet_writer_options const& options, - single_write_mode mode, + cudf::io::detail::single_write_mode mode, rmm::cuda_stream_view stream); /** @@ -201,7 +201,7 @@ class writer { */ explicit writer(std::vector> sinks, chunked_parquet_writer_options const& options, - single_write_mode mode, + cudf::io::detail::single_write_mode mode, rmm::cuda_stream_view stream); /** @@ -250,5 +250,5 @@ class writer { * metadata. */ parquet_metadata read_parquet_metadata(host_span const> sources); -} // namespace detail::parquet +} // namespace parquet::detail } // namespace cudf::io diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index deaf23d405a..6283099e700 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -499,7 +499,7 @@ class chunked_parquet_reader { [[nodiscard]] table_with_metadata read_chunk() const; private: - std::unique_ptr reader; + std::unique_ptr reader; }; /** @} */ // end of group @@ -1750,7 +1750,7 @@ class parquet_chunked_writer { std::vector const& column_chunks_file_paths = {}); /// Unique pointer to impl writer class - std::unique_ptr writer; + std::unique_ptr writer; }; /** @} */ // end of group diff --git a/cpp/include/cudf/lists/combine.hpp b/cpp/include/cudf/lists/combine.hpp index 0bc76828fc3..0d9c1c157eb 100644 --- a/cpp/include/cudf/lists/combine.hpp +++ b/cpp/include/cudf/lists/combine.hpp @@ -57,6 +57,7 @@ enum class concatenate_null_policy { IGNORE, NULLIFY_OUTPUT_ROW }; * @param input Table of lists to be concatenated. 
* @param null_policy The parameter to specify whether a null list element will be ignored from * concatenation, or any concatenation involving a null element will result in a null list. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * @return A new column in which each row is a list resulted from concatenating all list elements in * the corresponding row of the input table. @@ -64,6 +65,7 @@ enum class concatenate_null_policy { IGNORE, NULLIFY_OUTPUT_ROW }; std::unique_ptr<column> concatenate_rows( table_view const& input, concatenate_null_policy null_policy = concatenate_null_policy::IGNORE, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -86,6 +88,7 @@ std::unique_ptr<column> concatenate_rows( * @param input The lists column containing lists of list elements to concatenate. * @param null_policy The parameter to specify whether a null list element will be ignored from * concatenation, or any concatenation involving a null element will result in a null list. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * @return A new column in which each row is a list resulted from concatenating all list elements in * the corresponding row of the input lists column. @@ -93,6 +96,7 @@ std::unique_ptr<column> concatenate_list_elements( column_view const& input, concatenate_null_policy null_policy = concatenate_null_policy::IGNORE, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/lists/contains.hpp b/cpp/include/cudf/lists/contains.hpp index 21c2ca1d64e..7cf67ec9205 100644 --- a/cpp/include/cudf/lists/contains.hpp +++ b/cpp/include/cudf/lists/contains.hpp @@ -42,12 +42,14 @@ namespace lists { * * @param lists Lists column whose `n` rows are to be searched * @param search_key The scalar key to be looked up in each list row + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory * @return BOOL8 column of `n` rows with the result of the lookup */ std::unique_ptr<column> contains( cudf::lists_column_view const& lists, cudf::scalar const& search_key, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -63,13 +65,15 @@ std::unique_ptr<column> contains( * 2. The list row `lists[i]` is null * * @param lists Lists column whose `n` rows are to be searched - * @param search_keys Column of elements to be looked up in each list row + * @param search_keys Column of elements to be looked up in each list row. + * @param stream CUDA stream used for device memory operations and kernel launches.
* @param mr Device memory resource used to allocate the returned column's device memory * @return BOOL8 column of `n` rows with the result of the lookup */ std::unique_ptr<column> contains( cudf::lists_column_view const& lists, cudf::column_view const& search_keys, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -84,12 +88,14 @@ std::unique_ptr<column> contains( * A row with an empty list will always return false. * Nulls inside non-null nested elements (such as lists or structs) are not considered. * - * @param lists Lists column whose `n` rows are to be searched + * @param lists Lists column whose `n` rows are to be searched. + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory * @return BOOL8 column of `n` rows with the result of the lookup */ std::unique_ptr<column> contains_nulls( cudf::lists_column_view const& lists, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -125,6 +131,7 @@ enum class duplicate_find_option : int32_t { * @param search_key The scalar key to be looked up in each list row * @param find_option Whether to return the position of the first match (`FIND_FIRST`) or * last (`FIND_LAST`) + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return column of `n` rows with the location of the `search_key` */ @@ -132,6 +139,7 @@ std::unique_ptr<column> index_of( cudf::lists_column_view const& lists, cudf::scalar const& search_key, duplicate_find_option find_option = duplicate_find_option::FIND_FIRST, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -160,6 +168,7 @@ std::unique_ptr<column> index_of( * `lists` * @param find_option Whether to return the position of the first match (`FIND_FIRST`) or * last (`FIND_LAST`) + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return column of `n` rows with the location of the `search_key` */ @@ -167,6 +176,7 @@ std::unique_ptr<column> index_of( cudf::lists_column_view const& lists, cudf::column_view const& search_keys, duplicate_find_option find_option = duplicate_find_option::FIND_FIRST, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/lists/count_elements.hpp b/cpp/include/cudf/lists/count_elements.hpp index 552ba058b93..e4bd0dca9ae 100644 --- a/cpp/include/cudf/lists/count_elements.hpp +++ b/cpp/include/cudf/lists/count_elements.hpp @@ -45,11 +45,13 @@ namespace lists { * in the output column.
* * @param input Input lists column + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New column with the number of elements for each row */ std::unique_ptr<column> count_elements( lists_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of lists_elements group diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp index c0932b81dc3..e94dfea9dcf 100644 --- a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -1281,6 +1282,11 @@ class dictionary_column_wrapper : public detail::column_wrapper { template class lists_column_wrapper : public detail::column_wrapper { public: + /** + * @brief Cast to lists_column_view + */ + operator lists_column_view() const { return cudf::lists_column_view{wrapped->view()}; } + /** * @brief Construct a lists column containing a single list of fixed-width * type from an initializer list of values. @@ -1542,8 +1548,12 @@ class lists_column_wrapper : public detail::column_wrapper { rmm::device_buffer&& null_mask) { // construct the list column - wrapped = make_lists_column( - num_rows, std::move(offsets), std::move(values), null_count, std::move(null_mask)); + wrapped = make_lists_column(num_rows, + std::move(offsets), + std::move(values), + null_count, + std::move(null_mask), + cudf::test::get_default_stream()); } /** @@ -1618,8 +1628,12 @@ class lists_column_wrapper : public detail::column_wrapper { }(); // construct the list column - wrapped = make_lists_column( - cols.size(), std::move(offsets), std::move(data), null_count, std::move(null_mask)); + wrapped = make_lists_column(cols.size(), + std::move(offsets), + std::move(data), + null_count, + std::move(null_mask), + cudf::test::get_default_stream()); } /** @@ -1647,8 +1661,12 @@ class lists_column_wrapper : public detail::column_wrapper { depth = 0; size_type num_elements = offsets->size() == 0 ?
0 : offsets->size() - 1; - wrapped = - make_lists_column(num_elements, std::move(offsets), std::move(c), 0, rmm::device_buffer{}); + wrapped = make_lists_column(num_elements, + std::move(offsets), + std::move(c), + 0, + rmm::device_buffer{}, + cudf::test::get_default_stream()); } /** @@ -1697,12 +1715,15 @@ class lists_column_wrapper : public detail::column_wrapper { } lists_column_view lcv(col); - return make_lists_column(col.size(), - std::make_unique<column>(lcv.offsets()), - normalize_column(lists_column_view(col).child(), - lists_column_view(expected_hierarchy).child()), - col.null_count(), - copy_bitmask(col)); + return make_lists_column( + col.size(), + std::make_unique<column>(lcv.offsets()), + normalize_column(lists_column_view(col).child(), + lists_column_view(expected_hierarchy).child()), + col.null_count(), + cudf::detail::copy_bitmask( + col, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()), + cudf::test::get_default_stream()); } std::pair<std::vector<column_view>, std::vector<std::unique_ptr<column>>> preprocess_columns( diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp index ef07de8c461..6b413ab2be4 100644 --- a/cpp/src/binaryop/binaryop.cpp +++ b/cpp/src/binaryop/binaryop.cpp @@ -405,38 +405,42 @@ std::unique_ptr<column> binary_operation(scalar const& lhs, column_view const& rhs, binary_operator op, data_type output_type, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::binary_operation(lhs, rhs, op, output_type, cudf::get_default_stream(), mr); + return detail::binary_operation(lhs, rhs, op, output_type, stream, mr); } std::unique_ptr<column> binary_operation(column_view const& lhs, scalar const& rhs, binary_operator op, data_type output_type, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::binary_operation(lhs, rhs, op, output_type, cudf::get_default_stream(), mr); + return detail::binary_operation(lhs, rhs, op, output_type, stream, mr); } std::unique_ptr<column> binary_operation(column_view const& lhs, column_view const& rhs, binary_operator op, data_type output_type, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::binary_operation(lhs, rhs, op, output_type, cudf::get_default_stream(), mr); + return detail::binary_operation(lhs, rhs, op, output_type, stream, mr); } std::unique_ptr<column> binary_operation(column_view const& lhs, column_view const& rhs, std::string const& ptx, data_type output_type, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::binary_operation(lhs, rhs, ptx, output_type, cudf::get_default_stream(), mr); + return detail::binary_operation(lhs, rhs, ptx, output_type, stream, mr); } } // namespace cudf diff --git a/cpp/src/binaryop/compiled/binary_ops.cu b/cpp/src/binaryop/compiled/binary_ops.cu index 1f7f342632a..85ab5c6d6cb 100644 --- a/cpp/src/binaryop/compiled/binary_ops.cu +++ b/cpp/src/binaryop/compiled/binary_ops.cu @@ -47,14 +47,16 @@ namespace { struct scalar_as_column_view { using return_type = typename std::pair<column_view, std::unique_ptr<column>>; template <typename T, CUDF_ENABLE_IF(is_fixed_width<T>())> - return_type operator()(scalar const& s, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) + return_type operator()(scalar const& s, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource*) { auto& h_scalar_type_view = static_cast<cudf::scalar_type_t<T>&>(const_cast<scalar&>(s)); auto col_v = column_view(s.type(), 1, h_scalar_type_view.data(), reinterpret_cast<bitmask_type const*>(s.validity_data()), - !s.is_valid()); + !s.is_valid(stream)); return std::pair{col_v,
std::unique_ptr(nullptr)}; } template ())> diff --git a/cpp/src/hash/unordered_multiset.cuh b/cpp/src/hash/unordered_multiset.cuh deleted file mode 100644 index 183042fc0f4..00000000000 --- a/cpp/src/hash/unordered_multiset.cuh +++ /dev/null @@ -1,159 +0,0 @@ -/* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include - -#include - -namespace cudf { -namespace detail { -/* - * Device view of the unordered multiset - */ -template , - typename Equality = equal_to> -class unordered_multiset_device_view { - public: - unordered_multiset_device_view(size_type hash_size, - size_type const* hash_begin, - Element const* hash_data) - : hash_size{hash_size}, hash_begin{hash_begin}, hash_data{hash_data}, hasher(), equals() - { - } - - bool __device__ contains(Element e) const - { - size_type loc = hasher(e) % (2 * hash_size); - - for (size_type i = hash_begin[loc]; i < hash_begin[loc + 1]; ++i) { - if (equals(hash_data[i], e)) return true; - } - - return false; - } - - private: - Hasher hasher; - Equality equals; - size_type hash_size; - size_type const* hash_begin; - Element const* hash_data; -}; - -/* - * Fixed size set on a device. 
- */ -template , - typename Equality = equal_to> -class unordered_multiset { - public: - /** - * @brief Factory to construct a new unordered_multiset - */ - static unordered_multiset create(column_view const& col, rmm::cuda_stream_view stream) - { - auto d_column = column_device_view::create(col, stream); - auto d_col = *d_column; - - auto hash_bins_start = cudf::detail::make_zeroed_device_uvector_async( - 2 * d_col.size() + 1, stream, rmm::mr::get_current_device_resource()); - auto hash_bins_end = cudf::detail::make_zeroed_device_uvector_async( - 2 * d_col.size() + 1, stream, rmm::mr::get_current_device_resource()); - auto hash_data = rmm::device_uvector(d_col.size(), stream); - - Hasher hasher; - size_type* d_hash_bins_start = hash_bins_start.data(); - size_type* d_hash_bins_end = hash_bins_end.data(); - Element* d_hash_data = hash_data.data(); - - thrust::for_each( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(col.size()), - [d_hash_bins_start, d_col, hasher] __device__(size_t idx) { - if (!d_col.is_null(idx)) { - Element e = d_col.element(idx); - size_type tmp = hasher(e) % (2 * d_col.size()); - cuda::atomic_ref ref{*(d_hash_bins_start + tmp)}; - ref.fetch_add(1, cuda::std::memory_order_relaxed); - } - }); - - thrust::exclusive_scan(rmm::exec_policy(stream), - hash_bins_start.begin(), - hash_bins_start.end(), - hash_bins_end.begin()); - - thrust::copy(rmm::exec_policy(stream), - hash_bins_end.begin(), - hash_bins_end.end(), - hash_bins_start.begin()); - - thrust::for_each( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(col.size()), - [d_hash_bins_end, d_hash_data, d_col, hasher] __device__(size_t idx) { - if (!d_col.is_null(idx)) { - Element e = d_col.element(idx); - size_type tmp = hasher(e) % (2 * d_col.size()); - cuda::atomic_ref ref{*(d_hash_bins_end + tmp)}; - size_type offset = ref.fetch_add(1, cuda::std::memory_order_relaxed); - d_hash_data[offset] = e; - } - }); - - return unordered_multiset(d_col.size(), std::move(hash_bins_start), std::move(hash_data)); - } - - unordered_multiset_device_view to_device() const - { - return unordered_multiset_device_view( - size, hash_bins.data(), hash_data.data()); - } - - private: - unordered_multiset(size_type size, - rmm::device_uvector&& hash_bins, - rmm::device_uvector&& hash_data) - : size{size}, hash_bins{std::move(hash_bins)}, hash_data{std::move(hash_data)} - { - } - - size_type size; - rmm::device_uvector hash_bins; - rmm::device_uvector hash_data; -}; - -} // namespace detail -} // namespace cudf diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 392a7850886..726442d752e 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -470,8 +470,8 @@ void orc_chunked_writer::close() writer->close(); } -using namespace cudf::io::detail::parquet; -namespace detail_parquet = cudf::io::detail::parquet; +using namespace cudf::io::parquet::detail; +namespace detail_parquet = cudf::io::parquet::detail; table_with_metadata read_parquet(parquet_reader_options const& options, rmm::mr::device_memory_resource* mr) diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu index 95f1db5bfd1..479a2dfada3 100644 --- a/cpp/src/io/orc/stats_enc.cu +++ b/cpp/src/io/orc/stats_enc.cu @@ -76,8 +76,8 @@ __global__ void __launch_bounds__(block_size, 1) { using block_scan = cub::BlockScan; __shared__ typename block_scan::TempStorage temp_storage; - volatile uint32_t stats_size = 0; - auto t = threadIdx.x; + uint32_t 
stats_size = 0; + auto t = threadIdx.x; __syncthreads(); for (thread_index_type start = 0; start < statistics_count; start += block_size) { uint32_t stats_len = 0, stats_pos; diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index 3edcd3d83b2..0b249bbdafe 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -142,9 +142,7 @@ struct orcdec_state_s { * @param[in] base Pointer to raw byte stream data * @param[in] len Stream length in bytes */ -static __device__ void bytestream_init(volatile orc_bytestream_s* bs, - uint8_t const* base, - uint32_t len) +static __device__ void bytestream_init(orc_bytestream_s* bs, uint8_t const* base, uint32_t len) { uint32_t pos = (len > 0) ? static_cast(7 & reinterpret_cast(base)) : 0; bs->base = base - pos; @@ -160,8 +158,7 @@ static __device__ void bytestream_init(volatile orc_bytestream_s* bs, * @param[in] bs Byte stream input * @param[in] bytes_consumed Number of bytes that were consumed */ -static __device__ void bytestream_flush_bytes(volatile orc_bytestream_s* bs, - uint32_t bytes_consumed) +static __device__ void bytestream_flush_bytes(orc_bytestream_s* bs, uint32_t bytes_consumed) { uint32_t pos = bs->pos; uint32_t len = bs->len; @@ -197,7 +194,7 @@ static __device__ void bytestream_fill(orc_bytestream_s* bs, int t) * @param[in] pos Position in byte stream * @return byte */ -inline __device__ uint8_t bytestream_readbyte(volatile orc_bytestream_s* bs, int pos) +inline __device__ uint8_t bytestream_readbyte(orc_bytestream_s* bs, int pos) { return bs->buf.u8[pos & (bytestream_buffer_size - 1)]; } @@ -209,7 +206,7 @@ inline __device__ uint8_t bytestream_readbyte(volatile orc_bytestream_s* bs, int * @param[in] pos Position in byte stream * @result bits */ -inline __device__ uint32_t bytestream_readu32(volatile orc_bytestream_s* bs, int pos) +inline __device__ uint32_t bytestream_readu32(orc_bytestream_s* bs, int pos) { uint32_t a = bs->buf.u32[(pos & (bytestream_buffer_size - 1)) >> 2]; uint32_t b = bs->buf.u32[((pos + 4) & (bytestream_buffer_size - 1)) >> 2]; @@ -224,7 +221,7 @@ inline __device__ uint32_t bytestream_readu32(volatile orc_bytestream_s* bs, int * @param[in] numbits number of bits * @return bits */ -inline __device__ uint64_t bytestream_readu64(volatile orc_bytestream_s* bs, int pos) +inline __device__ uint64_t bytestream_readu64(orc_bytestream_s* bs, int pos) { uint32_t a = bs->buf.u32[(pos & (bytestream_buffer_size - 1)) >> 2]; uint32_t b = bs->buf.u32[((pos + 4) & (bytestream_buffer_size - 1)) >> 2]; @@ -245,9 +242,7 @@ inline __device__ uint64_t bytestream_readu64(volatile orc_bytestream_s* bs, int * @param[in] numbits number of bits * @return decoded value */ -inline __device__ uint32_t bytestream_readbits(volatile orc_bytestream_s* bs, - int bitpos, - uint32_t numbits) +inline __device__ uint32_t bytestream_readbits(orc_bytestream_s* bs, int bitpos, uint32_t numbits) { int idx = bitpos >> 5; uint32_t a = __byte_perm(bs->buf.u32[(idx + 0) & bytestream_buffer_mask], 0, 0x0123); @@ -263,9 +258,7 @@ inline __device__ uint32_t bytestream_readbits(volatile orc_bytestream_s* bs, * @param[in] numbits number of bits * @return decoded value */ -inline __device__ uint64_t bytestream_readbits64(volatile orc_bytestream_s* bs, - int bitpos, - uint32_t numbits) +inline __device__ uint64_t bytestream_readbits64(orc_bytestream_s* bs, int bitpos, uint32_t numbits) { int idx = bitpos >> 5; uint32_t a = __byte_perm(bs->buf.u32[(idx + 0) & bytestream_buffer_mask], 0, 0x0123); @@ -288,7 +281,7 @@ inline 
__device__ uint64_t bytestream_readbits64(volatile orc_bytestream_s* bs, * @param[in] numbits number of bits * @param[out] result decoded value */ -inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs, +inline __device__ void bytestream_readbe(orc_bytestream_s* bs, int bitpos, uint32_t numbits, uint32_t& result) @@ -304,7 +297,7 @@ inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs, * @param[in] numbits number of bits * @param[out] result decoded value */ -inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs, +inline __device__ void bytestream_readbe(orc_bytestream_s* bs, int bitpos, uint32_t numbits, int32_t& result) @@ -321,7 +314,7 @@ inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs, * @param[in] numbits number of bits * @param[out] result decoded value */ -inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs, +inline __device__ void bytestream_readbe(orc_bytestream_s* bs, int bitpos, uint32_t numbits, uint64_t& result) @@ -337,7 +330,7 @@ inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs, * @param[in] numbits number of bits * @param[out] result decoded value */ -inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs, +inline __device__ void bytestream_readbe(orc_bytestream_s* bs, int bitpos, uint32_t numbits, int64_t& result) @@ -354,7 +347,7 @@ inline __device__ void bytestream_readbe(volatile orc_bytestream_s* bs, * @return length of varint in bytes */ template -inline __device__ uint32_t varint_length(volatile orc_bytestream_s* bs, int pos) +inline __device__ uint32_t varint_length(orc_bytestream_s* bs, int pos) { if (bytestream_readbyte(bs, pos) > 0x7f) { uint32_t next32 = bytestream_readu32(bs, pos + 1); @@ -392,7 +385,7 @@ inline __device__ uint32_t varint_length(volatile orc_bytestream_s* bs, int pos) * @return new position in byte stream buffer */ template -inline __device__ int decode_base128_varint(volatile orc_bytestream_s* bs, int pos, T& result) +inline __device__ int decode_base128_varint(orc_bytestream_s* bs, int pos, T& result) { uint32_t v = bytestream_readbyte(bs, pos++); if (v > 0x7f) { @@ -446,7 +439,7 @@ inline __device__ int decode_base128_varint(volatile orc_bytestream_s* bs, int p /** * @brief Decodes a signed int128 encoded as base-128 varint (used for decimals) */ -inline __device__ __int128_t decode_varint128(volatile orc_bytestream_s* bs, int pos) +inline __device__ __int128_t decode_varint128(orc_bytestream_s* bs, int pos) { auto byte = bytestream_readbyte(bs, pos++); __int128_t const sign_mask = -(int32_t)(byte & 1); @@ -463,7 +456,7 @@ inline __device__ __int128_t decode_varint128(volatile orc_bytestream_s* bs, int /** * @brief Decodes an unsigned 32-bit varint */ -inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, uint32_t& result) +inline __device__ int decode_varint(orc_bytestream_s* bs, int pos, uint32_t& result) { uint32_t u; pos = decode_base128_varint(bs, pos, u); @@ -474,7 +467,7 @@ inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, uint /** * @brief Decodes an unsigned 64-bit varint */ -inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, uint64_t& result) +inline __device__ int decode_varint(orc_bytestream_s* bs, int pos, uint64_t& result) { uint64_t u; pos = decode_base128_varint(bs, pos, u); @@ -485,7 +478,7 @@ inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, uint /** * @brief Signed version of 32-bit decode_varint 
*/ -inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, int32_t& result) +inline __device__ int decode_varint(orc_bytestream_s* bs, int pos, int32_t& result) { uint32_t u; pos = decode_base128_varint(bs, pos, u); @@ -496,7 +489,7 @@ inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, int3 /** * @brief Signed version of 64-bit decode_varint */ -inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, int64_t& result) +inline __device__ int decode_varint(orc_bytestream_s* bs, int pos, int64_t& result) { uint64_t u; pos = decode_base128_varint(bs, pos, u); @@ -514,7 +507,7 @@ inline __device__ int decode_varint(volatile orc_bytestream_s* bs, int pos, int6 * @return number of values decoded */ template -inline __device__ void lengths_to_positions(volatile T* vals, uint32_t numvals, unsigned int t) +inline __device__ void lengths_to_positions(T* vals, uint32_t numvals, unsigned int t) { for (uint32_t n = 1; n < numvals; n <<= 1) { __syncthreads(); @@ -534,8 +527,8 @@ inline __device__ void lengths_to_positions(volatile T* vals, uint32_t numvals, * @return number of values decoded */ template -static __device__ uint32_t Integer_RLEv1( - orc_bytestream_s* bs, volatile orc_rlev1_state_s* rle, volatile T* vals, uint32_t maxvals, int t) +static __device__ uint32_t +Integer_RLEv1(orc_bytestream_s* bs, orc_rlev1_state_s* rle, T* vals, uint32_t maxvals, int t) { uint32_t numvals, numruns; if (t == 0) { @@ -642,8 +635,8 @@ static const __device__ __constant__ uint8_t ClosestFixedBitsMap[65] = { */ template static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs, - volatile orc_rlev2_state_s* rle, - volatile T* vals, + orc_rlev2_state_s* rle, + T* vals, uint32_t maxvals, int t, bool has_buffered_values = false) @@ -883,7 +876,7 @@ static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs, * * @return 32-bit value */ -inline __device__ uint32_t rle8_read_bool32(volatile uint32_t* vals, uint32_t bitpos) +inline __device__ uint32_t rle8_read_bool32(uint32_t* vals, uint32_t bitpos) { uint32_t a = vals[(bitpos >> 5) + 0]; uint32_t b = vals[(bitpos >> 5) + 1]; @@ -903,11 +896,8 @@ inline __device__ uint32_t rle8_read_bool32(volatile uint32_t* vals, uint32_t bi * * @return number of values decoded */ -static __device__ uint32_t Byte_RLE(orc_bytestream_s* bs, - volatile orc_byterle_state_s* rle, - volatile uint8_t* vals, - uint32_t maxvals, - int t) +static __device__ uint32_t +Byte_RLE(orc_bytestream_s* bs, orc_byterle_state_s* rle, uint8_t* vals, uint32_t maxvals, int t) { uint32_t numvals, numruns; int r, tr; @@ -1006,8 +996,8 @@ static const __device__ __constant__ int64_t kPow5i[28] = {1, * @return number of values decoded */ static __device__ int Decode_Decimals(orc_bytestream_s* bs, - volatile orc_byterle_state_s* scratch, - volatile orcdec_state_s::values& vals, + orc_byterle_state_s* scratch, + orcdec_state_s::values& vals, int val_scale, int numvals, type_id dtype_id, @@ -1241,8 +1231,8 @@ __global__ void __launch_bounds__(block_size) } __syncthreads(); while (s->top.dict.dict_len > 0) { - uint32_t numvals = min(s->top.dict.dict_len, blockDim.x), len; - volatile uint32_t* vals = s->vals.u32; + uint32_t numvals = min(s->top.dict.dict_len, blockDim.x), len; + uint32_t* vals = s->vals.u32; bytestream_fill(&s->bs, t); __syncthreads(); if (is_rlev1(s->chunk.encoding_kind)) { @@ -1310,12 +1300,12 @@ static __device__ void DecodeRowPositions(orcdec_state_s* s, min((row_decoder_buffer_size - s->u.rowdec.nz_count) * 2, blockDim.x)); if 
(s->chunk.valid_map_base != nullptr) { // We have a present stream - uint32_t rmax = s->top.data.end_row - min((uint32_t)first_row, s->top.data.end_row); - auto r = (uint32_t)(s->top.data.cur_row + s->top.data.nrows + t - first_row); - uint32_t valid = (t < nrows && r < rmax) - ? (((uint8_t const*)s->chunk.valid_map_base)[r >> 3] >> (r & 7)) & 1 - : 0; - volatile auto* row_ofs_plus1 = (volatile uint16_t*)&s->u.rowdec.row[s->u.rowdec.nz_count]; + uint32_t rmax = s->top.data.end_row - min((uint32_t)first_row, s->top.data.end_row); + auto r = (uint32_t)(s->top.data.cur_row + s->top.data.nrows + t - first_row); + uint32_t valid = (t < nrows && r < rmax) + ? (((uint8_t const*)s->chunk.valid_map_base)[r >> 3] >> (r & 7)) & 1 + : 0; + auto* row_ofs_plus1 = (uint16_t*)&s->u.rowdec.row[s->u.rowdec.nz_count]; uint32_t nz_pos, row_plus1, nz_count = s->u.rowdec.nz_count, last_row; if (t < nrows) { row_ofs_plus1[t] = valid; } lengths_to_positions(row_ofs_plus1, nrows, t); diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 73c41e2bbcd..4841fb1141a 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -53,7 +53,7 @@ constexpr bool zero_pll_war = true; struct byterle_enc_state_s { uint32_t literal_run; uint32_t repeat_run; - volatile uint32_t rpt_map[(512 / 32) + 1]; + uint32_t rpt_map[(512 / 32) + 1]; }; struct intrle_enc_state_s { @@ -63,7 +63,7 @@ struct intrle_enc_state_s { uint32_t literal_w; uint32_t hdr_bytes; uint32_t pl_bytes; - volatile uint32_t delta_map[(512 / 32) + 1]; + uint32_t delta_map[(512 / 32) + 1]; }; struct strdata_enc_state_s { @@ -366,7 +366,7 @@ static __device__ uint32_t IntegerRLE( using block_reduce = cub::BlockReduce; uint8_t* dst = s->stream.data_ptrs[cid] + s->strm_pos[cid]; uint32_t out_cnt = 0; - __shared__ volatile uint64_t block_vmin; + __shared__ uint64_t block_vmin; while (numvals > 0) { T v0 = (t < numvals) ? 
inbuf[(inpos + t) & inmask] : 0; @@ -615,7 +615,7 @@ static __device__ void StoreStringData(uint8_t* dst, * @param[in] t thread id */ template -inline __device__ void lengths_to_positions(volatile T* vals, uint32_t numvals, unsigned int t) +inline __device__ void lengths_to_positions(T* vals, uint32_t numvals, unsigned int t) { for (uint32_t n = 1; n < numvals; n <<= 1) { __syncthreads(); @@ -1143,7 +1143,7 @@ __global__ void __launch_bounds__(256) uint32_t comp_block_align) { __shared__ __align__(16) StripeStream ss; - __shared__ uint8_t* volatile uncomp_base_g; + __shared__ uint8_t* uncomp_base_g; auto const padded_block_header_size = util::round_up_unsafe(block_header_size, comp_block_align); auto const padded_comp_block_size = util::round_up_unsafe(max_comp_blk_size, comp_block_align); @@ -1196,8 +1196,8 @@ __global__ void __launch_bounds__(1024) uint32_t max_comp_blk_size) { __shared__ __align__(16) StripeStream ss; - __shared__ uint8_t const* volatile comp_src_g; - __shared__ uint32_t volatile comp_len_g; + __shared__ uint8_t const* comp_src_g; + __shared__ uint32_t comp_len_g; auto const stripe_id = blockIdx.x; auto const stream_id = blockIdx.y; diff --git a/cpp/src/io/orc/stripe_init.cu b/cpp/src/io/orc/stripe_init.cu index 8eeca504121..b31a4a081d1 100644 --- a/cpp/src/io/orc/stripe_init.cu +++ b/cpp/src/io/orc/stripe_init.cu @@ -499,7 +499,7 @@ __global__ void __launch_bounds__(128, 8) gpuParseRowGroupIndex(RowGroup* row_gr : row_groups[(s->rowgroup_start + i) * num_columns + blockIdx.x].start_row; for (int j = t4; j < rowgroup_size4; j += 4) { ((uint32_t*)&row_groups[(s->rowgroup_start + i) * num_columns + blockIdx.x])[j] = - ((volatile uint32_t*)&s->rowgroups[i])[j]; + ((uint32_t*)&s->rowgroups[i])[j]; } row_groups[(s->rowgroup_start + i) * num_columns + blockIdx.x].num_rows = num_rows; // Updating in case of struct diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index 9ff1869edde..53ff31ab0a7 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -24,10 +24,8 @@ #include -namespace cudf { -namespace io { -namespace parquet { -namespace gpu { +namespace cudf::io::parquet::detail { + namespace { constexpr int DEFAULT_BLOCK_SIZE = 256; } @@ -101,7 +99,7 @@ struct map_find_fn { template __global__ void __launch_bounds__(block_size) - populate_chunk_hash_maps_kernel(cudf::detail::device_2dspan frags) + populate_chunk_hash_maps_kernel(cudf::detail::device_2dspan frags) { auto col_idx = blockIdx.y; auto block_x = blockIdx.x; @@ -226,7 +224,7 @@ __global__ void __launch_bounds__(block_size) template __global__ void __launch_bounds__(block_size) - get_dictionary_indices_kernel(cudf::detail::device_2dspan frags) + get_dictionary_indices_kernel(cudf::detail::device_2dspan frags) { auto col_idx = blockIdx.y; auto block_x = blockIdx.x; @@ -276,7 +274,7 @@ void initialize_chunk_hash_maps(device_span chunks, rmm::cuda_st <<>>(chunks); } -void populate_chunk_hash_maps(cudf::detail::device_2dspan frags, +void populate_chunk_hash_maps(cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream) { dim3 const dim_grid(frags.size().second, frags.size().first); @@ -290,14 +288,11 @@ void collect_map_entries(device_span chunks, rmm::cuda_stream_vi collect_map_entries_kernel<<>>(chunks); } -void get_dictionary_indices(cudf::detail::device_2dspan frags, +void get_dictionary_indices(cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream) { dim3 const dim_grid(frags.size().second, frags.size().first); 
get_dictionary_indices_kernel <<>>(frags); } -} // namespace gpu -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index 5c7b8ca3f8c..81d1be64a45 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -21,9 +21,7 @@ #include #include -namespace cudf { -namespace io { -namespace parquet { +namespace cudf::io::parquet::detail { /** * @brief Base class for parquet field functors. @@ -870,6 +868,4 @@ int CompactProtocolReader::WalkSchema( } } -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/compact_protocol_reader.hpp b/cpp/src/io/parquet/compact_protocol_reader.hpp index 619815db503..cbb4161b138 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.hpp +++ b/cpp/src/io/parquet/compact_protocol_reader.hpp @@ -25,9 +25,8 @@ #include #include -namespace cudf { -namespace io { -namespace parquet { +namespace cudf::io::parquet::detail { + /** * @brief Class for parsing Parquet's Thrift Compact Protocol encoded metadata * @@ -147,6 +146,4 @@ class CompactProtocolReader { friend class parquet_field_struct_blob; }; -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/compact_protocol_writer.cpp b/cpp/src/io/parquet/compact_protocol_writer.cpp index 60bc8984d81..9adc8767880 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.cpp +++ b/cpp/src/io/parquet/compact_protocol_writer.cpp @@ -16,9 +16,7 @@ #include "compact_protocol_writer.hpp" -namespace cudf { -namespace io { -namespace parquet { +namespace cudf::io::parquet::detail { /** * @brief Parquet CompactProtocolWriter class @@ -391,6 +389,4 @@ inline void CompactProtocolFieldWriter::set_current_field(int const& field) current_field_value = field; } -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/compact_protocol_writer.hpp b/cpp/src/io/parquet/compact_protocol_writer.hpp index 26d66527aa5..4849a814b14 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.hpp +++ b/cpp/src/io/parquet/compact_protocol_writer.hpp @@ -25,9 +25,7 @@ #include #include -namespace cudf { -namespace io { -namespace parquet { +namespace cudf::io::parquet::detail { /** * @brief Class for parsing Parquet's Thrift Compact Protocol encoded metadata @@ -115,6 +113,4 @@ class CompactProtocolFieldWriter { inline void set_current_field(int const& field); }; -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/decode_preprocess.cu b/cpp/src/io/parquet/decode_preprocess.cu index 8de3702bc2e..544c93ee616 100644 --- a/cpp/src/io/parquet/decode_preprocess.cu +++ b/cpp/src/io/parquet/decode_preprocess.cu @@ -23,10 +23,7 @@ #include #include -namespace cudf { -namespace io { -namespace parquet { -namespace gpu { +namespace cudf::io::parquet::detail { namespace { @@ -411,7 +408,4 @@ void ComputePageSizes(cudf::detail::hostdevice_vector& pages, } } -} // namespace gpu -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/delta_binary.cuh b/cpp/src/io/parquet/delta_binary.cuh index 2382e4aafdf..a513e6674b4 100644 --- 
a/cpp/src/io/parquet/delta_binary.cuh +++ b/cpp/src/io/parquet/delta_binary.cuh @@ -18,7 +18,7 @@ #include "page_decode.cuh" -namespace cudf::io::parquet::gpu { +namespace cudf::io::parquet::detail { // DELTA_XXX encoding support // @@ -291,4 +291,4 @@ struct delta_binary_decoder { } }; -} // namespace cudf::io::parquet::gpu +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 230834632dd..cce3659b902 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -23,10 +23,7 @@ #include #include -namespace cudf { -namespace io { -namespace parquet { -namespace gpu { +namespace cudf::io::parquet::detail { namespace { @@ -624,7 +621,7 @@ uint32_t GetAggregatedDecodeKernelMask(cudf::detail::hostdevice_vector } /** - * @copydoc cudf::io::parquet::gpu::DecodePageData + * @copydoc cudf::io::parquet::detail::DecodePageData */ void __host__ DecodePageData(cudf::detail::hostdevice_vector& pages, cudf::detail::hostdevice_vector const& chunks, @@ -648,7 +645,4 @@ void __host__ DecodePageData(cudf::detail::hostdevice_vector& pages, } } -} // namespace gpu -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index cdc29197eb3..7c866fd8b9e 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -24,7 +24,7 @@ #include #include -namespace cudf::io::parquet::gpu { +namespace cudf::io::parquet::detail { struct page_state_s { constexpr page_state_s() noexcept {} @@ -753,7 +753,7 @@ __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_input_value // for nested schemas, it's more complicated. This warp will visit 32 incoming values, // however not all of them will necessarily represent a value at this nesting level. so // the validity bit for thread t might actually represent output value t-6. the correct - // position for thread t's bit is cur_value_count. for cuda 11 we could use + // position for thread t's bit is thread_value_count. for cuda 11 we could use // __reduce_or_sync(), but until then we have to do a warp reduce. 
WarpReduceOr32(is_valid << thread_value_count); @@ -1384,4 +1384,4 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, return true; } -} // namespace cudf::io::parquet::gpu +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_delta_decode.cu b/cpp/src/io/parquet/page_delta_decode.cu index 2b78dead205..d25684a59f3 100644 --- a/cpp/src/io/parquet/page_delta_decode.cu +++ b/cpp/src/io/parquet/page_delta_decode.cu @@ -23,7 +23,7 @@ #include #include -namespace cudf::io::parquet::gpu { +namespace cudf::io::parquet::detail { namespace { @@ -160,7 +160,7 @@ __global__ void __launch_bounds__(96) } // anonymous namespace /** - * @copydoc cudf::io::parquet::gpu::DecodeDeltaBinary + * @copydoc cudf::io::parquet::detail::DecodeDeltaBinary */ void __host__ DecodeDeltaBinary(cudf::detail::hostdevice_vector& pages, cudf::detail::hostdevice_vector const& chunks, @@ -184,4 +184,4 @@ void __host__ DecodeDeltaBinary(cudf::detail::hostdevice_vector& pages } } -} // namespace cudf::io::parquet::gpu +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index fe0dbb85124..78873d5e8ca 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -41,10 +41,7 @@ #include #include -namespace cudf { -namespace io { -namespace parquet { -namespace gpu { +namespace cudf::io::parquet::detail { namespace { @@ -329,7 +326,7 @@ __global__ void __launch_bounds__(128) // blockDim {128,1,1} __global__ void __launch_bounds__(128) gpuInitPages(device_2dspan chunks, - device_span pages, + device_span pages, device_span page_sizes, device_span comp_page_sizes, device_span col_desc, @@ -998,7 +995,7 @@ __device__ auto julian_days_with_time(int64_t v) // blockDim(128, 1, 1) template __global__ void __launch_bounds__(128, 8) - gpuEncodePages(device_span pages, + gpuEncodePages(device_span pages, device_span> comp_in, device_span> comp_out, device_span comp_results, @@ -1988,7 +1985,7 @@ __global__ void __launch_bounds__(128) // blockDim(1024, 1, 1) __global__ void __launch_bounds__(1024) - gpuGatherPages(device_span chunks, device_span pages) + gpuGatherPages(device_span chunks, device_span pages) { __shared__ __align__(8) EncColumnChunk ck_g; __shared__ __align__(8) EncPage page_g; @@ -2265,7 +2262,7 @@ void InitFragmentStatistics(device_span groups, } void InitEncoderPages(device_2dspan chunks, - device_span pages, + device_span pages, device_span page_sizes, device_span comp_page_sizes, device_span col_desc, @@ -2294,7 +2291,7 @@ void InitEncoderPages(device_2dspan chunks, write_v2_headers); } -void EncodePages(device_span pages, +void EncodePages(device_span pages, bool write_v2_headers, device_span> comp_in, device_span> comp_out, @@ -2328,7 +2325,7 @@ void EncodePageHeaders(device_span pages, } void GatherPages(device_span chunks, - device_span pages, + device_span pages, rmm::cuda_stream_view stream) { gpuGatherPages<<>>(chunks, pages); @@ -2343,7 +2340,4 @@ void EncodeColumnIndexes(device_span chunks, chunks, column_stats, column_index_truncate_length); } -} // namespace gpu -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index 6f8b2f50443..eae8e05e61e 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -20,10 +20,8 @@ #include -namespace cudf { -namespace io { -namespace parquet { -namespace gpu { +namespace 
cudf::io::parquet::detail { + // Minimal thrift implementation for parsing page headers // https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md @@ -161,8 +159,7 @@ __device__ void skip_struct_field(byte_stream_s* bs, int field_type) * @param chunk Column chunk the page belongs to * @return `kernel_mask_bits` value for the given page */ -__device__ uint32_t kernel_mask_for_page(gpu::PageInfo const& page, - gpu::ColumnChunkDesc const& chunk) +__device__ uint32_t kernel_mask_for_page(PageInfo const& page, ColumnChunkDesc const& chunk) { if (page.flags & PAGEINFO_FLAGS_DICTIONARY) { return 0; } @@ -528,7 +525,4 @@ void __host__ BuildStringDictionaryIndex(ColumnChunkDesc* chunks, gpuBuildStringDictionaryIndex<<>>(chunks, num_chunks); } -} // namespace gpu -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index d79abe4a6d2..4d79770ec34 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -20,10 +20,7 @@ #include #include -namespace cudf { -namespace io { -namespace parquet { -namespace gpu { +namespace cudf::io::parquet::detail { namespace { @@ -757,7 +754,7 @@ __global__ void __launch_bounds__(decode_block_size) } // anonymous namespace /** - * @copydoc cudf::io::parquet::gpu::ComputePageStringSizes + * @copydoc cudf::io::parquet::detail::ComputePageStringSizes */ void ComputePageStringSizes(cudf::detail::hostdevice_vector& pages, cudf::detail::hostdevice_vector const& chunks, @@ -778,7 +775,7 @@ void ComputePageStringSizes(cudf::detail::hostdevice_vector& pages, } /** - * @copydoc cudf::io::parquet::gpu::DecodeStringPageData + * @copydoc cudf::io::parquet::detail::DecodeStringPageData */ void __host__ DecodeStringPageData(cudf::detail::hostdevice_vector& pages, cudf::detail::hostdevice_vector const& chunks, @@ -802,7 +799,4 @@ void __host__ DecodeStringPageData(cudf::detail::hostdevice_vector& pa } } -} // namespace gpu -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_string_utils.cuh b/cpp/src/io/parquet/page_string_utils.cuh index 9395599b3ff..a81d0a64466 100644 --- a/cpp/src/io/parquet/page_string_utils.cuh +++ b/cpp/src/io/parquet/page_string_utils.cuh @@ -18,7 +18,7 @@ #include -namespace cudf::io::parquet::gpu { +namespace cudf::io::parquet::detail { // stole this from cudf/strings/detail/gather.cuh. modified to run on a single string on one warp. // copies from src to dst in 16B chunks per thread. @@ -107,4 +107,4 @@ __device__ void block_excl_sum(size_type* arr, size_type length, size_type initi } } -} // namespace cudf::io::parquet::gpu +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index c2affc774c2..c5993d73dec 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -25,9 +25,8 @@ #include #include -namespace cudf { -namespace io { -namespace parquet { +namespace cudf::io::parquet::detail { + constexpr uint32_t parquet_magic = (('P' << 0) | ('A' << 8) | ('R' << 16) | ('1' << 24)); /** @@ -206,7 +205,7 @@ struct SchemaElement { { return type == UNDEFINED_TYPE && // this assumption might be a little weak. 
- ((repetition_type != REPEATED) || (repetition_type == REPEATED && num_children == 2)); + ((repetition_type != REPEATED) || (repetition_type == REPEATED && num_children > 1)); } }; @@ -405,6 +404,4 @@ static inline int CountLeadingZeros32(uint32_t value) #endif } -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/parquet_common.hpp b/cpp/src/io/parquet/parquet_common.hpp index 5a1716bb547..50736197eb9 100644 --- a/cpp/src/io/parquet/parquet_common.hpp +++ b/cpp/src/io/parquet/parquet_common.hpp @@ -18,9 +18,8 @@ #include -namespace cudf { -namespace io { -namespace parquet { +namespace cudf::io::parquet::detail { + // Max decimal precisions according to the parquet spec: // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#decimal auto constexpr MAX_DECIMAL32_PRECISION = 9; @@ -156,6 +155,4 @@ enum FieldType { ST_FLD_STRUCT = 12, }; -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/parquet_gpu.cuh b/cpp/src/io/parquet/parquet_gpu.cuh index dc74bee1536..10e12ebb782 100644 --- a/cpp/src/io/parquet/parquet_gpu.cuh +++ b/cpp/src/io/parquet/parquet_gpu.cuh @@ -23,7 +23,7 @@ #include -namespace cudf::io::parquet::gpu { +namespace cudf::io::parquet::detail { auto constexpr KEY_SENTINEL = size_type{-1}; auto constexpr VALUE_SENTINEL = size_type{-1}; @@ -81,4 +81,4 @@ inline size_type __device__ row_to_value_idx(size_type idx, return idx; } -} // namespace cudf::io::parquet::gpu +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 51c862b376b..767668cc65e 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -35,7 +35,7 @@ #include -namespace cudf::io::parquet { +namespace cudf::io::parquet::detail { using cudf::io::detail::string_index_pair; @@ -88,8 +88,6 @@ struct input_column_info { auto nesting_depth() const { return nesting.size(); } }; -namespace gpu { - /** * @brief Enums for the flags in the page header */ @@ -347,7 +345,7 @@ struct file_intermediate_data { // all chunks from the selected row groups. We may end up reading these chunks progressively // instead of all at once - std::vector chunks{}; + std::vector chunks{}; // skip_rows/num_rows values for the entire file. these need to be adjusted per-pass because we // may not be visiting every row group that contains these bounds @@ -372,16 +370,16 @@ struct pass_intermediate_data { // rowgroup, chunk and page information for the current pass. 
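The containers listed just below are host/device mirrored vectors; the round-trip they support looks like the following sketch (method names taken from count_page_headers later in this diff; illustrative only):

cudf::detail::hostdevice_vector<ColumnChunkDesc> chunks{num_chunks, stream};
// ... populate chunk descriptors on the host ...
chunks.host_to_device_async(stream);                            // mirror to GPU
DecodePageHeaders(chunks.device_ptr(), chunks.size(), stream);  // kernel fills page counts
chunks.device_to_host_sync(stream);                             // copy back and wait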
std::vector row_groups{}; - cudf::detail::hostdevice_vector chunks{}; - cudf::detail::hostdevice_vector pages_info{}; - cudf::detail::hostdevice_vector page_nesting_info{}; - cudf::detail::hostdevice_vector page_nesting_decode_info{}; + cudf::detail::hostdevice_vector chunks{}; + cudf::detail::hostdevice_vector pages_info{}; + cudf::detail::hostdevice_vector page_nesting_info{}; + cudf::detail::hostdevice_vector page_nesting_decode_info{}; rmm::device_uvector page_keys{0, rmm::cuda_stream_default}; rmm::device_uvector page_index{0, rmm::cuda_stream_default}; rmm::device_uvector str_dict_index{0, rmm::cuda_stream_default}; - std::vector output_chunk_read_info; + std::vector output_chunk_read_info; std::size_t current_output_chunk{0}; rmm::device_buffer level_decode_data{}; @@ -739,7 +737,7 @@ void initialize_chunk_hash_maps(device_span chunks, rmm::cuda_st * @param frags Column fragments * @param stream CUDA stream to use */ -void populate_chunk_hash_maps(cudf::detail::device_2dspan frags, +void populate_chunk_hash_maps(cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream); /** @@ -762,7 +760,7 @@ void collect_map_entries(device_span chunks, rmm::cuda_stream_vi * @param frags Column fragments * @param stream CUDA stream to use */ -void get_dictionary_indices(cudf::detail::device_2dspan frags, +void get_dictionary_indices(cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream); /** @@ -781,7 +779,7 @@ void get_dictionary_indices(cudf::detail::device_2dspan * @param[in] stream CUDA stream to use */ void InitEncoderPages(cudf::detail::device_2dspan chunks, - device_span pages, + device_span pages, device_span page_sizes, device_span comp_page_sizes, device_span col_desc, @@ -847,7 +845,7 @@ void EncodePageHeaders(device_span pages, * @param[in] stream CUDA stream to use */ void GatherPages(device_span chunks, - device_span pages, + device_span pages, rmm::cuda_stream_view stream); /** @@ -863,5 +861,4 @@ void EncodeColumnIndexes(device_span chunks, int32_t column_index_truncate_length, rmm::cuda_stream_view stream); -} // namespace gpu -} // namespace cudf::io::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp index 805d082c71e..9083be1c2dd 100644 --- a/cpp/src/io/parquet/predicate_pushdown.cpp +++ b/cpp/src/io/parquet/predicate_pushdown.cpp @@ -35,7 +35,7 @@ #include #include -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { namespace { /** @@ -62,13 +62,13 @@ struct stats_caster { // uses storage type as T template () or cudf::is_nested())> - static T convert(uint8_t const* stats_val, size_t stats_size, cudf::io::parquet::Type const type) + static T convert(uint8_t const* stats_val, size_t stats_size, Type const type) { CUDF_FAIL("unsupported type for stats casting"); } template ())> - static T convert(uint8_t const* stats_val, size_t stats_size, cudf::io::parquet::Type const type) + static T convert(uint8_t const* stats_val, size_t stats_size, Type const type) { CUDF_EXPECTS(type == BOOLEAN, "Invalid type and stats combination"); return targetType(*reinterpret_cast(stats_val)); @@ -78,7 +78,7 @@ struct stats_caster { template () and !cudf::is_boolean()) or cudf::is_fixed_point() or cudf::is_chrono())> - static T convert(uint8_t const* stats_val, size_t stats_size, cudf::io::parquet::Type const type) + static T convert(uint8_t const* stats_val, size_t stats_size, Type const type) { switch (type) { case INT32: return 
targetType(*reinterpret_cast(stats_val)); @@ -103,7 +103,7 @@ struct stats_caster { } template ())> - static T convert(uint8_t const* stats_val, size_t stats_size, cudf::io::parquet::Type const type) + static T convert(uint8_t const* stats_val, size_t stats_size, Type const type) { switch (type) { case FLOAT: return targetType(*reinterpret_cast(stats_val)); @@ -113,7 +113,7 @@ struct stats_caster { } template )> - static T convert(uint8_t const* stats_val, size_t stats_size, cudf::io::parquet::Type const type) + static T convert(uint8_t const* stats_val, size_t stats_size, Type const type) { switch (type) { case BYTE_ARRAY: [[fallthrough]]; @@ -527,4 +527,4 @@ named_to_reference_converter::visit_operands( return transformed_operands; } -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader.cpp b/cpp/src/io/parquet/reader.cpp index 1e87447006d..17d7c07bc91 100644 --- a/cpp/src/io/parquet/reader.cpp +++ b/cpp/src/io/parquet/reader.cpp @@ -16,7 +16,7 @@ #include "reader_impl.hpp" -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { reader::reader() = default; @@ -59,4 +59,4 @@ bool chunked_reader::has_next() const { return _impl->has_next(); } table_with_metadata chunked_reader::read_chunk() const { return _impl->read_chunk(); } -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index ea40f29a070..26ec83d5946 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -25,7 +25,7 @@ #include #include -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) { @@ -38,7 +38,7 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) CUDF_EXPECTS(pages.size() > 0, "There is no page to decode"); size_t const sum_max_depths = std::accumulate( - chunks.begin(), chunks.end(), 0, [&](size_t cursum, gpu::ColumnChunkDesc const& chunk) { + chunks.begin(), chunks.end(), 0, [&](size_t cursum, ColumnChunkDesc const& chunk) { return cursum + _metadata->get_output_nesting_depth(chunk.src_col_schema); }); @@ -51,10 +51,10 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) // doing a gather operation later on. // TODO: This step is somewhat redundant if size info has already been calculated (nested schema, // chunked reader). 
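The stats_caster overloads in predicate_pushdown.cpp above all share one shape: reinterpret the raw statistics bytes according to the column's physical Type, then widen to the requested cudf type. A condensed sketch (static_cast stands in for the struct's targetType helper):

template <typename T>
T read_numeric_stat(uint8_t const* stats_val, Type const type)
{
  switch (type) {
    case INT32: return static_cast<T>(*reinterpret_cast<int32_t const*>(stats_val));
    case INT64: return static_cast<T>(*reinterpret_cast<int64_t const*>(stats_val));
    default: CUDF_FAIL("Invalid type and stats combination");
  }
}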
- auto const has_strings = (kernel_mask & gpu::KERNEL_MASK_STRING) != 0; + auto const has_strings = (kernel_mask & KERNEL_MASK_STRING) != 0; std::vector col_sizes(_input_columns.size(), 0L); if (has_strings) { - gpu::ComputePageStringSizes( + ComputePageStringSizes( pages, chunks, skip_rows, num_rows, _pass_itm_data->level_type_size, _stream); col_sizes = calculate_page_string_offsets(); @@ -176,19 +176,19 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) if (has_strings) { auto& stream = streams[s_idx++]; chunk_nested_str_data.host_to_device_async(stream); - gpu::DecodeStringPageData( + DecodeStringPageData( pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), stream); } // launch delta binary decoder - if ((kernel_mask & gpu::KERNEL_MASK_DELTA_BINARY) != 0) { - gpu::DecodeDeltaBinary( + if ((kernel_mask & KERNEL_MASK_DELTA_BINARY) != 0) { + DecodeDeltaBinary( pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), streams[s_idx++]); } // launch the catch-all page decoder - if ((kernel_mask & gpu::KERNEL_MASK_GENERAL) != 0) { - gpu::DecodePageData( + if ((kernel_mask & KERNEL_MASK_GENERAL) != 0) { + DecodePageData( pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), streams[s_idx++]); } @@ -248,13 +248,13 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) // update null counts in the final column buffers for (size_t idx = 0; idx < pages.size(); idx++) { - gpu::PageInfo* pi = &pages[idx]; - if (pi->flags & gpu::PAGEINFO_FLAGS_DICTIONARY) { continue; } - gpu::ColumnChunkDesc* col = &chunks[pi->chunk_idx]; + PageInfo* pi = &pages[idx]; + if (pi->flags & PAGEINFO_FLAGS_DICTIONARY) { continue; } + ColumnChunkDesc* col = &chunks[pi->chunk_idx]; input_column_info const& input_col = _input_columns[col->src_col_index]; - int index = pi->nesting_decode - page_nesting_decode.device_ptr(); - gpu::PageNestingDecodeInfo* pndi = &page_nesting_decode[index]; + int index = pi->nesting_decode - page_nesting_decode.device_ptr(); + PageNestingDecodeInfo* pndi = &page_nesting_decode[index]; auto* cols = &_output_buffers; for (size_t l_idx = 0; l_idx < input_col.nesting_depth(); l_idx++) { @@ -320,7 +320,7 @@ reader::impl::impl(std::size_t chunk_read_limit, // Save the states of the output buffers for reuse in `chunk_read()`. for (auto const& buff : _output_buffers) { - _output_buffers_template.emplace_back(inline_column_buffer::empty_like(buff)); + _output_buffers_template.emplace_back(cudf::io::detail::inline_column_buffer::empty_like(buff)); } } @@ -368,7 +368,7 @@ void reader::impl::prepare_data(int64_t skip_rows, // always create the pass struct, even if we end up with no passes. 
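decode_page_data above fans the work out as one decode kernel per bit set in the aggregated kernel mask, each on its own stream, so string, delta-binary, and general pages decode concurrently. A condensed view of that dispatch (stream and error plumbing trimmed):

int s_idx = 0;
if (kernel_mask & KERNEL_MASK_STRING) {        // string-typed pages
  DecodeStringPageData(pages, chunks, num_rows, skip_rows, level_type_size,
                       error_code.data(), streams[s_idx++]);
}
if (kernel_mask & KERNEL_MASK_DELTA_BINARY) {  // DELTA_BINARY_PACKED pages
  DecodeDeltaBinary(pages, chunks, num_rows, skip_rows, level_type_size,
                    error_code.data(), streams[s_idx++]);
}
if (kernel_mask & KERNEL_MASK_GENERAL) {       // catch-all decoder
  DecodePageData(pages, chunks, num_rows, skip_rows, level_type_size,
                 error_code.data(), streams[s_idx++]);
}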
// this will also cause the previous pass information to be deleted - _pass_itm_data = std::make_unique(); + _pass_itm_data = std::make_unique(); if (_file_itm_data.global_num_rows > 0 && not _file_itm_data.row_groups.empty() && not _input_columns.empty() && _current_input_pass < num_passes) { @@ -521,7 +521,7 @@ table_with_metadata reader::impl::read_chunk() if (_chunk_count > 0) { _output_buffers.resize(0); for (auto const& buff : _output_buffers_template) { - _output_buffers.emplace_back(inline_column_buffer::empty_like(buff)); + _output_buffers.emplace_back(cudf::io::detail::inline_column_buffer::empty_like(buff)); } } @@ -571,4 +571,4 @@ parquet_metadata read_parquet_metadata(host_span con metadata.get_key_value_metadata()[0]}; } -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index 9445e4d1648..6003b931b04 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -35,7 +35,7 @@ #include #include -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { /** * @brief Implementation for Parquet reader @@ -261,10 +261,10 @@ class reader::impl { std::vector _input_columns; // Buffers for generating output columns - std::vector _output_buffers; + std::vector _output_buffers; // Buffers copied from `_output_buffers` after construction for reuse - std::vector _output_buffers_template; + std::vector _output_buffers_template; // _output_buffers associated schema indices std::vector _output_column_schemas; @@ -285,8 +285,8 @@ class reader::impl { // Within a pass, we produce one or more chunks of output, whose maximum total // byte size is controlled by _output_chunk_read_limit. - cudf::io::parquet::gpu::file_intermediate_data _file_itm_data; - std::unique_ptr _pass_itm_data; + file_intermediate_data _file_itm_data; + std::unique_ptr _pass_itm_data; // an array of offsets into _file_itm_data::global_chunks. Each pair of offsets represents // the start/end of the chunks to be loaded for a given pass. 
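reader::impl keeps _output_buffers_template around precisely so repeated read_chunk() calls can stamp out fresh output buffers per chunk. A user-level sketch of the loop this enables (public chunked-reader API names as I understand them; treat as illustrative):

auto opts = cudf::io::parquet_reader_options::builder(
              cudf::io::source_info{"example.parquet"})  // hypothetical input
              .build();
cudf::io::chunked_parquet_reader reader(1 << 30 /* output byte limit */, opts);
while (reader.has_next()) {
  auto chunk = reader.read_chunk();  // one table_with_metadata per pass slice
  // ... consume chunk.tbl ...
}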
@@ -301,4 +301,4 @@ class reader::impl { bool _file_preprocessed{false}; }; -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index fcaa610fbb7..171cf07da3e 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -21,34 +21,34 @@ #include #include -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { namespace { ConvertedType logical_type_to_converted_type(LogicalType const& logical) { if (logical.isset.STRING) { - return parquet::UTF8; + return UTF8; } else if (logical.isset.MAP) { - return parquet::MAP; + return MAP; } else if (logical.isset.LIST) { - return parquet::LIST; + return LIST; } else if (logical.isset.ENUM) { - return parquet::ENUM; + return ENUM; } else if (logical.isset.DECIMAL) { - return parquet::DECIMAL; // TODO set decimal values + return DECIMAL; // TODO set decimal values } else if (logical.isset.DATE) { - return parquet::DATE; + return DATE; } else if (logical.isset.TIME) { if (logical.TIME.unit.isset.MILLIS) - return parquet::TIME_MILLIS; + return TIME_MILLIS; else if (logical.TIME.unit.isset.MICROS) - return parquet::TIME_MICROS; + return TIME_MICROS; } else if (logical.isset.TIMESTAMP) { if (logical.TIMESTAMP.unit.isset.MILLIS) - return parquet::TIMESTAMP_MILLIS; + return TIMESTAMP_MILLIS; else if (logical.TIMESTAMP.unit.isset.MICROS) - return parquet::TIMESTAMP_MICROS; + return TIMESTAMP_MICROS; } else if (logical.isset.INTEGER) { switch (logical.INTEGER.bitWidth) { case 8: return logical.INTEGER.isSigned ? INT_8 : UINT_8; @@ -58,13 +58,13 @@ ConvertedType logical_type_to_converted_type(LogicalType const& logical) default: break; } } else if (logical.isset.UNKNOWN) { - return parquet::NA; + return NA; } else if (logical.isset.JSON) { - return parquet::JSON; + return JSON; } else if (logical.isset.BSON) { - return parquet::BSON; + return BSON; } - return parquet::UNKNOWN; + return UNKNOWN; } } // namespace @@ -76,39 +76,39 @@ type_id to_type_id(SchemaElement const& schema, bool strings_to_categorical, type_id timestamp_type_id) { - parquet::Type const physical = schema.type; - parquet::LogicalType const logical_type = schema.logical_type; - parquet::ConvertedType converted_type = schema.converted_type; - int32_t decimal_precision = schema.decimal_precision; + Type const physical = schema.type; + LogicalType const logical_type = schema.logical_type; + ConvertedType converted_type = schema.converted_type; + int32_t decimal_precision = schema.decimal_precision; // Logical type used for actual data interpretation; the legacy converted type // is superseded by 'logical' type whenever available. 
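The precedence that comment states is: the legacy converted_type applies only until the newer LogicalType implies something more specific. Schematically (mirrors the lines that follow):

auto effective = schema.converted_type;
if (auto const inferred = logical_type_to_converted_type(schema.logical_type);
    inferred != UNKNOWN) {
  effective = inferred;  // e.g. TIME{MILLIS} -> TIME_MILLIS, STRING -> UTF8
}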
auto const inferred_converted_type = logical_type_to_converted_type(logical_type); - if (inferred_converted_type != parquet::UNKNOWN) { converted_type = inferred_converted_type; } - if (inferred_converted_type == parquet::DECIMAL) { + if (inferred_converted_type != UNKNOWN) { converted_type = inferred_converted_type; } + if (inferred_converted_type == DECIMAL) { decimal_precision = schema.logical_type.DECIMAL.precision; } switch (converted_type) { - case parquet::UINT_8: return type_id::UINT8; - case parquet::INT_8: return type_id::INT8; - case parquet::UINT_16: return type_id::UINT16; - case parquet::INT_16: return type_id::INT16; - case parquet::UINT_32: return type_id::UINT32; - case parquet::UINT_64: return type_id::UINT64; - case parquet::DATE: return type_id::TIMESTAMP_DAYS; - case parquet::TIME_MILLIS: return type_id::DURATION_MILLISECONDS; - case parquet::TIME_MICROS: return type_id::DURATION_MICROSECONDS; - case parquet::TIMESTAMP_MILLIS: + case UINT_8: return type_id::UINT8; + case INT_8: return type_id::INT8; + case UINT_16: return type_id::UINT16; + case INT_16: return type_id::INT16; + case UINT_32: return type_id::UINT32; + case UINT_64: return type_id::UINT64; + case DATE: return type_id::TIMESTAMP_DAYS; + case TIME_MILLIS: return type_id::DURATION_MILLISECONDS; + case TIME_MICROS: return type_id::DURATION_MICROSECONDS; + case TIMESTAMP_MILLIS: return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id : type_id::TIMESTAMP_MILLISECONDS; - case parquet::TIMESTAMP_MICROS: + case TIMESTAMP_MICROS: return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id : type_id::TIMESTAMP_MICROSECONDS; - case parquet::DECIMAL: - if (physical == parquet::INT32) { return type_id::DECIMAL32; } - if (physical == parquet::INT64) { return type_id::DECIMAL64; } - if (physical == parquet::FIXED_LEN_BYTE_ARRAY) { + case DECIMAL: + if (physical == INT32) { return type_id::DECIMAL32; } + if (physical == INT64) { return type_id::DECIMAL64; } + if (physical == FIXED_LEN_BYTE_ARRAY) { if (schema.type_length <= static_cast(sizeof(int32_t))) { return type_id::DECIMAL32; } @@ -119,7 +119,7 @@ type_id to_type_id(SchemaElement const& schema, return type_id::DECIMAL128; } } - if (physical == parquet::BYTE_ARRAY) { + if (physical == BYTE_ARRAY) { CUDF_EXPECTS(decimal_precision <= MAX_DECIMAL128_PRECISION, "Invalid decimal precision"); if (decimal_precision <= MAX_DECIMAL32_PRECISION) { return type_id::DECIMAL32; @@ -133,20 +133,20 @@ type_id to_type_id(SchemaElement const& schema, break; // maps are just List>. - case parquet::MAP: - case parquet::LIST: return type_id::LIST; - case parquet::NA: return type_id::STRING; + case MAP: + case LIST: return type_id::LIST; + case NA: return type_id::STRING; // return type_id::EMPTY; //TODO(kn): enable after Null/Empty column support default: break; } - if (inferred_converted_type == parquet::UNKNOWN and physical == parquet::INT64 and + if (inferred_converted_type == UNKNOWN and physical == INT64 and logical_type.TIMESTAMP.unit.isset.NANOS) { return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id : type_id::TIMESTAMP_NANOSECONDS; } - if (inferred_converted_type == parquet::UNKNOWN and physical == parquet::INT64 and + if (inferred_converted_type == UNKNOWN and physical == INT64 and logical_type.TIME.unit.isset.NANOS) { return type_id::DURATION_NANOSECONDS; } @@ -157,16 +157,16 @@ type_id to_type_id(SchemaElement const& schema, // Physical storage type supported by Parquet; controls the on-disk storage // format in combination with the encoding type. 
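One detail worth pulling out of the DECIMAL branch above: the narrowest cudf decimal type that can hold the declared precision wins. A sketch using the caps from parquet_common.hpp (9/18/38 digits, per the parquet spec):

type_id decimal_type_for(int32_t precision)
{
  CUDF_EXPECTS(precision <= MAX_DECIMAL128_PRECISION, "Invalid decimal precision");
  if (precision <= MAX_DECIMAL32_PRECISION) { return type_id::DECIMAL32; }  // <= 9
  if (precision <= MAX_DECIMAL64_PRECISION) { return type_id::DECIMAL64; }  // <= 18
  return type_id::DECIMAL128;                                               // <= 38
}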
switch (physical) { - case parquet::BOOLEAN: return type_id::BOOL8; - case parquet::INT32: return type_id::INT32; - case parquet::INT64: return type_id::INT64; - case parquet::FLOAT: return type_id::FLOAT32; - case parquet::DOUBLE: return type_id::FLOAT64; - case parquet::BYTE_ARRAY: - case parquet::FIXED_LEN_BYTE_ARRAY: + case BOOLEAN: return type_id::BOOL8; + case INT32: return type_id::INT32; + case INT64: return type_id::INT64; + case FLOAT: return type_id::FLOAT32; + case DOUBLE: return type_id::FLOAT64; + case BYTE_ARRAY: + case FIXED_LEN_BYTE_ARRAY: // Can be mapped to INT32 (32-bit hash) or STRING return strings_to_categorical ? type_id::INT32 : type_id::STRING; - case parquet::INT96: + case INT96: return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id : type_id::TIMESTAMP_NANOSECONDS; default: break; @@ -175,6 +175,81 @@ type_id to_type_id(SchemaElement const& schema, return type_id::EMPTY; } +void metadata::sanitize_schema() +{ + // Parquet isn't very strict about incoming metadata. Lots of things can and should be inferred. + // There are also a lot of rules that simply aren't followed and are expected to be worked around. + // This step sanitizes the metadata to something that isn't ambiguous. + // + // Take, for example, the following schema: + // + // required group field_id=-1 user { + // required int32 field_id=-1 id; + // optional group field_id=-1 phoneNumbers { + // repeated group field_id=-1 phone { + // required int64 field_id=-1 number; + // optional binary field_id=-1 kind (String); + // } + // } + // } + // + // This real-world example has no annotations telling us what is a list or a struct. On the + // surface this looks like a column of id's and a column of list>, but this + // actually should be interpreted as a struct>>. The phoneNumbers field + // has to be a struct because it is a group with no repeated tag and we have no annotation. The + // repeated group is actually BOTH a struct due to the multiple children and a list due to + // repeated. + // + // This code attempts to make this less messy for the code that follows. 
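Concretely, the rewrite that the process() lambda below performs on the example's repeated group looks like this (schematic):

// before:                          after:
//   repeated group phone {           optional group phone (LIST) {
//     required int64 number;           required group struct_node (STRUCT) {
//     optional binary kind;               required int64 number;
//   }                                     optional binary kind;
//                                       }
//                                     }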
+ + std::function process = [&](size_t schema_idx) -> void { + if (schema_idx < 0) { return; } + auto& schema_elem = schema[schema_idx]; + if (schema_idx != 0 && schema_elem.type == UNDEFINED_TYPE) { + auto const parent_type = schema[schema_elem.parent_idx].converted_type; + if (schema_elem.repetition_type == REPEATED && schema_elem.num_children > 1 && + parent_type != LIST && parent_type != MAP) { + // This is a list of structs, so we need to mark this as a list, but also + // add a struct child and move this element's children to the struct + schema_elem.converted_type = LIST; + schema_elem.repetition_type = OPTIONAL; + auto const struct_node_idx = schema.size(); + + SchemaElement struct_elem; + struct_elem.name = "struct_node"; + struct_elem.repetition_type = REQUIRED; + struct_elem.num_children = schema_elem.num_children; + struct_elem.type = UNDEFINED_TYPE; + struct_elem.converted_type = UNKNOWN; + + // swap children + struct_elem.children_idx = std::move(schema_elem.children_idx); + schema_elem.children_idx = {struct_node_idx}; + schema_elem.num_children = 1; + + struct_elem.max_definition_level = schema_elem.max_definition_level; + struct_elem.max_repetition_level = schema_elem.max_repetition_level; + schema_elem.max_definition_level--; + schema_elem.max_repetition_level = schema[schema_elem.parent_idx].max_repetition_level; + + // change parent index on new node and on children + struct_elem.parent_idx = schema_idx; + for (auto& child_idx : struct_elem.children_idx) { + schema[child_idx].parent_idx = struct_node_idx; + } + // add our struct + schema.push_back(struct_elem); + } + } + + for (auto& child_idx : schema_elem.children_idx) { + process(child_idx); + } + }; + + process(0); +} + metadata::metadata(datasource* source) { constexpr auto header_len = sizeof(file_header_s); @@ -195,6 +270,7 @@ metadata::metadata(datasource* source) CompactProtocolReader cp(buffer->data(), ender->footer_len); CUDF_EXPECTS(cp.read(this), "Cannot parse metadata"); CUDF_EXPECTS(cp.InitSchema(this), "Cannot initialize schema"); + sanitize_schema(); } std::vector aggregate_reader_metadata::metadatas_from_sources( @@ -344,7 +420,7 @@ std::vector aggregate_reader_metadata::get_pandas_index_names() con return names; } -std::tuple> +std::tuple> aggregate_reader_metadata::select_row_groups( host_span const> row_group_indices, int64_t skip_rows_opt, @@ -362,7 +438,7 @@ aggregate_reader_metadata::select_row_groups( host_span const>(filtered_row_group_indices.value()); } } - std::vector selection; + std::vector selection; auto [rows_to_skip, rows_to_read] = [&]() { if (not row_group_indices.empty()) { return std::pair{}; } auto const from_opts = cudf::io::detail::skip_rows_num_rows_from_options( @@ -402,7 +478,7 @@ aggregate_reader_metadata::select_row_groups( } std::tuple, - std::vector, + std::vector, std::vector> aggregate_reader_metadata::select_columns(std::optional> const& use_names, bool include_index, @@ -420,17 +496,18 @@ aggregate_reader_metadata::select_columns(std::optional : -1; }; - std::vector output_columns; + std::vector output_columns; std::vector input_columns; std::vector nesting; // Return true if column path is valid. e.g. if the path is {"struct1", "child1"}, then it is // valid if "struct1.child1" exists in this file's schema. 
If "struct1" exists but "child1" is // not a child of "struct1" then the function will return false for "struct1" - std::function&, bool)> + std::function&, bool)> build_column = [&](column_name_info const* col_name_info, int schema_idx, - std::vector& out_col_array, + std::vector& out_col_array, bool has_list_parent) { if (schema_idx < 0) { return false; } auto const& schema_elem = get_schema(schema_idx); @@ -445,13 +522,16 @@ aggregate_reader_metadata::select_columns(std::optional child_col_name_info, schema_elem.children_idx[0], out_col_array, has_list_parent); } + auto const one_level_list = schema_elem.is_one_level_list(get_schema(schema_elem.parent_idx)); + // if we're at the root, this is a new output column - auto const col_type = schema_elem.is_one_level_list(get_schema(schema_elem.parent_idx)) + auto const col_type = one_level_list ? type_id::LIST : to_type_id(schema_elem, strings_to_categorical, timestamp_type_id); auto const dtype = to_data_type(col_type, schema_elem); - inline_column_buffer output_col(dtype, schema_elem.repetition_type == OPTIONAL); + cudf::io::detail::inline_column_buffer output_col(dtype, + schema_elem.repetition_type == OPTIONAL); if (has_list_parent) { output_col.user_data |= PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT; } // store the index of this element if inserted in out_col_array nesting.push_back(static_cast(out_col_array.size())); @@ -485,13 +565,14 @@ aggregate_reader_metadata::select_columns(std::optional input_column_info{schema_idx, schema_elem.name, schema_elem.max_repetition_level > 0}); // set up child output column for one-level encoding list - if (schema_elem.is_one_level_list(get_schema(schema_elem.parent_idx))) { + if (one_level_list) { // determine the element data type auto const element_type = to_type_id(schema_elem, strings_to_categorical, timestamp_type_id); auto const element_dtype = to_data_type(element_type, schema_elem); - inline_column_buffer element_col(element_dtype, schema_elem.repetition_type == OPTIONAL); + cudf::io::detail::inline_column_buffer element_col( + element_dtype, schema_elem.repetition_type == OPTIONAL); if (has_list_parent || col_type == type_id::LIST) { element_col.user_data |= PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT; } @@ -506,9 +587,7 @@ aggregate_reader_metadata::select_columns(std::optional std::copy(nesting.cbegin(), nesting.cend(), std::back_inserter(input_col.nesting)); // pop off the extra nesting element. 
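The dotted-path validation build_column implements here is what a caller exercises when selecting nested columns; a hedged usage sketch (builder method names from the public reader API as I recall them):

auto opts = cudf::io::parquet_reader_options::builder(
              cudf::io::source_info{"example.parquet"})    // hypothetical file
              .columns({"struct1.child1"})  // valid only if child1 is under struct1
              .build();
auto result = cudf::io::read_parquet(opts);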
- if (schema_elem.is_one_level_list(get_schema(schema_elem.parent_idx))) { - nesting.pop_back(); - } + if (one_level_list) { nesting.pop_back(); } path_is_valid = true; // If we're able to reach leaf then path is valid } @@ -656,4 +735,4 @@ aggregate_reader_metadata::select_columns(std::optional std::move(input_columns), std::move(output_columns), std::move(output_column_schemas)); } -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp index 61e4f94df0f..1a73e2f55ac 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.hpp +++ b/cpp/src/io/parquet/reader_impl_helpers.hpp @@ -32,9 +32,7 @@ #include #include -namespace cudf::io::detail::parquet { - -using namespace cudf::io::parquet; +namespace cudf::io::parquet::detail { /** * @brief Function that translates Parquet datatype to cuDF type enum @@ -58,6 +56,7 @@ using namespace cudf::io::parquet; */ struct metadata : public FileMetaData { explicit metadata(datasource* source); + void sanitize_schema(); }; class aggregate_reader_metadata { @@ -181,7 +180,7 @@ class aggregate_reader_metadata { * @return A tuple of corrected row_start, row_count and list of row group indexes and its * starting row */ - [[nodiscard]] std::tuple> select_row_groups( + [[nodiscard]] std::tuple> select_row_groups( host_span const> row_group_indices, int64_t row_start, std::optional const& row_count, @@ -201,12 +200,13 @@ class aggregate_reader_metadata { * @return input column information, output column information, list of output column schema * indices */ - [[nodiscard]] std:: - tuple, std::vector, std::vector> - select_columns(std::optional> const& use_names, - bool include_index, - bool strings_to_categorical, - type_id timestamp_type_id) const; + [[nodiscard]] std::tuple, + std::vector, + std::vector> + select_columns(std::optional> const& use_names, + bool include_index, + bool strings_to_categorical, + type_id timestamp_type_id) const; }; /** @@ -275,4 +275,4 @@ class named_to_reference_converter : public ast::detail::expression_transformer std::list _operators; }; -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index c731c467f2c..4bc6bb6f43b 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -43,7 +43,8 @@ #include -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { + namespace { /** @@ -185,11 +186,11 @@ template */ [[nodiscard]] std::tuple conversion_info(type_id column_type_id, type_id timestamp_type_id, - parquet::Type physical, + Type physical, int8_t converted, int32_t length) { - int32_t type_width = (physical == parquet::FIXED_LEN_BYTE_ARRAY) ? length : 0; + int32_t type_width = (physical == FIXED_LEN_BYTE_ARRAY) ? 
length : 0; int32_t clock_rate = 0; if (column_type_id == type_id::INT8 or column_type_id == type_id::UINT8) { type_width = 1; // I32 -> I8 @@ -202,9 +203,9 @@ template } int8_t converted_type = converted; - if (converted_type == parquet::DECIMAL && column_type_id != type_id::FLOAT64 && + if (converted_type == DECIMAL && column_type_id != type_id::FLOAT64 && not cudf::is_fixed_point(data_type{column_type_id})) { - converted_type = parquet::UNKNOWN; // Not converting to float64 or decimal + converted_type = UNKNOWN; // Not converting to float64 or decimal } return std::make_tuple(type_width, clock_rate, converted_type); } @@ -226,7 +227,7 @@ template [[nodiscard]] std::future read_column_chunks_async( std::vector> const& sources, std::vector>& page_data, - cudf::detail::hostdevice_vector& chunks, + cudf::detail::hostdevice_vector& chunks, size_t begin_chunk, size_t end_chunk, std::vector const& column_chunk_offsets, @@ -239,11 +240,10 @@ template size_t const io_offset = column_chunk_offsets[chunk]; size_t io_size = chunks[chunk].compressed_size; size_t next_chunk = chunk + 1; - bool const is_compressed = (chunks[chunk].codec != parquet::Compression::UNCOMPRESSED); + bool const is_compressed = (chunks[chunk].codec != Compression::UNCOMPRESSED); while (next_chunk < end_chunk) { - size_t const next_offset = column_chunk_offsets[next_chunk]; - bool const is_next_compressed = - (chunks[next_chunk].codec != parquet::Compression::UNCOMPRESSED); + size_t const next_offset = column_chunk_offsets[next_chunk]; + bool const is_next_compressed = (chunks[next_chunk].codec != Compression::UNCOMPRESSED); if (next_offset != io_offset + io_size || is_next_compressed != is_compressed || chunk_source_map[chunk] != chunk_source_map[next_chunk]) { // Can't merge if not contiguous or mixing compressed and uncompressed @@ -300,13 +300,13 @@ template * * @return The total number of pages */ -[[nodiscard]] size_t count_page_headers( - cudf::detail::hostdevice_vector& chunks, rmm::cuda_stream_view stream) +[[nodiscard]] size_t count_page_headers(cudf::detail::hostdevice_vector& chunks, + rmm::cuda_stream_view stream) { size_t total_pages = 0; chunks.host_to_device_async(stream); - gpu::DecodePageHeaders(chunks.device_ptr(), chunks.size(), stream); + DecodePageHeaders(chunks.device_ptr(), chunks.size(), stream); chunks.device_to_host_sync(stream); for (size_t c = 0; c < chunks.size(); c++) { @@ -337,8 +337,8 @@ constexpr bool is_supported_encoding(Encoding enc) * @param stream CUDA stream used for device memory operations and kernel launches * @returns The size in bytes of level type data required */ -int decode_page_headers(cudf::detail::hostdevice_vector& chunks, - cudf::detail::hostdevice_vector& pages, +int decode_page_headers(cudf::detail::hostdevice_vector& chunks, + cudf::detail::hostdevice_vector& pages, rmm::cuda_stream_view stream) { // IMPORTANT : if you change how pages are stored within a chunk (dist pages, then data pages), @@ -350,14 +350,14 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c } chunks.host_to_device_async(stream); - gpu::DecodePageHeaders(chunks.device_ptr(), chunks.size(), stream); + DecodePageHeaders(chunks.device_ptr(), chunks.size(), stream); // compute max bytes needed for level data auto level_bit_size = cudf::detail::make_counting_transform_iterator(0, [chunks = chunks.begin()] __device__(int i) { auto c = chunks[i]; return static_cast( - max(c.level_bits[gpu::level_type::REPETITION], c.level_bits[gpu::level_type::DEFINITION])); + 
max(c.level_bits[level_type::REPETITION], c.level_bits[level_type::DEFINITION])); }); // max level data bit size. int const max_level_bits = thrust::reduce(rmm::exec_policy(stream), @@ -388,11 +388,11 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c * @return Device buffer to decompressed page data */ [[nodiscard]] rmm::device_buffer decompress_page_data( - cudf::detail::hostdevice_vector& chunks, - cudf::detail::hostdevice_vector& pages, + cudf::detail::hostdevice_vector& chunks, + cudf::detail::hostdevice_vector& pages, rmm::cuda_stream_view stream) { - auto for_each_codec_page = [&](parquet::Compression codec, std::function const& f) { + auto for_each_codec_page = [&](Compression codec, std::function const& f) { for (size_t c = 0, page_count = 0; c < chunks.size(); c++) { const auto page_stride = chunks[c].max_num_pages; if (chunks[c].codec == codec) { @@ -412,19 +412,16 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c size_t total_decomp_size = 0; struct codec_stats { - parquet::Compression compression_type = UNCOMPRESSED; - size_t num_pages = 0; - int32_t max_decompressed_size = 0; - size_t total_decomp_size = 0; + Compression compression_type = UNCOMPRESSED; + size_t num_pages = 0; + int32_t max_decompressed_size = 0; + size_t total_decomp_size = 0; }; - std::array codecs{codec_stats{parquet::GZIP}, - codec_stats{parquet::SNAPPY}, - codec_stats{parquet::BROTLI}, - codec_stats{parquet::ZSTD}}; + std::array codecs{codec_stats{GZIP}, codec_stats{SNAPPY}, codec_stats{BROTLI}, codec_stats{ZSTD}}; auto is_codec_supported = [&codecs](int8_t codec) { - if (codec == parquet::UNCOMPRESSED) return true; + if (codec == UNCOMPRESSED) return true; return std::find_if(codecs.begin(), codecs.end(), [codec](auto& cstats) { return codec == cstats.compression_type; }) != codecs.end(); @@ -445,7 +442,7 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c codec.num_pages++; num_comp_pages++; }); - if (codec.compression_type == parquet::BROTLI && codec.num_pages > 0) { + if (codec.compression_type == BROTLI && codec.num_pages > 0) { debrotli_scratch.resize(get_gpu_debrotli_scratch_size(codec.num_pages), stream); } } @@ -482,7 +479,7 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c auto& page = pages[page_idx]; // offset will only be non-zero for V2 pages auto const offset = - page.lvl_bytes[gpu::level_type::DEFINITION] + page.lvl_bytes[gpu::level_type::REPETITION]; + page.lvl_bytes[level_type::DEFINITION] + page.lvl_bytes[level_type::REPETITION]; // for V2 need to copy def and rep level info into place, and then offset the // input and output buffers. otherwise we'd have to keep both the compressed // and decompressed data. 
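The level-width computation in decode_page_headers above pairs a counting transform iterator with a single device reduction; the hunk cuts off before the reduction's trailing arguments, so here it is completed schematically (a plausible reading, not verbatim source):

auto level_bit_size = cudf::detail::make_counting_transform_iterator(
  0, [chunks = chunks.begin()] __device__(int i) {
    auto c = chunks[i];
    return static_cast<int>(max(c.level_bits[level_type::REPETITION],
                                c.level_bits[level_type::DEFINITION]));
  });
int const max_level_bits = thrust::reduce(rmm::exec_policy(stream),
                                          level_bit_size,
                                          level_bit_size + chunks.size(),
                                          0,
                                          thrust::maximum<int>());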
@@ -509,11 +506,11 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c device_span d_comp_res_view(comp_res.data() + start_pos, codec.num_pages); switch (codec.compression_type) { - case parquet::GZIP: + case GZIP: gpuinflate(d_comp_in, d_comp_out, d_comp_res_view, gzip_header_included::YES, stream); break; - case parquet::SNAPPY: - if (nvcomp_integration::is_stable_enabled()) { + case SNAPPY: + if (cudf::io::detail::nvcomp_integration::is_stable_enabled()) { nvcomp::batched_decompress(nvcomp::compression_type::SNAPPY, d_comp_in, d_comp_out, @@ -525,7 +522,7 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c gpu_unsnap(d_comp_in, d_comp_out, d_comp_res_view, stream); } break; - case parquet::ZSTD: + case ZSTD: nvcomp::batched_decompress(nvcomp::compression_type::ZSTD, d_comp_in, d_comp_out, @@ -534,7 +531,7 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c codec.total_decomp_size, stream); break; - case parquet::BROTLI: + case BROTLI: gpu_debrotli(d_comp_in, d_comp_out, d_comp_res_view, @@ -594,9 +591,9 @@ void reader::impl::allocate_nesting_info() }); page_nesting_info = - cudf::detail::hostdevice_vector{total_page_nesting_infos, _stream}; + cudf::detail::hostdevice_vector{total_page_nesting_infos, _stream}; page_nesting_decode_info = - cudf::detail::hostdevice_vector{total_page_nesting_infos, _stream}; + cudf::detail::hostdevice_vector{total_page_nesting_infos, _stream}; // update pointers in the PageInfos int target_page_index = 0; @@ -653,10 +650,10 @@ void reader::impl::allocate_nesting_info() if (!cur_schema.is_stub()) { // initialize each page within the chunk for (int p_idx = 0; p_idx < chunks[idx].num_data_pages; p_idx++) { - gpu::PageNestingInfo* pni = + PageNestingInfo* pni = &page_nesting_info[nesting_info_index + (p_idx * per_page_nesting_info_size)]; - gpu::PageNestingDecodeInfo* nesting_info = + PageNestingDecodeInfo* nesting_info = &page_nesting_decode_info[nesting_info_index + (p_idx * per_page_nesting_info_size)]; // if we have lists, set our start and end depth remappings @@ -717,9 +714,9 @@ void reader::impl::allocate_level_decode_space() for (size_t idx = 0; idx < pages.size(); idx++) { auto& p = pages[idx]; - p.lvl_decode_buf[gpu::level_type::DEFINITION] = buf; + p.lvl_decode_buf[level_type::DEFINITION] = buf; buf += (LEVEL_DECODE_BUF_SIZE * _pass_itm_data->level_type_size); - p.lvl_decode_buf[gpu::level_type::REPETITION] = buf; + p.lvl_decode_buf[level_type::REPETITION] = buf; buf += (LEVEL_DECODE_BUF_SIZE * _pass_itm_data->level_type_size); } } @@ -824,25 +821,25 @@ void reader::impl::load_global_chunk_info() schema.converted_type, schema.type_length); - chunks.push_back(gpu::ColumnChunkDesc(col_meta.total_compressed_size, - nullptr, - col_meta.num_values, - schema.type, - type_width, - row_group_start, - row_group_rows, - schema.max_definition_level, - schema.max_repetition_level, - _metadata->get_output_nesting_depth(col.schema_idx), - required_bits(schema.max_definition_level), - required_bits(schema.max_repetition_level), - col_meta.codec, - converted_type, - schema.logical_type, - schema.decimal_precision, - clock_rate, - i, - col.schema_idx)); + chunks.push_back(ColumnChunkDesc(col_meta.total_compressed_size, + nullptr, + col_meta.num_values, + schema.type, + type_width, + row_group_start, + row_group_rows, + schema.max_definition_level, + schema.max_repetition_level, + _metadata->get_output_nesting_depth(col.schema_idx), + required_bits(schema.max_definition_level), + required_bits(schema.max_repetition_level), + 
col_meta.codec, + converted_type, + schema.logical_type, + schema.decimal_precision, + clock_rate, + i, + col.schema_idx)); } remaining_rows -= row_group_rows; @@ -909,7 +906,7 @@ void reader::impl::compute_input_pass_row_group_info() void reader::impl::setup_pass() { // this will also cause the previous pass information to be deleted - _pass_itm_data = std::make_unique(); + _pass_itm_data = std::make_unique(); // setup row groups to be loaded for this pass auto const row_group_start = _input_pass_row_group_offsets[_current_input_pass]; @@ -929,8 +926,7 @@ void reader::impl::setup_pass() auto chunk_start = _file_itm_data.chunks.begin() + (row_group_start * chunks_per_rowgroup); auto chunk_end = _file_itm_data.chunks.begin() + (row_group_end * chunks_per_rowgroup); - _pass_itm_data->chunks = - cudf::detail::hostdevice_vector(num_chunks, _stream); + _pass_itm_data->chunks = cudf::detail::hostdevice_vector(num_chunks, _stream); std::copy(chunk_start, chunk_end, _pass_itm_data->chunks.begin()); // adjust skip_rows and num_rows by what's available in the row groups we are processing @@ -970,7 +966,7 @@ void reader::impl::load_and_decompress_data() // Process dataset chunk pages into output columns auto const total_pages = count_page_headers(chunks, _stream); if (total_pages <= 0) { return; } - pages = cudf::detail::hostdevice_vector(total_pages, total_pages, _stream); + pages = cudf::detail::hostdevice_vector(total_pages, total_pages, _stream); // decoding of column/page information _pass_itm_data->level_type_size = decode_page_headers(chunks, pages, _stream); @@ -978,7 +974,7 @@ void reader::impl::load_and_decompress_data() decomp_page_data = decompress_page_data(chunks, pages, _stream); // Free compressed data for (size_t c = 0; c < chunks.size(); c++) { - if (chunks[c].codec != parquet::Compression::UNCOMPRESSED) { raw_page_data[c].reset(); } + if (chunks[c].codec != Compression::UNCOMPRESSED) { raw_page_data[c].reset(); } } } @@ -1019,14 +1015,13 @@ struct cumulative_row_info { }; #if defined(PREPROCESS_DEBUG) -void print_pages(cudf::detail::hostdevice_vector& pages, - rmm::cuda_stream_view _stream) +void print_pages(cudf::detail::hostdevice_vector& pages, rmm::cuda_stream_view _stream) { pages.device_to_host_sync(_stream); for (size_t idx = 0; idx < pages.size(); idx++) { auto const& p = pages[idx]; // skip dictionary pages - if (p.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) { continue; } + if (p.flags & PAGEINFO_FLAGS_DICTIONARY) { continue; } printf( "P(%lu, s:%d): chunk_row(%d), num_rows(%d), skipped_values(%d), skipped_leaf_values(%d), " "str_bytes(%d)\n", @@ -1040,7 +1035,7 @@ void print_pages(cudf::detail::hostdevice_vector& pages, } } -void print_cumulative_page_info(cudf::detail::hostdevice_vector& pages, +void print_cumulative_page_info(cudf::detail::hostdevice_vector& pages, rmm::device_uvector const& page_index, rmm::device_uvector const& c_info, rmm::cuda_stream_view stream) @@ -1067,7 +1062,7 @@ void print_cumulative_page_info(cudf::detail::hostdevice_vector& printf("Schema %d\n", schemas[idx]); for (size_t pidx = 0; pidx < pages.size(); pidx++) { auto const& page = pages[h_page_index[pidx]]; - if (page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY || page.src_col_schema != schemas[idx]) { + if (page.flags & PAGEINFO_FLAGS_DICTIONARY || page.src_col_schema != schemas[idx]) { continue; } printf("\tP: {%lu, %lu}\n", h_cinfo[pidx].row_count, h_cinfo[pidx].size_bytes); @@ -1075,10 +1070,9 @@ void print_cumulative_page_info(cudf::detail::hostdevice_vector& } } -void print_cumulative_row_info( 
- host_span sizes, - std::string const& label, - std::optional> splits = std::nullopt) +void print_cumulative_row_info(host_span sizes, + std::string const& label, + std::optional> splits = std::nullopt) { if (splits.has_value()) { printf("------------\nSplits\n"); @@ -1093,7 +1087,7 @@ void print_cumulative_row_info( if (splits.has_value()) { // if we have a split at this row count and this is the last instance of this row count auto start = thrust::make_transform_iterator( - splits->begin(), [](gpu::chunk_read_info const& i) { return i.skip_rows; }); + splits->begin(), [](chunk_read_info const& i) { return i.skip_rows; }); auto end = start + splits->size(); auto split = std::find(start, end, sizes[idx].row_count); auto const split_index = [&]() -> int { @@ -1180,12 +1174,12 @@ __device__ size_t row_size_functor::operator()(size_t num_rows, boo * Sums across all nesting levels. */ struct get_cumulative_row_info { - gpu::PageInfo const* const pages; + PageInfo const* const pages; __device__ cumulative_row_info operator()(size_type index) { auto const& page = pages[index]; - if (page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) { + if (page.flags & PAGEINFO_FLAGS_DICTIONARY) { return cumulative_row_info{0, 0, page.src_col_schema}; } @@ -1250,15 +1244,15 @@ struct row_total_size { * @param num_rows Total number of rows to read * @param chunk_read_limit Limit on total number of bytes to be returned per read, for all columns */ -std::vector find_splits(std::vector const& sizes, - size_t num_rows, - size_t chunk_read_limit) +std::vector find_splits(std::vector const& sizes, + size_t num_rows, + size_t chunk_read_limit) { // now we have an array of {row_count, real output bytes}. just walk through it and generate // splits. // TODO: come up with a clever way to do this entirely in parallel. 
For now, as long as batch // sizes are reasonably large, this shouldn't iterate too many times - std::vector splits; + std::vector splits; { size_t cur_pos = 0; size_t cur_cumulative_size = 0; @@ -1290,7 +1284,7 @@ std::vector find_splits(std::vector c auto const start_row = cur_row_count; cur_row_count = sizes[split_pos].row_count; - splits.push_back(gpu::chunk_read_info{start_row, cur_row_count - start_row}); + splits.push_back(chunk_read_info{start_row, cur_row_count - start_row}); cur_pos = split_pos; cur_cumulative_size = sizes[split_pos].size_bytes; } @@ -1311,12 +1305,11 @@ std::vector find_splits(std::vector c * @param chunk_read_limit Limit on total number of bytes to be returned per read, for all columns * @param stream CUDA stream to use */ -std::vector compute_splits( - cudf::detail::hostdevice_vector& pages, - gpu::pass_intermediate_data const& id, - size_t num_rows, - size_t chunk_read_limit, - rmm::cuda_stream_view stream) +std::vector compute_splits(cudf::detail::hostdevice_vector& pages, + pass_intermediate_data const& id, + size_t num_rows, + size_t chunk_read_limit, + rmm::cuda_stream_view stream) { auto const& page_keys = id.page_keys; auto const& page_index = id.page_index; @@ -1395,16 +1388,16 @@ std::vector compute_splits( } struct get_page_chunk_idx { - __device__ size_type operator()(gpu::PageInfo const& page) { return page.chunk_idx; } + __device__ size_type operator()(PageInfo const& page) { return page.chunk_idx; } }; struct get_page_num_rows { - __device__ size_type operator()(gpu::PageInfo const& page) { return page.num_rows; } + __device__ size_type operator()(PageInfo const& page) { return page.num_rows; } }; struct get_page_column_index { - gpu::ColumnChunkDesc const* chunks; - __device__ size_type operator()(gpu::PageInfo const& page) + ColumnChunkDesc const* chunks; + __device__ size_type operator()(PageInfo const& page) { return chunks[page.chunk_idx].src_col_index; } @@ -1441,7 +1434,7 @@ struct get_page_nesting_size { input_col_info const* const input_cols; size_type const max_depth; size_t const num_pages; - gpu::PageInfo const* const pages; + PageInfo const* const pages; int const* page_indices; __device__ size_type operator()(size_t index) const @@ -1450,7 +1443,7 @@ struct get_page_nesting_size { auto const& page = pages[page_indices[indices.page_idx]]; if (page.src_col_schema != input_cols[indices.col_idx].schema_idx || - page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY || + page.flags & PAGEINFO_FLAGS_DICTIONARY || indices.depth_idx >= input_cols[indices.col_idx].nesting_depth) { return 0; } @@ -1468,7 +1461,7 @@ struct get_reduction_key { * @brief Writes to the chunk_row field of the PageInfo struct. */ struct chunk_row_output_iter { - gpu::PageInfo* p; + PageInfo* p; using value_type = size_type; using difference_type = size_type; using pointer = size_type*; @@ -1490,7 +1483,7 @@ struct chunk_row_output_iter { * @brief Writes to the page_start_value field of the PageNestingInfo struct, keyed by schema. 
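find_splits above walks the cumulative {row_count, size_bytes} array and cuts a new output chunk whenever the running byte size would pass the limit. A self-contained restatement of that greedy walk, using this file's cumulative_row_info and chunk_read_info types (simplified; the real loop tracks indices rather than re-scanning):

std::vector<chunk_read_info> find_splits_sketch(
  std::vector<cumulative_row_info> const& sizes, size_t chunk_read_limit)
{
  std::vector<chunk_read_info> splits;
  size_t start_row = 0, cur_cumulative_size = 0;
  for (size_t i = 0; i < sizes.size(); i++) {
    bool const is_last = (i + 1 == sizes.size());
    // cut here if at the end, or if taking the next entry would exceed the limit
    if (is_last || sizes[i + 1].size_bytes - cur_cumulative_size > chunk_read_limit) {
      splits.push_back(chunk_read_info{start_row, sizes[i].row_count - start_row});
      start_row           = sizes[i].row_count;
      cur_cumulative_size = sizes[i].size_bytes;
    }
  }
  return splits;
}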
*/ struct start_offset_output_iterator { - gpu::PageInfo const* pages; + PageInfo const* pages; int const* page_indices; size_t cur_index; input_col_info const* input_cols; @@ -1529,9 +1522,9 @@ struct start_offset_output_iterator { { auto const indices = reduction_indices{index, max_depth, num_pages}; - gpu::PageInfo const& p = pages[page_indices[indices.page_idx]]; + PageInfo const& p = pages[page_indices[indices.page_idx]]; if (p.src_col_schema != input_cols[indices.col_idx].schema_idx || - p.flags & gpu::PAGEINFO_FLAGS_DICTIONARY || + p.flags & PAGEINFO_FLAGS_DICTIONARY || indices.depth_idx >= input_cols[indices.col_idx].nesting_depth) { return empty; } @@ -1540,15 +1533,15 @@ struct start_offset_output_iterator { }; struct flat_column_num_rows { - gpu::PageInfo const* pages; - gpu::ColumnChunkDesc const* chunks; + PageInfo const* pages; + ColumnChunkDesc const* chunks; __device__ size_type operator()(size_type pindex) const { - gpu::PageInfo const& page = pages[pindex]; + PageInfo const& page = pages[pindex]; // ignore dictionary pages and pages belonging to any column containing repetition (lists) - if ((page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) || - (chunks[page.chunk_idx].max_level[gpu::level_type::REPETITION] > 0)) { + if ((page.flags & PAGEINFO_FLAGS_DICTIONARY) || + (chunks[page.chunk_idx].max_level[level_type::REPETITION] > 0)) { return 0; } return page.num_rows; @@ -1581,8 +1574,8 @@ struct row_counts_different { * @param expected_row_count Expected row count, if applicable * @param stream CUDA stream used for device memory operations and kernel launches */ -void detect_malformed_pages(cudf::detail::hostdevice_vector& pages, - cudf::detail::hostdevice_vector const& chunks, +void detect_malformed_pages(cudf::detail::hostdevice_vector& pages, + cudf::detail::hostdevice_vector const& chunks, device_span page_keys, device_span page_index, std::optional expected_row_count, @@ -1631,23 +1624,21 @@ void detect_malformed_pages(cudf::detail::hostdevice_vector& page } struct page_to_string_size { - gpu::PageInfo* pages; - gpu::ColumnChunkDesc const* chunks; + PageInfo* pages; + ColumnChunkDesc const* chunks; __device__ size_t operator()(size_type page_idx) const { auto const page = pages[page_idx]; auto const chunk = chunks[page.chunk_idx]; - if (not is_string_col(chunk) || (page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) != 0) { - return 0; - } + if (not is_string_col(chunk) || (page.flags & PAGEINFO_FLAGS_DICTIONARY) != 0) { return 0; } return pages[page_idx].str_bytes; } }; struct page_offset_output_iter { - gpu::PageInfo* p; + PageInfo* p; size_type const* index; using value_type = size_type; @@ -1738,7 +1729,7 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re cols = &out_buf.children; // if this has a list parent, we have to get column sizes from the - // data computed during gpu::ComputePageSizes + // data computed during ComputePageSizes if (out_buf.user_data & PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT) { has_lists = true; break; @@ -1749,7 +1740,7 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re // generate string dict indices if necessary { - auto is_dict_chunk = [](gpu::ColumnChunkDesc const& chunk) { + auto is_dict_chunk = [](ColumnChunkDesc const& chunk) { return (chunk.data_type & 0x7) == BYTE_ARRAY && chunk.num_dict_pages > 0; }; @@ -1785,7 +1776,7 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re if (total_str_dict_indexes > 0) { chunks.host_to_device_async(_stream); - 
gpu::BuildStringDictionaryIndex(chunks.device_ptr(), chunks.size(), _stream); + BuildStringDictionaryIndex(chunks.device_ptr(), chunks.size(), _stream); } } @@ -1800,14 +1791,14 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re // if: // - user has passed custom row bounds // - we will be doing a chunked read - gpu::ComputePageSizes(pages, - chunks, - 0, // 0-max size_t. process all possible rows - std::numeric_limits::max(), - true, // compute num_rows - chunk_read_limit > 0, // compute string sizes - _pass_itm_data->level_type_size, - _stream); + ComputePageSizes(pages, + chunks, + 0, // 0-max size_t. process all possible rows + std::numeric_limits::max(), + true, // compute num_rows + chunk_read_limit > 0, // compute string sizes + _pass_itm_data->level_type_size, + _stream); // computes: // PageInfo::chunk_row (the absolute start row index) for all pages @@ -1836,7 +1827,7 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re _pass_itm_data->output_chunk_read_info = _output_chunk_read_limit > 0 ? compute_splits(pages, *_pass_itm_data, num_rows, chunk_read_limit, _stream) - : std::vector{{skip_rows, num_rows}}; + : std::vector{{skip_rows, num_rows}}; } void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses_custom_row_bounds) @@ -1853,14 +1844,14 @@ void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses // respect the user bounds. It is only necessary to do this second pass if uses_custom_row_bounds // is set (if the user has specified artificial bounds). if (uses_custom_row_bounds) { - gpu::ComputePageSizes(pages, - chunks, - skip_rows, - num_rows, - false, // num_rows is already computed - false, // no need to compute string sizes - _pass_itm_data->level_type_size, - _stream); + ComputePageSizes(pages, + chunks, + skip_rows, + num_rows, + false, // num_rows is already computed + false, // no need to compute string sizes + _pass_itm_data->level_type_size, + _stream); // print_pages(pages, _stream); } @@ -1879,7 +1870,7 @@ void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses cols = &out_buf.children; // if this has a list parent, we have to get column sizes from the - // data computed during gpu::ComputePageSizes + // data computed during ComputePageSizes if (out_buf.user_data & PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT) { has_lists = true; } @@ -2014,4 +2005,4 @@ std::vector reader::impl::calculate_page_string_offsets() return col_sizes; } -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/rle_stream.cuh b/cpp/src/io/parquet/rle_stream.cuh index 2545a074a38..799d6d9fd64 100644 --- a/cpp/src/io/parquet/rle_stream.cuh +++ b/cpp/src/io/parquet/rle_stream.cuh @@ -20,7 +20,7 @@ #include #include -namespace cudf::io::parquet::gpu { +namespace cudf::io::parquet::detail { template constexpr int rle_stream_required_run_buffer_size() @@ -362,4 +362,4 @@ struct rle_stream { } }; -} // namespace cudf::io::parquet::gpu +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index a124f352ee4..50589f23626 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -54,12 +54,9 @@ #include #include -namespace cudf { -namespace io { -namespace detail { -namespace parquet { -using namespace cudf::io::parquet; -using namespace cudf::io; +namespace cudf::io::parquet::detail { + +using namespace 
cudf::io::detail; struct aggregate_writer_metadata { aggregate_writer_metadata(host_span partitions, @@ -185,13 +182,13 @@ namespace { * @param compression The compression type * @return The supported Parquet compression */ -parquet::Compression to_parquet_compression(compression_type compression) +Compression to_parquet_compression(compression_type compression) { switch (compression) { case compression_type::AUTO: - case compression_type::SNAPPY: return parquet::Compression::SNAPPY; - case compression_type::ZSTD: return parquet::Compression::ZSTD; - case compression_type::NONE: return parquet::Compression::UNCOMPRESSED; + case compression_type::SNAPPY: return Compression::SNAPPY; + case compression_type::ZSTD: return Compression::ZSTD; + case compression_type::NONE: return Compression::UNCOMPRESSED; default: CUDF_FAIL("Unsupported compression type"); } } @@ -206,7 +203,7 @@ void update_chunk_encodings(std::vector& encodings, uint32_t enc_mask) { for (uint8_t enc = 0; enc < static_cast(Encoding::NUM_ENCODINGS); enc++) { auto const enc_enum = static_cast(enc); - if ((enc_mask & gpu::encoding_to_mask(enc_enum)) != 0) { encodings.push_back(enc_enum); } + if ((enc_mask & encoding_to_mask(enc_enum)) != 0) { encodings.push_back(enc_enum); } } } @@ -761,11 +758,11 @@ struct parquet_column_view { std::vector const& schema_tree, rmm::cuda_stream_view stream); - [[nodiscard]] gpu::parquet_column_device_view get_device_view(rmm::cuda_stream_view stream) const; + [[nodiscard]] parquet_column_device_view get_device_view(rmm::cuda_stream_view stream) const; [[nodiscard]] column_view cudf_column_view() const { return cudf_col; } - [[nodiscard]] parquet::Type physical_type() const { return schema_node.type; } - [[nodiscard]] parquet::ConvertedType converted_type() const { return schema_node.converted_type; } + [[nodiscard]] Type physical_type() const { return schema_node.type; } + [[nodiscard]] ConvertedType converted_type() const { return schema_node.converted_type; } std::vector const& get_path_in_schema() { return path_in_schema; } @@ -846,11 +843,11 @@ parquet_column_view::parquet_column_view(schema_tree_node const& schema_node, uint16_t max_rep_level = 0; curr_schema_node = schema_node; while (curr_schema_node.parent_idx != -1) { - if (curr_schema_node.repetition_type == parquet::REPEATED or - curr_schema_node.repetition_type == parquet::OPTIONAL) { + if (curr_schema_node.repetition_type == REPEATED or + curr_schema_node.repetition_type == OPTIONAL) { ++max_def_level; } - if (curr_schema_node.repetition_type == parquet::REPEATED) { ++max_rep_level; } + if (curr_schema_node.repetition_type == REPEATED) { ++max_rep_level; } curr_schema_node = schema_tree[curr_schema_node.parent_idx]; } CUDF_EXPECTS(max_def_level < 256, "Definition levels above 255 are not supported"); @@ -897,9 +894,9 @@ parquet_column_view::parquet_column_view(schema_tree_node const& schema_node, } } -gpu::parquet_column_device_view parquet_column_view::get_device_view(rmm::cuda_stream_view) const +parquet_column_device_view parquet_column_view::get_device_view(rmm::cuda_stream_view) const { - auto desc = gpu::parquet_column_device_view{}; // Zero out all fields + auto desc = parquet_column_device_view{}; // Zero out all fields desc.stats_dtype = schema_node.stats_dtype; desc.ts_scale = schema_node.ts_scale; @@ -931,8 +928,8 @@ gpu::parquet_column_device_view parquet_column_view::get_device_view(rmm::cuda_s * @param fragment_size Number of rows per fragment * @param stream CUDA stream used for device memory operations and kernel 
launches */ -void init_row_group_fragments(cudf::detail::hostdevice_2dvector& frag, - device_span col_desc, +void init_row_group_fragments(cudf::detail::hostdevice_2dvector& frag, + device_span col_desc, host_span partitions, device_span part_frag_offset, uint32_t fragment_size, @@ -940,7 +937,7 @@ void init_row_group_fragments(cudf::detail::hostdevice_2dvector frag, +void calculate_page_fragments(device_span frag, host_span frag_sizes, rmm::cuda_stream_view stream) { auto d_frag_sz = cudf::detail::make_device_uvector_async( frag_sizes, stream, rmm::mr::get_current_device_resource()); - gpu::CalculatePageFragments(frag, d_frag_sz, stream); + CalculatePageFragments(frag, d_frag_sz, stream); } /** @@ -972,13 +969,13 @@ void calculate_page_fragments(device_span frag, * @param stream CUDA stream used for device memory operations and kernel launches */ void gather_fragment_statistics(device_span frag_stats, - device_span frags, + device_span frags, bool int96_timestamps, rmm::cuda_stream_view stream) { rmm::device_uvector frag_stats_group(frag_stats.size(), stream); - gpu::InitFragmentStatistics(frag_stats_group, frags, stream); + InitFragmentStatistics(frag_stats_group, frags, stream); detail::calculate_group_statistics( frag_stats.data(), frag_stats_group.data(), frag_stats.size(), stream, int96_timestamps); stream.synchronize(); @@ -1008,8 +1005,8 @@ size_t max_compression_output_size(Compression codec, uint32_t compression_block return compress_max_output_chunk_size(to_nvcomp_compression_type(codec), compression_blocksize); } -auto init_page_sizes(hostdevice_2dvector& chunks, - device_span col_desc, +auto init_page_sizes(hostdevice_2dvector& chunks, + device_span col_desc, uint32_t num_columns, size_t max_page_size_bytes, size_type max_page_size_rows, @@ -1021,19 +1018,19 @@ auto init_page_sizes(hostdevice_2dvector& chunks, chunks.host_to_device_async(stream); // Calculate number of pages and store in respective chunks - gpu::InitEncoderPages(chunks, - {}, - {}, - {}, - col_desc, - num_columns, - max_page_size_bytes, - max_page_size_rows, - page_alignment(compression_codec), - write_v2_headers, - nullptr, - nullptr, - stream); + InitEncoderPages(chunks, + {}, + {}, + {}, + col_desc, + num_columns, + max_page_size_bytes, + max_page_size_rows, + page_alignment(compression_codec), + write_v2_headers, + nullptr, + nullptr, + stream); chunks.device_to_host_sync(stream); int num_pages = 0; @@ -1046,19 +1043,19 @@ auto init_page_sizes(hostdevice_2dvector& chunks, // Now that we know the number of pages, allocate an array to hold per page size and get it // populated cudf::detail::hostdevice_vector page_sizes(num_pages, stream); - gpu::InitEncoderPages(chunks, - {}, - page_sizes, - {}, - col_desc, - num_columns, - max_page_size_bytes, - max_page_size_rows, - page_alignment(compression_codec), - write_v2_headers, - nullptr, - nullptr, - stream); + InitEncoderPages(chunks, + {}, + page_sizes, + {}, + col_desc, + num_columns, + max_page_size_bytes, + max_page_size_rows, + page_alignment(compression_codec), + write_v2_headers, + nullptr, + nullptr, + stream); page_sizes.device_to_host_sync(stream); // Get per-page max compressed size @@ -1072,26 +1069,26 @@ auto init_page_sizes(hostdevice_2dvector& chunks, comp_page_sizes.host_to_device_async(stream); // Use per-page max compressed size to calculate chunk.compressed_size - gpu::InitEncoderPages(chunks, - {}, - {}, - comp_page_sizes, - col_desc, - num_columns, - max_page_size_bytes, - max_page_size_rows, - page_alignment(compression_codec), - 
write_v2_headers, - nullptr, - nullptr, - stream); + InitEncoderPages(chunks, + {}, + {}, + comp_page_sizes, + col_desc, + num_columns, + max_page_size_bytes, + max_page_size_rows, + page_alignment(compression_codec), + write_v2_headers, + nullptr, + nullptr, + stream); chunks.device_to_host_sync(stream); return comp_page_sizes; } size_t max_page_bytes(Compression compression, size_t max_page_size_bytes) { - if (compression == parquet::Compression::UNCOMPRESSED) { return max_page_size_bytes; } + if (compression == Compression::UNCOMPRESSED) { return max_page_size_bytes; } auto const ncomp_type = to_nvcomp_compression_type(compression); auto const nvcomp_limit = nvcomp::is_compression_disabled(ncomp_type) @@ -1104,9 +1101,9 @@ size_t max_page_bytes(Compression compression, size_t max_page_size_bytes) } std::pair>, std::vector>> -build_chunk_dictionaries(hostdevice_2dvector& chunks, - host_span col_desc, - device_2dspan frags, +build_chunk_dictionaries(hostdevice_2dvector& chunks, + host_span col_desc, + device_2dspan frags, Compression compression, dictionary_policy dict_policy, size_t max_dict_size, @@ -1130,7 +1127,7 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, } // Allocate slots for each chunk - std::vector> hash_maps_storage; + std::vector> hash_maps_storage; hash_maps_storage.reserve(h_chunks.size()); for (auto& chunk : h_chunks) { if (col_desc[chunk.col_desc_id].physical_type == Type::BOOLEAN || @@ -1149,8 +1146,8 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, chunks.host_to_device_async(stream); - gpu::initialize_chunk_hash_maps(chunks.device_view().flat_view(), stream); - gpu::populate_chunk_hash_maps(frags, stream); + initialize_chunk_hash_maps(chunks.device_view().flat_view(), stream); + populate_chunk_hash_maps(frags, stream); chunks.device_to_host_sync(stream); @@ -1197,8 +1194,8 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, chunk.dict_index = inserted_dict_index.data(); } chunks.host_to_device_async(stream); - gpu::collect_map_entries(chunks.device_view().flat_view(), stream); - gpu::get_dictionary_indices(frags, stream); + collect_map_entries(chunks.device_view().flat_view(), stream); + get_dictionary_indices(frags, stream); return std::pair(std::move(dict_data), std::move(dict_index)); } @@ -1221,9 +1218,9 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, * @param write_v2_headers True if version 2 page headers are to be written * @param stream CUDA stream used for device memory operations and kernel launches */ -void init_encoder_pages(hostdevice_2dvector& chunks, - device_span col_desc, - device_span pages, +void init_encoder_pages(hostdevice_2dvector& chunks, + device_span col_desc, + device_span pages, cudf::detail::hostdevice_vector& comp_page_sizes, statistics_chunk* page_stats, statistics_chunk* frag_stats, @@ -1286,8 +1283,8 @@ void init_encoder_pages(hostdevice_2dvector& chunks, * @param write_v2_headers True if V2 page headers should be written * @param stream CUDA stream used for device memory operations and kernel launches */ -void encode_pages(hostdevice_2dvector& chunks, - device_span pages, +void encode_pages(hostdevice_2dvector& chunks, + device_span pages, uint32_t pages_in_batch, uint32_t first_page_in_batch, uint32_t rowgroups_in_batch, @@ -1308,8 +1305,7 @@ void encode_pages(hostdevice_2dvector& chunks, ? device_span(page_stats + first_page_in_batch, pages_in_batch) : device_span(); - uint32_t max_comp_pages = - (compression != parquet::Compression::UNCOMPRESSED) ? 
pages_in_batch : 0; + uint32_t max_comp_pages = (compression != Compression::UNCOMPRESSED) ? pages_in_batch : 0; rmm::device_uvector> comp_in(max_comp_pages, stream); rmm::device_uvector> comp_out(max_comp_pages, stream); @@ -1319,9 +1315,9 @@ void encode_pages(hostdevice_2dvector& chunks, comp_res.end(), compression_result{0, compression_status::FAILURE}); - gpu::EncodePages(batch_pages, write_v2_headers, comp_in, comp_out, comp_res, stream); + EncodePages(batch_pages, write_v2_headers, comp_in, comp_out, comp_res, stream); switch (compression) { - case parquet::Compression::SNAPPY: + case Compression::SNAPPY: if (nvcomp::is_compression_disabled(nvcomp::compression_type::SNAPPY)) { gpu_snap(comp_in, comp_out, comp_res, stream); } else { @@ -1329,7 +1325,7 @@ void encode_pages(hostdevice_2dvector& chunks, nvcomp::compression_type::SNAPPY, comp_in, comp_out, comp_res, stream); } break; - case parquet::Compression::ZSTD: { + case Compression::ZSTD: { if (auto const reason = nvcomp::is_compression_disabled(nvcomp::compression_type::ZSTD); reason) { CUDF_FAIL("Compression error: " + reason.value()); @@ -1338,7 +1334,7 @@ void encode_pages(hostdevice_2dvector& chunks, break; } - case parquet::Compression::UNCOMPRESSED: break; + case Compression::UNCOMPRESSED: break; default: CUDF_FAIL("invalid compression type"); } @@ -1378,7 +1374,7 @@ void encode_pages(hostdevice_2dvector& chunks, * @param column_index_truncate_length maximum length of min or max values in column index, in bytes * @return Computed buffer size needed to encode the column index */ -size_t column_index_buffer_size(gpu::EncColumnChunk* ck, int32_t column_index_truncate_length) +size_t column_index_buffer_size(EncColumnChunk* ck, int32_t column_index_truncate_length) { // encoding the column index for a given chunk requires: // each list (4 of them) requires 6 bytes of overhead @@ -1499,8 +1495,8 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, std::vector this_table_schema(schema_tree.begin(), schema_tree.end()); // Initialize column description - cudf::detail::hostdevice_vector col_desc(parquet_columns.size(), - stream); + cudf::detail::hostdevice_vector col_desc(parquet_columns.size(), + stream); std::transform( parquet_columns.begin(), parquet_columns.end(), col_desc.host_ptr(), [&](auto const& pcol) { return pcol.get_device_view(stream); @@ -1576,7 +1572,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, auto d_part_frag_offset = cudf::detail::make_device_uvector_async( part_frag_offset, stream, rmm::mr::get_current_device_resource()); - cudf::detail::hostdevice_2dvector row_group_fragments( + cudf::detail::hostdevice_2dvector row_group_fragments( num_columns, num_fragments, stream); // Create table_device_view so that corresponding column_device_view data @@ -1588,7 +1584,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, if (num_fragments != 0) { // Move column info to device col_desc.host_to_device_async(stream); - leaf_column_views = create_leaf_column_device_views( + leaf_column_views = create_leaf_column_device_views( col_desc, *parent_column_table_device_view, stream); init_row_group_fragments(row_group_fragments, @@ -1662,7 +1658,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, // Initialize row groups and column chunks auto const num_chunks = num_rowgroups * num_columns; - hostdevice_2dvector chunks(num_rowgroups, num_columns, stream); + hostdevice_2dvector chunks(num_rowgroups, num_columns, stream); // total fragments per 
column (in case they are non-uniform) std::vector frags_per_column(num_columns, 0); @@ -1678,7 +1674,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, row_group.total_byte_size = 0; row_group.columns.resize(num_columns); for (int c = 0; c < num_columns; c++) { - gpu::EncColumnChunk& ck = chunks[r + first_rg_in_part[p]][c]; + EncColumnChunk& ck = chunks[r + first_rg_in_part[p]][c]; ck = {}; ck.col_desc = col_desc.device_ptr() + c; @@ -1700,7 +1696,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, return l + r.num_values; }); ck.plain_data_size = std::accumulate( - chunk_fragments.begin(), chunk_fragments.end(), 0, [](int sum, gpu::PageFragment frag) { + chunk_fragments.begin(), chunk_fragments.end(), 0, [](int sum, PageFragment frag) { return sum + frag.fragment_data_size; }); auto& column_chunk_meta = row_group.columns[c].meta_data; @@ -1731,7 +1727,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, frags_per_column.empty() ? 0 : frag_offsets.back() + frags_per_column.back(); rmm::device_uvector frag_stats(0, stream); - cudf::detail::hostdevice_vector page_fragments(total_frags, stream); + cudf::detail::hostdevice_vector page_fragments(total_frags, stream); // update fragments and/or prepare for fragment statistics calculation if necessary if (total_frags != 0) { @@ -1749,9 +1745,9 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, auto const& row_group = agg_meta->file(p).row_groups[global_r]; uint32_t const fragments_in_chunk = util::div_rounding_up_unsafe(row_group.num_rows, frag_size); - gpu::EncColumnChunk& ck = chunks[r + first_rg_in_part[p]][c]; - ck.fragments = page_fragments.device_ptr(frag_offset); - ck.first_fragment = frag_offset; + EncColumnChunk& ck = chunks[r + first_rg_in_part[p]][c]; + ck.fragments = page_fragments.device_ptr(frag_offset); + ck.first_fragment = frag_offset; // update the chunk pointer here for each fragment in chunk.fragments for (uint32_t i = 0; i < fragments_in_chunk; i++) { @@ -1817,8 +1813,8 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, size_t comp_rowgroup_size = 0; if (r < num_rowgroups) { for (int i = 0; i < num_columns; i++) { - gpu::EncColumnChunk* ck = &chunks[r][i]; - ck->first_page = num_pages; + EncColumnChunk* ck = &chunks[r][i]; + ck->first_page = num_pages; num_pages += ck->num_pages; pages_in_batch += ck->num_pages; rowgroup_size += ck->bfr_size; @@ -1850,7 +1846,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, } // Clear compressed buffer size if compression has been turned off - if (compression == parquet::Compression::UNCOMPRESSED) { max_comp_bfr_size = 0; } + if (compression == Compression::UNCOMPRESSED) { max_comp_bfr_size = 0; } // Initialize data pointers in batch uint32_t const num_stats_bfr = @@ -1864,7 +1860,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, stream); rmm::device_buffer col_idx_bfr(column_index_bfr_size, stream); - rmm::device_uvector pages(num_pages, stream); + rmm::device_uvector pages(num_pages, stream); // This contains stats for both the pages and the rowgroups. TODO: make them separate. 
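// (An aside on the batch setup that follows: each chunk's uncompressed,
// compressed, and column-index regions are carved out of three flat device
// allocations by pointer-bumping, with bfr, bfr_c, and bfr_i advanced by the
// chunk's bfr_size, compressed_size, and column-index size respectively. That
// keeps the writer at one allocation per buffer kind rather than one per chunk.)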
rmm::device_uvector page_stats(num_stats_bfr, stream); @@ -1874,10 +1870,10 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, auto bfr_c = static_cast(comp_bfr.data()); for (auto j = 0; j < batch_list[b]; j++, r++) { for (auto i = 0; i < num_columns; i++) { - gpu::EncColumnChunk& ck = chunks[r][i]; - ck.uncompressed_bfr = bfr; - ck.compressed_bfr = bfr_c; - ck.column_index_blob = bfr_i; + EncColumnChunk& ck = chunks[r][i]; + ck.uncompressed_bfr = bfr; + ck.compressed_bfr = bfr_c; + ck.column_index_blob = bfr_i; bfr += ck.bfr_size; bfr_c += ck.compressed_size; if (stats_granularity == statistics_freq::STATISTICS_COLUMN) { @@ -1960,7 +1956,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, if (ck.ck_stat_size != 0) { std::vector const stats_blob = cudf::detail::make_std_vector_sync( device_span(dev_bfr, ck.ck_stat_size), stream); - cudf::io::parquet::CompactProtocolReader cp(stats_blob.data(), stats_blob.size()); + CompactProtocolReader cp(stats_blob.data(), stats_blob.size()); cp.read(&column_chunk_meta.statistics); need_sync = true; } @@ -2142,8 +2138,8 @@ void writer::impl::write(table_view const& input, std::vector co void writer::impl::write_parquet_data_to_sink( std::unique_ptr& updated_agg_meta, - device_span pages, - host_2dspan chunks, + device_span pages, + host_2dspan chunks, host_span global_rowgroup_base, host_span first_rg_in_part, host_span batch_list, @@ -2209,7 +2205,7 @@ void writer::impl::write_parquet_data_to_sink( int const global_r = global_rowgroup_base[p] + r - first_rg_in_part[p]; auto const& row_group = _agg_meta->file(p).row_groups[global_r]; for (std::size_t i = 0; i < num_columns; i++) { - gpu::EncColumnChunk const& ck = chunks[r][i]; + EncColumnChunk const& ck = chunks[r][i]; auto const& column_chunk_meta = row_group.columns[i].meta_data; // start transfer of the column index @@ -2392,7 +2388,4 @@ std::unique_ptr> writer::merge_row_group_metadata( return std::make_unique>(std::move(output)); } -} // namespace parquet -} // namespace detail -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp index 89ef85ba2bd..1d27a8400c8 100644 --- a/cpp/src/io/parquet/writer_impl.hpp +++ b/cpp/src/io/parquet/writer_impl.hpp @@ -38,15 +38,11 @@ #include #include -namespace cudf { -namespace io { -namespace detail { -namespace parquet { +namespace cudf::io::parquet::detail { + // Forward internal classes struct aggregate_writer_metadata; -using namespace cudf::io::parquet; -using namespace cudf::io; using cudf::detail::device_2dspan; using cudf::detail::host_2dspan; using cudf::detail::hostdevice_2dvector; @@ -66,7 +62,7 @@ class writer::impl { */ explicit impl(std::vector> sinks, parquet_writer_options const& options, - single_write_mode mode, + cudf::io::detail::single_write_mode mode, rmm::cuda_stream_view stream); /** @@ -79,7 +75,7 @@ class writer::impl { */ explicit impl(std::vector> sinks, chunked_parquet_writer_options const& options, - single_write_mode mode, + cudf::io::detail::single_write_mode mode, rmm::cuda_stream_view stream); /** @@ -139,8 +135,8 @@ class writer::impl { * @param[out] bounce_buffer Temporary host output buffer */ void write_parquet_data_to_sink(std::unique_ptr& updated_agg_meta, - device_span pages, - host_2dspan chunks, + device_span pages, + host_2dspan chunks, host_span global_rowgroup_base, host_span first_rg_in_part, host_span batch_list, @@ -164,9 +160,10 @@ class writer::impl { bool 
const _write_v2_headers; int32_t const _column_index_truncate_length; std::vector> const _kv_meta; // Optional user metadata. - single_write_mode const _single_write_mode; // Special parameter only used by `write()` to - // indicate that we are guaranteeing a single table - // write. This enables some internal optimizations. + cudf::io::detail::single_write_mode const + _single_write_mode; // Special parameter only used by `write()` to + // indicate that we are guaranteeing a single table + // write. This enables some internal optimizations. std::vector> const _out_sink; // Internal states, filled during `write()` and written to sink during `write` and `close()`. @@ -180,7 +177,4 @@ class writer::impl { bool _closed = false; // To track if the output has been written to sink. }; -} // namespace parquet -} // namespace detail -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/lists/combine/concatenate_list_elements.cu b/cpp/src/lists/combine/concatenate_list_elements.cu index fbe297765f8..99dbd55678b 100644 --- a/cpp/src/lists/combine/concatenate_list_elements.cu +++ b/cpp/src/lists/combine/concatenate_list_elements.cu @@ -271,10 +271,11 @@ std::unique_ptr concatenate_list_elements(column_view const& input, */ std::unique_ptr concatenate_list_elements(column_view const& input, concatenate_null_policy null_policy, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::concatenate_list_elements(input, null_policy, cudf::get_default_stream(), mr); + return detail::concatenate_list_elements(input, null_policy, stream, mr); } } // namespace lists diff --git a/cpp/src/lists/combine/concatenate_rows.cu b/cpp/src/lists/combine/concatenate_rows.cu index 658538b0195..49be7b5ff17 100644 --- a/cpp/src/lists/combine/concatenate_rows.cu +++ b/cpp/src/lists/combine/concatenate_rows.cu @@ -305,10 +305,11 @@ std::unique_ptr concatenate_rows(table_view const& input, */ std::unique_ptr concatenate_rows(table_view const& input, concatenate_null_policy null_policy, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::concatenate_rows(input, null_policy, cudf::get_default_stream(), mr); + return detail::concatenate_rows(input, null_policy, stream, mr); } } // namespace lists diff --git a/cpp/src/lists/contains.cu b/cpp/src/lists/contains.cu index df1d043bdb6..4733a5d63a8 100644 --- a/cpp/src/lists/contains.cu +++ b/cpp/src/lists/contains.cu @@ -287,7 +287,7 @@ std::unique_ptr index_of(lists_column_view const& lists, } auto search_key_col = cudf::make_column_from_scalar(search_key, lists.size(), stream, mr); - return index_of(lists, search_key_col->view(), find_option, stream, mr); + return detail::index_of(lists, search_key_col->view(), find_option, stream, mr); } std::unique_ptr index_of(lists_column_view const& lists, @@ -306,11 +306,11 @@ std::unique_ptr contains(lists_column_view const& lists, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto key_indices = index_of(lists, - search_key, - duplicate_find_option::FIND_FIRST, - stream, - rmm::mr::get_current_device_resource()); + auto key_indices = detail::index_of(lists, + search_key, + duplicate_find_option::FIND_FIRST, + stream, + rmm::mr::get_current_device_resource()); return to_contains(std::move(key_indices), stream, mr); } @@ -322,11 +322,11 @@ std::unique_ptr contains(lists_column_view const& lists, CUDF_EXPECTS(search_keys.size() == lists.size(), "Number of search keys 
must match list column size."); - auto key_indices = index_of(lists, - search_keys, - duplicate_find_option::FIND_FIRST, - stream, - rmm::mr::get_current_device_resource()); + auto key_indices = detail::index_of(lists, + search_keys, + duplicate_find_option::FIND_FIRST, + stream, + rmm::mr::get_current_device_resource()); return to_contains(std::move(key_indices), stream, mr); } @@ -364,43 +364,48 @@ std::unique_ptr contains_nulls(lists_column_view const& lists, std::unique_ptr contains(lists_column_view const& lists, cudf::scalar const& search_key, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::contains(lists, search_key, cudf::get_default_stream(), mr); + return detail::contains(lists, search_key, stream, mr); } std::unique_ptr contains(lists_column_view const& lists, column_view const& search_keys, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::contains(lists, search_keys, cudf::get_default_stream(), mr); + return detail::contains(lists, search_keys, stream, mr); } std::unique_ptr contains_nulls(lists_column_view const& lists, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::contains_nulls(lists, cudf::get_default_stream(), mr); + return detail::contains_nulls(lists, stream, mr); } std::unique_ptr index_of(lists_column_view const& lists, cudf::scalar const& search_key, duplicate_find_option find_option, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::index_of(lists, search_key, find_option, cudf::get_default_stream(), mr); + return detail::index_of(lists, search_key, find_option, stream, mr); } std::unique_ptr index_of(lists_column_view const& lists, column_view const& search_keys, duplicate_find_option find_option, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::index_of(lists, search_keys, find_option, cudf::get_default_stream(), mr); + return detail::index_of(lists, search_keys, find_option, stream, mr); } } // namespace cudf::lists diff --git a/cpp/src/lists/copying/concatenate.cu b/cpp/src/lists/copying/concatenate.cu index ddd0dfbe084..5407b88236f 100644 --- a/cpp/src/lists/copying/concatenate.cu +++ b/cpp/src/lists/copying/concatenate.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -123,8 +124,8 @@ std::unique_ptr concatenate(host_span columns, // if any of the input columns have nulls, construct the output mask bool const has_nulls = std::any_of(columns.begin(), columns.end(), [](auto const& col) { return col.has_nulls(); }); - rmm::device_buffer null_mask = create_null_mask( - total_list_count, has_nulls ? mask_state::UNINITIALIZED : mask_state::UNALLOCATED); + rmm::device_buffer null_mask = cudf::detail::create_null_mask( + total_list_count, has_nulls ? mask_state::UNINITIALIZED : mask_state::UNALLOCATED, stream, mr); auto null_mask_data = static_cast(null_mask.data()); auto const null_count = has_nulls ? 
cudf::detail::concatenate_masks(columns, null_mask_data, stream) : size_type{0}; diff --git a/cpp/src/lists/count_elements.cu b/cpp/src/lists/count_elements.cu index 40a14d805e1..2fd0851067a 100644 --- a/cpp/src/lists/count_elements.cu +++ b/cpp/src/lists/count_elements.cu @@ -73,10 +73,11 @@ std::unique_ptr count_elements(lists_column_view const& input, // external APIS std::unique_ptr count_elements(lists_column_view const& input, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::count_elements(input, cudf::get_default_stream(), mr); + return detail::count_elements(input, stream, mr); } } // namespace lists diff --git a/cpp/src/round/round.cu b/cpp/src/round/round.cu index 4b3f80fc6e2..41cce57d55b 100644 --- a/cpp/src/round/round.cu +++ b/cpp/src/round/round.cu @@ -271,7 +271,10 @@ std::unique_ptr round_with(column_view const& input, out_view.template end(), static_cast(0)); } else { - Type const n = std::pow(10, scale_movement); + Type n = 10; + for (int i = 1; i < scale_movement; ++i) { + n *= 10; + } thrust::transform(rmm::exec_policy(stream), input.begin(), input.end(), diff --git a/cpp/src/search/contains_column.cu b/cpp/src/search/contains_column.cu index 4363bd212fe..85971647434 100644 --- a/cpp/src/search/contains_column.cu +++ b/cpp/src/search/contains_column.cu @@ -14,23 +14,14 @@ * limitations under the License. */ -#include - -#include #include #include #include #include #include #include -#include #include -#include - -#include -#include -#include namespace cudf { namespace detail { @@ -38,61 +29,7 @@ namespace detail { namespace { struct contains_column_dispatch { - template - struct contains_fn { - bool __device__ operator()(size_type const idx) const - { - if (needles_have_nulls && needles.is_null_nocheck(idx)) { - // Exit early. The value doesn't matter, and will be masked as a null element. 
- return true; - } - - return haystack.contains(needles.template element(idx)); - } - - Haystack const haystack; - column_device_view const needles; - bool const needles_have_nulls; - }; - - template ())> - std::unique_ptr operator()(column_view const& haystack, - column_view const& needles, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const - { - auto result = make_numeric_column(data_type{type_to_id()}, - needles.size(), - copy_bitmask(needles, stream, mr), - needles.null_count(), - stream, - mr); - if (needles.is_empty()) { return result; } - - auto const out_begin = result->mutable_view().template begin(); - if (haystack.is_empty()) { - thrust::uninitialized_fill( - rmm::exec_policy(stream), out_begin, out_begin + needles.size(), false); - return result; - } - - auto const haystack_set = cudf::detail::unordered_multiset::create(haystack, stream); - auto const haystack_set_dv = haystack_set.to_device(); - auto const needles_cdv_ptr = column_device_view::create(needles, stream); - - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(needles.size()), - out_begin, - contains_fn{ - haystack_set_dv, *needles_cdv_ptr, needles.has_nulls()}); - - result->set_null_count(needles.null_count()); - - return result; - } - - template ())> + template std::unique_ptr operator()(column_view const& haystack, column_view const& needles, rmm::cuda_stream_view stream, @@ -144,8 +81,6 @@ std::unique_ptr contains(column_view const& haystack, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - CUDF_EXPECTS(haystack.type() == needles.type(), "DTYPE mismatch"); - return cudf::type_dispatcher( haystack.type(), contains_column_dispatch{}, haystack, needles, stream, mr); } diff --git a/cpp/src/strings/replace/replace.cu b/cpp/src/strings/replace/replace.cu index a622d1a742d..acc1502f4d6 100644 --- a/cpp/src/strings/replace/replace.cu +++ b/cpp/src/strings/replace/replace.cu @@ -97,7 +97,7 @@ struct replace_row_parallel_fn { } else { bytes += d_repl.size_bytes() - d_target.size_bytes(); } - position = d_str.find(d_target, position + d_target.size_bytes()); + position = d_str.find(d_target, position + d_target.length()); --max_n; } if (out_ptr) // copy whats left (or right depending on your point of view) diff --git a/cpp/src/unary/cast_ops.cu b/cpp/src/unary/cast_ops.cu index f40ace8d10b..1c81f266200 100644 --- a/cpp/src/unary/cast_ops.cu +++ b/cpp/src/unary/cast_ops.cu @@ -199,7 +199,13 @@ std::unique_ptr rescale(column_view input, } return output_column; } - auto const scalar = make_fixed_point_scalar(std::pow(10, -diff), scale_type{diff}, stream); + + RepType scalar_value = 10; + for (int i = 1; i < -diff; ++i) { + scalar_value *= 10; + } + + auto const scalar = make_fixed_point_scalar(scalar_value, scale_type{diff}, stream); return detail::binary_operation(input, *scalar, binary_operator::DIV, type, stream, mr); } }; diff --git a/cpp/src/unary/math_ops.cu b/cpp/src/unary/math_ops.cu index 961f3a9e720..d0cae81a9c8 100644 --- a/cpp/src/unary/math_ops.cu +++ b/cpp/src/unary/math_ops.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
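Both this file and cast_ops.cu above make the same change: the power of ten is built by repeated integer multiplication instead of std::pow. std::pow computes through double, whose 53-bit mantissa cannot hold the large powers of ten that decimal128 scales require, so casting the rounded result back to a wide integer type produces a value that no longer equals the exact power. A minimal standalone sketch of the failure mode, assuming a compiler with __int128 support; this is an illustration, not code from the PR:

#include <cmath>
#include <iostream>

int main()
{
  // Exact 10^23 by repeated integer multiplication, mirroring the loops added here.
  __int128 exact = 1;
  for (int i = 0; i < 23; ++i) {
    exact *= 10;
  }

  // 10^23 needs 77 significant bits, so the double returned by std::pow is
  // already rounded before it is ever cast back to an integer type.
  auto const via_pow = static_cast<__int128>(std::pow(10.0, 23));

  std::cout << (exact == via_pow ? "equal" : "differs") << '\n';  // prints "differs"
}

Note that the added loops start from n = 10 and iterate while i < exponent, so they compute 10^exponent only for exponents of at least one, which appears to be the invariant on these code paths.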
@@ -295,7 +295,11 @@ std::unique_ptr unary_op_with(column_view const& input, input.type(), input.size(), copy_bitmask(input, stream, mr), input.null_count(), stream, mr); auto out_view = result->mutable_view(); - Type const n = std::pow(10, -input.type().scale()); + + Type n = 10; + for (int i = 1; i < -input.type().scale(); ++i) { + n *= 10; + } thrust::transform(rmm::exec_policy(stream), input.begin(), diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 04939f3cd6d..ffaba7d6fa7 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -622,6 +622,7 @@ ConfigureTest( STREAM_IDENTIFICATION_TEST identify_stream_usage/test_default_stream_identification.cu ) +ConfigureTest(STREAM_BINARYOP_TEST streams/binaryop_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_CONCATENATE_TEST streams/concatenate_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_COPYING_TEST streams/copying_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_FILLING_TEST streams/filling_test.cpp STREAM_MODE testing) @@ -637,6 +638,7 @@ ConfigureTest( ) ConfigureTest(STREAM_SORTING_TEST streams/sorting_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_TEXT_TEST streams/text/ngrams_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_LISTS_TEST streams/lists_test.cpp STREAM_MODE testing) # ################################################################################################## # Install tests #################################################################################### diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 81e0e12eeb9..3e5d7033e60 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -200,29 +200,30 @@ std::unique_ptr make_parquet_list_list_col( // of the file to populate the FileMetaData pointed to by file_meta_data. // throws cudf::logic_error if the file or metadata is invalid. 
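// For orientation, the trailer layout read_footer relies on: a Parquet file
// ends with the thrift-compact-encoded FileMetaData blob, then a 4-byte
// little-endian footer length, then the 4-byte magic "PAR1". Sketch of the
// trailer (file_ender_s in the cudf sources has this shape; shown here only
// as a reading aid):
//
//   struct file_ender_s {
//     uint32_t footer_len;  // byte length of the encoded FileMetaData
//     uint32_t magic;       // must be parquet_magic, i.e. the bytes "PAR1"
//   };
//
// so the metadata blob starts at offset len - ender_len - footer_len, which is
// exactly the host_read performed below.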
void read_footer(std::unique_ptr const& source, - cudf::io::parquet::FileMetaData* file_meta_data) + cudf::io::parquet::detail::FileMetaData* file_meta_data) { - constexpr auto header_len = sizeof(cudf::io::parquet::file_header_s); - constexpr auto ender_len = sizeof(cudf::io::parquet::file_ender_s); + constexpr auto header_len = sizeof(cudf::io::parquet::detail::file_header_s); + constexpr auto ender_len = sizeof(cudf::io::parquet::detail::file_ender_s); auto const len = source->size(); auto const header_buffer = source->host_read(0, header_len); auto const header = - reinterpret_cast(header_buffer->data()); + reinterpret_cast(header_buffer->data()); auto const ender_buffer = source->host_read(len - ender_len, ender_len); - auto const ender = reinterpret_cast(ender_buffer->data()); + auto const ender = + reinterpret_cast(ender_buffer->data()); // checks for valid header, footer, and file length ASSERT_GT(len, header_len + ender_len); - ASSERT_TRUE(header->magic == cudf::io::parquet::parquet_magic && - ender->magic == cudf::io::parquet::parquet_magic); + ASSERT_TRUE(header->magic == cudf::io::parquet::detail::parquet_magic && + ender->magic == cudf::io::parquet::detail::parquet_magic); ASSERT_TRUE(ender->footer_len != 0 && ender->footer_len <= (len - header_len - ender_len)); // parquet files end with 4-byte footer_length and 4-byte magic == "PAR1" // seek backwards from the end of the file (footer_length + 8 bytes of ender) auto const footer_buffer = source->host_read(len - ender->footer_len - ender_len, ender->footer_len); - cudf::io::parquet::CompactProtocolReader cp(footer_buffer->data(), ender->footer_len); + cudf::io::parquet::detail::CompactProtocolReader cp(footer_buffer->data(), ender->footer_len); // returns true on success bool res = cp.read(file_meta_data); @@ -233,14 +234,14 @@ void read_footer(std::unique_ptr const& source, // this assumes the data is uncompressed. // throws cudf::logic_error if the page_loc data is invalid. int read_dict_bits(std::unique_ptr const& source, - cudf::io::parquet::PageLocation const& page_loc) + cudf::io::parquet::detail::PageLocation const& page_loc) { CUDF_EXPECTS(page_loc.offset > 0, "Cannot find page header"); CUDF_EXPECTS(page_loc.compressed_page_size > 0, "Invalid page header length"); - cudf::io::parquet::PageHeader page_hdr; + cudf::io::parquet::detail::PageHeader page_hdr; auto const page_buf = source->host_read(page_loc.offset, page_loc.compressed_page_size); - cudf::io::parquet::CompactProtocolReader cp(page_buf->data(), page_buf->size()); + cudf::io::parquet::detail::CompactProtocolReader cp(page_buf->data(), page_buf->size()); bool res = cp.read(&page_hdr); CUDF_EXPECTS(res, "Cannot parse page header"); @@ -252,15 +253,16 @@ int read_dict_bits(std::unique_ptr const& source, // read column index from datasource at location indicated by chunk, // parse and return as a ColumnIndex struct. // throws cudf::logic_error if the chunk data is invalid. 
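// A ColumnIndex, per the Parquet spec, stores page-level statistics for one
// column chunk: a null_pages flag per page, truncated min_values/max_values as
// raw bytes, an optional per-page null_counts list, and a boundary_order that
// says whether those min/max values ascend, descend, or are unordered across
// pages. The assertions in the tests further down exercise each of these fields.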
-cudf::io::parquet::ColumnIndex read_column_index( - std::unique_ptr const& source, cudf::io::parquet::ColumnChunk const& chunk) +cudf::io::parquet::detail::ColumnIndex read_column_index( + std::unique_ptr const& source, + cudf::io::parquet::detail::ColumnChunk const& chunk) { CUDF_EXPECTS(chunk.column_index_offset > 0, "Cannot find column index"); CUDF_EXPECTS(chunk.column_index_length > 0, "Invalid column index length"); - cudf::io::parquet::ColumnIndex colidx; + cudf::io::parquet::detail::ColumnIndex colidx; auto const ci_buf = source->host_read(chunk.column_index_offset, chunk.column_index_length); - cudf::io::parquet::CompactProtocolReader cp(ci_buf->data(), ci_buf->size()); + cudf::io::parquet::detail::CompactProtocolReader cp(ci_buf->data(), ci_buf->size()); bool res = cp.read(&colidx); CUDF_EXPECTS(res, "Cannot parse column index"); return colidx; @@ -269,22 +271,24 @@ cudf::io::parquet::ColumnIndex read_column_index( // read offset index from datasource at location indicated by chunk, // parse and return as an OffsetIndex struct. // throws cudf::logic_error if the chunk data is invalid. -cudf::io::parquet::OffsetIndex read_offset_index( - std::unique_ptr const& source, cudf::io::parquet::ColumnChunk const& chunk) +cudf::io::parquet::detail::OffsetIndex read_offset_index( + std::unique_ptr const& source, + cudf::io::parquet::detail::ColumnChunk const& chunk) { CUDF_EXPECTS(chunk.offset_index_offset > 0, "Cannot find offset index"); CUDF_EXPECTS(chunk.offset_index_length > 0, "Invalid offset index length"); - cudf::io::parquet::OffsetIndex offidx; + cudf::io::parquet::detail::OffsetIndex offidx; auto const oi_buf = source->host_read(chunk.offset_index_offset, chunk.offset_index_length); - cudf::io::parquet::CompactProtocolReader cp(oi_buf->data(), oi_buf->size()); + cudf::io::parquet::detail::CompactProtocolReader cp(oi_buf->data(), oi_buf->size()); bool res = cp.read(&offidx); CUDF_EXPECTS(res, "Cannot parse offset index"); return offidx; } // Return as a Statistics from the column chunk -cudf::io::parquet::Statistics const& get_statistics(cudf::io::parquet::ColumnChunk const& chunk) +cudf::io::parquet::detail::Statistics const& get_statistics( + cudf::io::parquet::detail::ColumnChunk const& chunk) { return chunk.meta_data.statistics; } @@ -292,15 +296,16 @@ cudf::io::parquet::Statistics const& get_statistics(cudf::io::parquet::ColumnChu // read page header from datasource at location indicated by page_loc, // parse and return as a PageHeader struct. // throws cudf::logic_error if the page_loc data is invalid. 
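The CompactProtocolReader used by these helpers decodes thrift's compact protocol, in which every integer field is a ULEB128 varint (signed values are zigzag-mapped first). A hedged sketch of the core decode loop, with illustrative names; the real implementation lives in compact_protocol_reader.cpp:

#include <cstddef>
#include <cstdint>

// Decode one unsigned LEB128 varint: 7 payload bits per byte, a set MSB means
// another byte follows.
inline uint32_t decode_uleb128(uint8_t const* buf, std::size_t& pos)
{
  uint32_t value = 0;
  int shift      = 0;
  uint8_t byte   = 0;
  do {
    byte = buf[pos++];
    value |= static_cast<uint32_t>(byte & 0x7f) << shift;
    shift += 7;
  } while (byte & 0x80);
  return value;
}

// Signed fields arrive zigzag-mapped, n -> (n << 1) ^ (n >> 31), so small
// magnitudes of either sign encode in few bytes.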
-cudf::io::parquet::PageHeader read_page_header(std::unique_ptr const& source, - cudf::io::parquet::PageLocation const& page_loc) +cudf::io::parquet::detail::PageHeader read_page_header( + std::unique_ptr const& source, + cudf::io::parquet::detail::PageLocation const& page_loc) { CUDF_EXPECTS(page_loc.offset > 0, "Cannot find page header"); CUDF_EXPECTS(page_loc.compressed_page_size > 0, "Invalid page header length"); - cudf::io::parquet::PageHeader page_hdr; + cudf::io::parquet::detail::PageHeader page_hdr; auto const page_buf = source->host_read(page_loc.offset, page_loc.compressed_page_size); - cudf::io::parquet::CompactProtocolReader cp(page_buf->data(), page_buf->size()); + cudf::io::parquet::detail::CompactProtocolReader cp(page_buf->data(), page_buf->size()); bool res = cp.read(&page_hdr); CUDF_EXPECTS(res, "Cannot parse page header"); return page_hdr; @@ -3686,7 +3691,7 @@ TEST_F(ParquetWriterTest, CheckPageRows) // check first page header and make sure it has only page_rows values auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); ASSERT_GT(fmd.row_groups.size(), 0); @@ -3697,7 +3702,7 @@ TEST_F(ParquetWriterTest, CheckPageRows) // read first data page header. sizeof(PageHeader) is not exact, but the thrift encoded // version should be smaller than size of the struct. auto const ph = read_page_header( - source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::PageHeader), 0}); + source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::detail::PageHeader), 0}); EXPECT_EQ(ph.data_page_header.num_values, page_rows); } @@ -3722,7 +3727,7 @@ TEST_F(ParquetWriterTest, CheckPageRowsAdjusted) // check first page header and make sure it has only page_rows values auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); ASSERT_GT(fmd.row_groups.size(), 0); @@ -3733,7 +3738,7 @@ TEST_F(ParquetWriterTest, CheckPageRowsAdjusted) // read first data page header. sizeof(PageHeader) is not exact, but the thrift encoded // version should be smaller than size of the struct. auto const ph = read_page_header( - source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::PageHeader), 0}); + source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::detail::PageHeader), 0}); EXPECT_LE(ph.data_page_header.num_values, rows_per_page); } @@ -3759,7 +3764,7 @@ TEST_F(ParquetWriterTest, CheckPageRowsTooSmall) // check that file is written correctly when rows/page < fragment size auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); ASSERT_TRUE(fmd.row_groups.size() > 0); @@ -3770,7 +3775,7 @@ TEST_F(ParquetWriterTest, CheckPageRowsTooSmall) // read first data page header. sizeof(PageHeader) is not exact, but the thrift encoded // version should be smaller than size of the struct. 
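  // (For scale: a num_values of 5'000 zigzag-encodes to 10'000, which fits in a
  // two-byte varint versus the four bytes the int32 member occupies in memory;
  // most header fields shrink this way, which is the basis of the bound above.)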
auto const ph = read_page_header( - source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::PageHeader), 0}); + source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::detail::PageHeader), 0}); // there should be only one page since the fragment size is larger than rows_per_page EXPECT_EQ(ph.data_page_header.num_values, num_rows); @@ -3798,7 +3803,7 @@ TEST_F(ParquetWriterTest, Decimal128Stats) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4031,7 +4036,7 @@ TYPED_TEST(ParquetWriterComparableTypeTest, ThreeColumnSorted) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); ASSERT_GT(fmd.row_groups.size(), 0); @@ -4041,10 +4046,10 @@ TYPED_TEST(ParquetWriterComparableTypeTest, ThreeColumnSorted) // now check that the boundary order for chunk 1 is ascending, // chunk 2 is descending, and chunk 3 is unordered - cudf::io::parquet::BoundaryOrder expected_orders[] = { - cudf::io::parquet::BoundaryOrder::ASCENDING, - cudf::io::parquet::BoundaryOrder::DESCENDING, - cudf::io::parquet::BoundaryOrder::UNORDERED}; + cudf::io::parquet::detail::BoundaryOrder expected_orders[] = { + cudf::io::parquet::detail::BoundaryOrder::ASCENDING, + cudf::io::parquet::detail::BoundaryOrder::DESCENDING, + cudf::io::parquet::detail::BoundaryOrder::UNORDERED}; for (std::size_t i = 0; i < columns.size(); i++) { auto const ci = read_column_index(source, columns[i]); @@ -4067,15 +4072,15 @@ int32_t compare(T& v1, T& v2) // 1 if v1 > v2. int32_t compare_binary(std::vector const& v1, std::vector const& v2, - cudf::io::parquet::Type ptype, - cudf::io::parquet::ConvertedType ctype) + cudf::io::parquet::detail::Type ptype, + cudf::io::parquet::detail::ConvertedType ctype) { switch (ptype) { - case cudf::io::parquet::INT32: + case cudf::io::parquet::detail::INT32: switch (ctype) { - case cudf::io::parquet::UINT_8: - case cudf::io::parquet::UINT_16: - case cudf::io::parquet::UINT_32: + case cudf::io::parquet::detail::UINT_8: + case cudf::io::parquet::detail::UINT_16: + case cudf::io::parquet::detail::UINT_32: return compare(*(reinterpret_cast(v1.data())), *(reinterpret_cast(v2.data()))); default: @@ -4083,23 +4088,23 @@ int32_t compare_binary(std::vector const& v1, *(reinterpret_cast(v2.data()))); } - case cudf::io::parquet::INT64: - if (ctype == cudf::io::parquet::UINT_64) { + case cudf::io::parquet::detail::INT64: + if (ctype == cudf::io::parquet::detail::UINT_64) { return compare(*(reinterpret_cast(v1.data())), *(reinterpret_cast(v2.data()))); } return compare(*(reinterpret_cast(v1.data())), *(reinterpret_cast(v2.data()))); - case cudf::io::parquet::FLOAT: + case cudf::io::parquet::detail::FLOAT: return compare(*(reinterpret_cast(v1.data())), *(reinterpret_cast(v2.data()))); - case cudf::io::parquet::DOUBLE: + case cudf::io::parquet::detail::DOUBLE: return compare(*(reinterpret_cast(v1.data())), *(reinterpret_cast(v2.data()))); - case cudf::io::parquet::BYTE_ARRAY: { + case cudf::io::parquet::detail::BYTE_ARRAY: { int32_t v1sz = v1.size(); int32_t v2sz = v2.size(); int32_t ret = memcmp(v1.data(), v2.data(), std::min(v1sz, v2sz)); @@ -4142,7 +4147,7 @@ TEST_P(ParquetV2Test, LargeColumnIndex) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - 
cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4164,10 +4169,10 @@ TEST_P(ParquetV2Test, LargeColumnIndex) TEST_P(ParquetV2Test, CheckColumnOffsetIndex) { - constexpr auto num_rows = 100000; - auto const is_v2 = GetParam(); - auto const expected_hdr_type = - is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + constexpr auto num_rows = 100000; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; // fixed length strings auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { @@ -4210,7 +4215,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndex) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4255,10 +4260,10 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndex) TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls) { - constexpr auto num_rows = 100000; - auto const is_v2 = GetParam(); - auto const expected_hdr_type = - is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + constexpr auto num_rows = 100000; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; // fixed length strings auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { @@ -4311,7 +4316,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4362,10 +4367,10 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls) TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn) { - constexpr auto num_rows = 100000; - auto const is_v2 = GetParam(); - auto const expected_hdr_type = - is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + constexpr auto num_rows = 100000; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; // fixed length strings auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { @@ -4403,7 +4408,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4458,9 +4463,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn) TEST_P(ParquetV2Test, CheckColumnOffsetIndexStruct) { - auto const is_v2 = GetParam(); - auto const expected_hdr_type = - is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? 
cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; auto c0 = testdata::ascending(); @@ -4495,7 +4500,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStruct) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4542,9 +4547,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStruct) TEST_P(ParquetV2Test, CheckColumnOffsetIndexStructNulls) { - auto const is_v2 = GetParam(); - auto const expected_hdr_type = - is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; auto validity2 = cudf::detail::make_counting_transform_iterator(0, [](cudf::size_type i) { return i % 2; }); @@ -4586,7 +4591,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStructNulls) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4616,9 +4621,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStructNulls) TEST_P(ParquetV2Test, CheckColumnIndexListWithNulls) { - auto const is_v2 = GetParam(); - auto const expected_hdr_type = - is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; using cudf::test::iterators::null_at; using cudf::test::iterators::nulls_at; @@ -4711,7 +4716,7 @@ TEST_P(ParquetV2Test, CheckColumnIndexListWithNulls) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4812,7 +4817,7 @@ TEST_F(ParquetWriterTest, CheckColumnIndexTruncation) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4870,7 +4875,7 @@ TEST_F(ParquetWriterTest, BinaryColumnIndexTruncation) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -5030,10 +5035,10 @@ TEST_F(ParquetReaderTest, NestedByteArray) cudf::io::write_parquet(out_opts); auto source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); - EXPECT_EQ(fmd.schema[5].type, cudf::io::parquet::Type::BYTE_ARRAY); + EXPECT_EQ(fmd.schema[5].type, cudf::io::parquet::detail::Type::BYTE_ARRAY); std::vector md{ {}, @@ -5081,12 +5086,12 @@ TEST_F(ParquetWriterTest, ByteArrayStats) auto result = cudf::io::read_parquet(in_opts); auto source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); - EXPECT_EQ(fmd.schema[1].type, cudf::io::parquet::Type::BYTE_ARRAY); - EXPECT_EQ(fmd.schema[2].type, cudf::io::parquet::Type::BYTE_ARRAY); + 
EXPECT_EQ(fmd.schema[1].type, cudf::io::parquet::detail::Type::BYTE_ARRAY); + EXPECT_EQ(fmd.schema[2].type, cudf::io::parquet::detail::Type::BYTE_ARRAY); auto const stats0 = get_statistics(fmd.row_groups[0].columns[0]); auto const stats1 = get_statistics(fmd.row_groups[0].columns[1]); @@ -5137,9 +5142,9 @@ TEST_F(ParquetReaderTest, StructByteArray) TEST_F(ParquetReaderTest, NestingOptimizationTest) { - // test nesting levels > cudf::io::parquet::gpu::max_cacheable_nesting_decode_info deep. + // test nesting levels > cudf::io::parquet::detail::max_cacheable_nesting_decode_info deep. constexpr cudf::size_type num_nesting_levels = 16; - static_assert(num_nesting_levels > cudf::io::parquet::gpu::max_cacheable_nesting_decode_info); + static_assert(num_nesting_levels > cudf::io::parquet::detail::max_cacheable_nesting_decode_info); constexpr cudf::size_type rows_per_level = 2; constexpr cudf::size_type num_values = (1 << num_nesting_levels) * rows_per_level; @@ -5206,13 +5211,13 @@ TEST_F(ParquetWriterTest, SingleValueDictionaryTest) // make sure dictionary was used auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); auto used_dict = [&fmd]() { for (auto enc : fmd.row_groups[0].columns[0].meta_data.encodings) { - if (enc == cudf::io::parquet::Encoding::PLAIN_DICTIONARY or - enc == cudf::io::parquet::Encoding::RLE_DICTIONARY) { + if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or + enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) { return true; } } @@ -5252,13 +5257,13 @@ TEST_F(ParquetWriterTest, DictionaryNeverTest) // make sure dictionary was not used auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); auto used_dict = [&fmd]() { for (auto enc : fmd.row_groups[0].columns[0].meta_data.encodings) { - if (enc == cudf::io::parquet::Encoding::PLAIN_DICTIONARY or - enc == cudf::io::parquet::Encoding::RLE_DICTIONARY) { + if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or + enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) { return true; } } @@ -5303,13 +5308,13 @@ TEST_F(ParquetWriterTest, DictionaryAdaptiveTest) // make sure dictionary was used as expected. col0 should use one, // col1 should not. 
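  // (Background for the encodings check: a writer advertises dictionary use per
  // column chunk through its encodings list, as PLAIN_DICTIONARY in Parquet v1
  // files and RLE_DICTIONARY in v2, so the used_dict helper accepts either.)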
auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); auto used_dict = [&fmd](int col) { for (auto enc : fmd.row_groups[0].columns[col].meta_data.encodings) { - if (enc == cudf::io::parquet::Encoding::PLAIN_DICTIONARY or - enc == cudf::io::parquet::Encoding::RLE_DICTIONARY) { + if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or + enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) { return true; } } @@ -5354,13 +5359,13 @@ TEST_F(ParquetWriterTest, DictionaryAlwaysTest) // make sure dictionary was used for both columns auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); auto used_dict = [&fmd](int col) { for (auto enc : fmd.row_groups[0].columns[col].meta_data.encodings) { - if (enc == cudf::io::parquet::Encoding::PLAIN_DICTIONARY or - enc == cudf::io::parquet::Encoding::RLE_DICTIONARY) { + if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or + enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) { return true; } } @@ -5438,13 +5443,13 @@ TEST_P(ParquetSizedTest, DictionaryTest) // make sure dictionary was used auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); auto used_dict = [&fmd]() { for (auto enc : fmd.row_groups[0].columns[0].meta_data.encodings) { - if (enc == cudf::io::parquet::Encoding::PLAIN_DICTIONARY or - enc == cudf::io::parquet::Encoding::RLE_DICTIONARY) { + if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or + enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) { return true; } } @@ -6664,7 +6669,7 @@ TEST_F(ParquetWriterTest, PreserveNullability) TEST_P(ParquetV2Test, CheckEncodings) { - using cudf::io::parquet::Encoding; + using cudf::io::parquet::detail::Encoding; constexpr auto num_rows = 100'000; auto const is_v2 = GetParam(); @@ -6697,7 +6702,7 @@ TEST_P(ParquetV2Test, CheckEncodings) }; auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); auto const& chunk0_enc = fmd.row_groups[0].columns[0].meta_data.encodings; @@ -6732,4 +6737,82 @@ TEST_P(ParquetV2Test, CheckEncodings) } } +TEST_F(ParquetReaderTest, RepeatedNoAnnotations) +{ + constexpr unsigned char repeated_bytes[] = { + 0x50, 0x41, 0x52, 0x31, 0x15, 0x04, 0x15, 0x30, 0x15, 0x30, 0x4c, 0x15, 0x0c, 0x15, 0x00, 0x12, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x15, 0x00, 0x15, 0x0a, 0x15, 0x0a, + 0x2c, 0x15, 0x0c, 0x15, 0x10, 0x15, 0x06, 0x15, 0x06, 0x00, 0x00, 0x03, 0x03, 0x88, 0xc6, 0x02, + 0x26, 0x80, 0x01, 0x1c, 0x15, 0x02, 0x19, 0x25, 0x00, 0x10, 0x19, 0x18, 0x02, 0x69, 0x64, 0x15, + 0x00, 0x16, 0x0c, 0x16, 0x78, 0x16, 0x78, 0x26, 0x54, 0x26, 0x08, 0x00, 0x00, 0x15, 0x04, 0x15, + 0x40, 0x15, 0x40, 0x4c, 0x15, 0x08, 0x15, 0x00, 0x12, 0x00, 0x00, 0xe3, 0x0c, 0x23, 0x4b, 0x01, + 0x00, 0x00, 0x00, 0xc7, 0x35, 0x3a, 0x42, 0x00, 0x00, 0x00, 0x00, 0x8e, 0x6b, 0x74, 0x84, 0x00, + 0x00, 0x00, 0x00, 0x55, 0xa1, 0xae, 0xc6, 0x00, 0x00, 0x00, 0x00, 0x15, 0x00, 0x15, 0x22, 0x15, + 0x22, 0x2c, 0x15, 0x10, 0x15, 0x10, 0x15, 0x06, 0x15, 0x06, 0x00, 0x00, 0x02, 0x00, 
+    0x03, 0xc0, 0x03, 0x00, 0x00, 0x00, 0x03, 0x90, 0xaa, 0x02, 0x03, 0x94, 0x03, 0x26, 0xda, 0x02,
+    0x1c, 0x15, 0x04, 0x19, 0x25, 0x00, 0x10, 0x19, 0x38, 0x0c, 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x4e,
+    0x75, 0x6d, 0x62, 0x65, 0x72, 0x73, 0x05, 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x06, 0x6e, 0x75, 0x6d,
+    0x62, 0x65, 0x72, 0x15, 0x00, 0x16, 0x10, 0x16, 0xa0, 0x01, 0x16, 0xa0, 0x01, 0x26, 0x96, 0x02,
+    0x26, 0xba, 0x01, 0x00, 0x00, 0x15, 0x04, 0x15, 0x24, 0x15, 0x24, 0x4c, 0x15, 0x04, 0x15, 0x00,
+    0x12, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x68, 0x6f, 0x6d, 0x65, 0x06, 0x00, 0x00, 0x00, 0x6d,
+    0x6f, 0x62, 0x69, 0x6c, 0x65, 0x15, 0x00, 0x15, 0x20, 0x15, 0x20, 0x2c, 0x15, 0x10, 0x15, 0x10,
+    0x15, 0x06, 0x15, 0x06, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0xc0, 0x03, 0x00, 0x00, 0x00,
+    0x03, 0x90, 0xef, 0x01, 0x03, 0x04, 0x26, 0xcc, 0x04, 0x1c, 0x15, 0x0c, 0x19, 0x25, 0x00, 0x10,
+    0x19, 0x38, 0x0c, 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x4e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x73, 0x05,
+    0x70, 0x68, 0x6f, 0x6e, 0x65, 0x04, 0x6b, 0x69, 0x6e, 0x64, 0x15, 0x00, 0x16, 0x10, 0x16, 0x82,
+    0x01, 0x16, 0x82, 0x01, 0x26, 0x8a, 0x04, 0x26, 0xca, 0x03, 0x00, 0x00, 0x15, 0x02, 0x19, 0x6c,
+    0x48, 0x04, 0x75, 0x73, 0x65, 0x72, 0x15, 0x04, 0x00, 0x15, 0x02, 0x25, 0x00, 0x18, 0x02, 0x69,
+    0x64, 0x00, 0x35, 0x02, 0x18, 0x0c, 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x4e, 0x75, 0x6d, 0x62, 0x65,
+    0x72, 0x73, 0x15, 0x02, 0x00, 0x35, 0x04, 0x18, 0x05, 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x15, 0x04,
+    0x00, 0x15, 0x04, 0x25, 0x00, 0x18, 0x06, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x00, 0x15, 0x0c,
+    0x25, 0x02, 0x18, 0x04, 0x6b, 0x69, 0x6e, 0x64, 0x25, 0x00, 0x00, 0x16, 0x00, 0x19, 0x1c, 0x19,
+    0x3c, 0x26, 0x80, 0x01, 0x1c, 0x15, 0x02, 0x19, 0x25, 0x00, 0x10, 0x19, 0x18, 0x02, 0x69, 0x64,
+    0x15, 0x00, 0x16, 0x0c, 0x16, 0x78, 0x16, 0x78, 0x26, 0x54, 0x26, 0x08, 0x00, 0x00, 0x26, 0xda,
+    0x02, 0x1c, 0x15, 0x04, 0x19, 0x25, 0x00, 0x10, 0x19, 0x38, 0x0c, 0x70, 0x68, 0x6f, 0x6e, 0x65,
+    0x4e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x73, 0x05, 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x06, 0x6e, 0x75,
+    0x6d, 0x62, 0x65, 0x72, 0x15, 0x00, 0x16, 0x10, 0x16, 0xa0, 0x01, 0x16, 0xa0, 0x01, 0x26, 0x96,
+    0x02, 0x26, 0xba, 0x01, 0x00, 0x00, 0x26, 0xcc, 0x04, 0x1c, 0x15, 0x0c, 0x19, 0x25, 0x00, 0x10,
+    0x19, 0x38, 0x0c, 0x70, 0x68, 0x6f, 0x6e, 0x65, 0x4e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x73, 0x05,
+    0x70, 0x68, 0x6f, 0x6e, 0x65, 0x04, 0x6b, 0x69, 0x6e, 0x64, 0x15, 0x00, 0x16, 0x10, 0x16, 0x82,
+    0x01, 0x16, 0x82, 0x01, 0x26, 0x8a, 0x04, 0x26, 0xca, 0x03, 0x00, 0x00, 0x16, 0x9a, 0x03, 0x16,
+    0x0c, 0x00, 0x28, 0x49, 0x70, 0x61, 0x72, 0x71, 0x75, 0x65, 0x74, 0x2d, 0x72, 0x73, 0x20, 0x76,
+    0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x20, 0x30, 0x2e, 0x33, 0x2e, 0x30, 0x20, 0x28, 0x62, 0x75,
+    0x69, 0x6c, 0x64, 0x20, 0x62, 0x34, 0x35, 0x63, 0x65, 0x37, 0x63, 0x62, 0x61, 0x32, 0x31, 0x39,
+    0x39, 0x66, 0x32, 0x32, 0x64, 0x39, 0x33, 0x32, 0x36, 0x39, 0x63, 0x31, 0x35, 0x30, 0x64, 0x38,
+    0x61, 0x38, 0x33, 0x39, 0x31, 0x36, 0x63, 0x36, 0x39, 0x62, 0x35, 0x65, 0x29, 0x00, 0x32, 0x01,
+    0x00, 0x00, 0x50, 0x41, 0x52, 0x31};
+
+  auto read_opts = cudf::io::parquet_reader_options::builder(
+    cudf::io::source_info{reinterpret_cast<char const*>(repeated_bytes), sizeof(repeated_bytes)});
+  auto result = cudf::io::read_parquet(read_opts);
+
+  EXPECT_EQ(result.tbl->view().column(0).size(), 6);
+  EXPECT_EQ(result.tbl->view().num_columns(), 2);
+
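+  // Per the footer metadata (created by parquet-rs 0.3.0), the schema is
+  //   user { id, phoneNumbers { repeated phone { number, kind } } }
+  // with no LIST annotations, so the reader must infer the list nesting from
+  // the repeated groups. The expected table below is built to match: an id
+  // column plus a struct column wrapping a list of (number, kind) structs.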
+  column_wrapper<int32_t> col0{1, 2, 3, 4, 5, 6};
+  column_wrapper<int64_t> child0{{5555555555l, 1111111111l, 1111111111l, 2222222222l, 3333333333l}};
+  cudf::test::strings_column_wrapper child1{{"-", "home", "home", "-", "mobile"}, {0, 1, 1, 0, 1}};
+  auto struct_col = cudf::test::structs_column_wrapper{{child0, child1}};
+
+  auto list_offsets_column =
+    cudf::test::fixed_width_column_wrapper<cudf::size_type>{0, 0, 0, 0, 1, 2, 5}.release();
+  auto num_list_rows = list_offsets_column->size() - 1;
+
+  auto mask = cudf::create_null_mask(6, cudf::mask_state::ALL_VALID);
+  cudf::set_null_mask(static_cast<cudf::bitmask_type*>(mask.data()), 0, 2, false);
+
+  auto list_col = cudf::make_lists_column(
+    num_list_rows, std::move(list_offsets_column), struct_col.release(), 2, std::move(mask));
+
+  std::vector<std::unique_ptr<cudf::column>> struct_children;
+  struct_children.push_back(std::move(list_col));
+
+  auto outer_struct =
+    cudf::test::structs_column_wrapper{{std::move(struct_children)}, {0, 0, 1, 1, 1, 1}};
+  table_view expected{{col0, outer_struct}};
+
+  CUDF_TEST_EXPECT_TABLES_EQUAL(result.tbl->view(), expected);
+}
+
 CUDF_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/round/round_tests.cpp b/cpp/tests/round/round_tests.cpp
index d802c0c2706..f97bb7a5323 100644
--- a/cpp/tests/round/round_tests.cpp
+++ b/cpp/tests/round/round_tests.cpp
@@ -703,4 +703,83 @@ TEST_F(RoundTests, BoolTestHalfUp)
   EXPECT_THROW(cudf::round(input, -2, cudf::rounding_method::HALF_UP), cudf::logic_error);
 }
 
+// Use __uint128_t for demonstration.
+constexpr __uint128_t operator""_uint128_t(const char* s)
+{
+  __uint128_t ret = 0;
+  for (int i = 0; s[i] != '\0'; ++i) {
+    ret *= 10;
+    if ('0' <= s[i] && s[i] <= '9') { ret += s[i] - '0'; }
+  }
+  return ret;
+}
+
+TEST_F(RoundTests, HalfEvenErrorsA)
+{
+  using namespace numeric;
+  using RepType    = cudf::device_storage_type_t<decimal128>;
+  using fp_wrapper = cudf::test::fixed_point_column_wrapper<RepType>;
+
+  {
+    // 0.5 at scale -37 should round HALF_EVEN to 0, because 0 is an even number
+    auto const input =
+      fp_wrapper{{5000000000000000000000000000000000000_uint128_t}, scale_type{-37}};
+    auto const expected = fp_wrapper{{0}, scale_type{0}};
+    auto const result   = cudf::round(input, 0, cudf::rounding_method::HALF_EVEN);
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+  }
+}
+
+TEST_F(RoundTests, HalfEvenErrorsB)
+{
+  using namespace numeric;
+  using RepType    = cudf::device_storage_type_t<decimal128>;
+  using fp_wrapper = cudf::test::fixed_point_column_wrapper<RepType>;
+
+  {
+    // 0.125 at scale -37 should round HALF_EVEN to 0.12, because 2 is an even number
+    auto const input =
+      fp_wrapper{{1250000000000000000000000000000000000_uint128_t}, scale_type{-37}};
+    auto const expected = fp_wrapper{{12}, scale_type{-2}};
+    auto const result   = cudf::round(input, 2, cudf::rounding_method::HALF_EVEN);
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+  }
+}
+
+TEST_F(RoundTests, HalfEvenErrorsC)
+{
+  using namespace numeric;
+  using RepType    = cudf::device_storage_type_t<decimal128>;
+  using fp_wrapper = cudf::test::fixed_point_column_wrapper<RepType>;
+
+  {
+    // 0.0625 at scale -37 should round HALF_EVEN to 0.062, because 2 is an even number
+    auto const input =
+      fp_wrapper{{0625000000000000000000000000000000000_uint128_t}, scale_type{-37}};
+    auto const expected = fp_wrapper{{62}, scale_type{-3}};
+    auto const result   = cudf::round(input, 3, cudf::rounding_method::HALF_EVEN);
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+  }
+}
+
+TEST_F(RoundTests, HalfUpErrorsA)
+{
+  using namespace numeric;
+  using RepType    = cudf::device_storage_type_t<decimal128>;
+  using fp_wrapper = cudf::test::fixed_point_column_wrapper<RepType>;
+
+  {
+    // 0.25 at scale -37 should round HALF_UP to 0.3
+    auto const input =
+      fp_wrapper{{2500000000000000000000000000000000000_uint128_t}, scale_type{-37}};
+    auto const expected = fp_wrapper{{3}, scale_type{-1}};
+    auto const result   = cudf::round(input, 1, cudf::rounding_method::HALF_UP);
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+  }
+}
+
 CUDF_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/streams/binaryop_test.cpp b/cpp/tests/streams/binaryop_test.cpp
new file mode 100644
index 00000000000..2520aed0458
--- /dev/null
+++ b/cpp/tests/streams/binaryop_test.cpp
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <tests/binaryop/util/runtime_support.h>
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/default_stream.hpp>
+
+#include <cudf/binaryop.hpp>
+#include <cudf/scalar/scalar.hpp>
+#include <cudf/utilities/type_dispatcher.hpp>
+
+class BinaryopTest : public cudf::test::BaseFixture {};
+
+TEST_F(BinaryopTest, ColumnColumn)
+{
+  cudf::test::fixed_width_column_wrapper<int32_t> lhs{10, 20, 30, 40, 50};
+  cudf::test::fixed_width_column_wrapper<int32_t> rhs{15, 25, 35, 45, 55};
+
+  cudf::binary_operation(lhs,
+                         rhs,
+                         cudf::binary_operator::ADD,
+                         cudf::data_type(cudf::type_to_id<int32_t>()),
+                         cudf::test::get_default_stream());
+}
+
+TEST_F(BinaryopTest, ColumnScalar)
+{
+  cudf::test::fixed_width_column_wrapper<int32_t> lhs{10, 20, 30, 40, 50};
+  cudf::numeric_scalar<int32_t> rhs{23, true, cudf::test::get_default_stream()};
+
+  cudf::binary_operation(lhs,
+                         rhs,
+                         cudf::binary_operator::ADD,
+                         cudf::data_type(cudf::type_to_id<int32_t>()),
+                         cudf::test::get_default_stream());
+}
+
+TEST_F(BinaryopTest, ScalarColumn)
+{
+  cudf::numeric_scalar<int32_t> lhs{42, true, cudf::test::get_default_stream()};
+  cudf::test::fixed_width_column_wrapper<int32_t> rhs{15, 25, 35, 45, 55};
+
+  cudf::binary_operation(lhs,
+                         rhs,
+                         cudf::binary_operator::ADD,
+                         cudf::data_type(cudf::type_to_id<int32_t>()),
+                         cudf::test::get_default_stream());
+}
+
+class BinaryopPTXTest : public BinaryopTest {
+ protected:
+  void SetUp() override
+  {
+    if (!can_do_runtime_jit()) { GTEST_SKIP() << "Skipping tests that require 11.5 runtime"; }
+  }
+};
+
+TEST_F(BinaryopPTXTest, ColumnColumnPTX)
+{
+  cudf::test::fixed_width_column_wrapper<int32_t> lhs{10, 20, 30, 40, 50};
+  cudf::test::fixed_width_column_wrapper<int64_t> rhs{15, 25, 35, 45, 55};
+
+  // c = a*a*a + b*b
+  char const* ptx =
+    R"***(
+//
+// Generated by NVIDIA NVVM Compiler
+//
+// Compiler Build ID: CL-24817639
+// Cuda compilation tools, release 10.0, V10.0.130
+// Based on LLVM 3.4svn
+//
+
+.version 6.3
+.target sm_70
+.address_size 64
+
+  // .globl _ZN8__main__7add$241Eix
+.common .global .align 8 .u64 _ZN08NumbaEnv8__main__7add$241Eix;
+.common .global .align 8 .u64 _ZN08NumbaEnv5numba7targets7numbers14int_power_impl12$3clocals$3e13int_power$242Exx;
+
+.visible .func (.param .b32 func_retval0) _ZN8__main__7add$241Eix(
+  .param .b64 _ZN8__main__7add$241Eix_param_0,
+  .param .b32 _ZN8__main__7add$241Eix_param_1,
+  .param .b64 _ZN8__main__7add$241Eix_param_2
+)
+{
+  .reg .b32 %r<3>;
+  .reg .b64 %rd<8>;
+
+
+  ld.param.u64 %rd1, [_ZN8__main__7add$241Eix_param_0];
+  ld.param.u32 %r1, [_ZN8__main__7add$241Eix_param_1];
+  ld.param.u64 %rd2, [_ZN8__main__7add$241Eix_param_2];
+  cvt.s64.s32 %rd3, %r1;
+  mul.wide.s32 %rd4, %r1, %r1;
+  mul.lo.s64 %rd5, %rd4, %rd3;
+  mul.lo.s64 %rd6, %rd2, %rd2;
+  add.s64 %rd7, %rd6, %rd5;
+  st.u64 [%rd1], %rd7;
+  mov.u32 %r2, 0;
+  st.param.b32 [func_retval0+0], %r2;
+  ret;
+}
+
+)***";
+
+  cudf::binary_operation(
+    lhs, rhs, ptx, cudf::data_type(cudf::type_to_id<int64_t>()), cudf::test::get_default_stream());
+  cudf::binary_operation(lhs, rhs, ptx, cudf::data_type(cudf::type_to_id<int64_t>()));
+}
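For reference, the PTX blob above is Numba-generated device code (note the NumbaEnv symbols) for the UDF described by the in-test comment, c = a*a*a + b*b. A plain C++ sketch of the per-row computation it performs (the helper name here is purely illustrative, not part of the change):

    #include <cstdint>

    // Matches the PTX body: mul.wide.s32 forms a*a as a 64-bit product,
    // mul.lo.s64 folds in a once more, and b*b is accumulated in 64 bits.
    inline int64_t cube_plus_square(int32_t a, int64_t b)
    {
      int64_t const a3 = static_cast<int64_t>(a) * a * a;  // a*a*a
      return a3 + b * b;                                   // + b*b
    }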
diff --git a/cpp/tests/streams/lists_test.cpp b/cpp/tests/streams/lists_test.cpp
new file mode 100644
index 00000000000..e292b551d83
--- /dev/null
+++ b/cpp/tests/streams/lists_test.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/default_stream.hpp>
+
+#include <cudf/lists/combine.hpp>
+#include <cudf/lists/contains.hpp>
+#include <cudf/lists/count_elements.hpp>
+
+class ListTest : public cudf::test::BaseFixture {};
+
+TEST_F(ListTest, ConcatenateRows)
+{
+  cudf::test::lists_column_wrapper<int32_t> list_col_1{{0, 1}, {2, 3}, {4, 5}};
+  cudf::test::lists_column_wrapper<int32_t> list_col_2{{0, 1}, {2, 3}, {4, 5}};
+  cudf::table_view lists_table({list_col_1, list_col_2});
+  cudf::lists::concatenate_rows(
+    lists_table, cudf::lists::concatenate_null_policy::IGNORE, cudf::test::get_default_stream());
+}
+
+TEST_F(ListTest, ConcatenateListElements)
+{
+  cudf::test::lists_column_wrapper<int32_t> ll_column{{{0, 1}, {2, 3}}, {{4, 5}, {6, 7}}};
+  cudf::lists::concatenate_list_elements(
+    ll_column, cudf::lists::concatenate_null_policy::IGNORE, cudf::test::get_default_stream());
+}
+
+TEST_F(ListTest, ContainsNulls)
+{
+  cudf::test::lists_column_wrapper<int32_t> list_col{{0, 1}, {2, 3}, {4, 5}};
+  cudf::lists::contains_nulls(list_col, cudf::test::get_default_stream());
+}
+
+TEST_F(ListTest, ContainsSearchKey)
+{
+  cudf::test::lists_column_wrapper<int32_t> list_col{{0, 1}, {2, 3}, {4, 5}};
+  cudf::numeric_scalar<int32_t> search_key(2, true, cudf::test::get_default_stream());
+  cudf::lists::contains(list_col, search_key, cudf::test::get_default_stream());
+}
+
+TEST_F(ListTest, ContainsSearchKeys)
+{
+  cudf::test::lists_column_wrapper<int32_t> list_col{{0, 1}, {2, 3}, {4, 5}};
+  cudf::test::fixed_width_column_wrapper<int32_t> search_keys({1, 2, 3});
+  cudf::lists::contains(list_col, search_keys, cudf::test::get_default_stream());
+}
+
+TEST_F(ListTest, IndexOfSearchKey)
+{
+  cudf::test::lists_column_wrapper<int32_t> list_col{{0, 1}, {2, 3}, {4, 5}};
+  cudf::numeric_scalar<int32_t> search_key(2, true, cudf::test::get_default_stream());
+  cudf::lists::index_of(list_col,
+                        search_key,
+                        cudf::lists::duplicate_find_option::FIND_FIRST,
+                        cudf::test::get_default_stream());
+}
+
+TEST_F(ListTest, IndexOfSearchKeys)
+{
+  cudf::test::lists_column_wrapper<int32_t> list_col{{0, 1}, {2, 3}, {4, 5}};
+  cudf::test::fixed_width_column_wrapper<int32_t> search_keys({1, 2, 3});
+  cudf::lists::index_of(list_col,
+                        search_keys,
+                        cudf::lists::duplicate_find_option::FIND_FIRST,
+                        cudf::test::get_default_stream());
+}
+
+TEST_F(ListTest, CountElements)
+{
+  cudf::test::lists_column_wrapper<int32_t> list_col{{0, 1}, {2, 3, 7}, {4, 5}};
+  cudf::lists::count_elements(list_col, cudf::test::get_default_stream());
+}
diff --git a/cpp/tests/strings/replace_tests.cpp b/cpp/tests/strings/replace_tests.cpp
index f143983aded..f04bb832f09 100644
--- a/cpp/tests/strings/replace_tests.cpp
+++ b/cpp/tests/strings/replace_tests.cpp
@@ -246,6 +246,28 @@ TEST_F(StringsReplaceTest, ReplaceEndOfString)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
 }
 
+TEST_F(StringsReplaceTest, ReplaceAdjacentMultiByteTarget)
+{
+  auto input        = cudf::test::strings_column_wrapper({"ééééééé", "eéeéeée", "eeeeeee"});
+  auto strings_view = cudf::strings_column_view(input);
+  // replace all occurrences of 'é' with 'e'
+  cudf::test::strings_column_wrapper expected({"eeeeeee", "eeeeeee", "eeeeeee"});
+
+  auto stream = cudf::get_default_stream();
+  auto mr     = rmm::mr::get_current_device_resource();
+
+  auto target  = cudf::string_scalar("é", true, stream);
+  auto repl    = cudf::string_scalar("e", true, stream);
+  auto results = cudf::strings::replace(strings_view, target, repl);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+  results = cudf::strings::detail::replace<cudf::strings::detail::replace_algorithm::CHAR_PARALLEL>(
+    strings_view, target, repl, -1, stream, mr);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+  results = cudf::strings::detail::replace<cudf::strings::detail::replace_algorithm::ROW_PARALLEL>(
+    strings_view, target, repl, -1, stream, mr);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+}
+
 TEST_F(StringsReplaceTest, ReplaceSlice)
 {
   std::vector<char const*> h_strings{"Héllo", "thesé", nullptr, "ARE THE", "tést strings", ""};
diff --git a/cpp/tests/unary/cast_tests.cpp b/cpp/tests/unary/cast_tests.cpp
index 9506e1918c0..d565359a4ea 100644
--- a/cpp/tests/unary/cast_tests.cpp
+++ b/cpp/tests/unary/cast_tests.cpp
@@ -30,6 +30,8 @@
 #include <thrust/host_vector.h>
 #include <thrust/iterator/counting_iterator.h>
 
+#include <cuda/std/limits>
+
 #include <numeric>
 #include <vector>
@@ -967,6 +969,44 @@ TYPED_TEST(FixedPointTests, Decimal128ToDecimalXXWithLargerScale)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
+TYPED_TEST(FixedPointTests, ValidateCastRescalePrecision)
+{
+  using namespace numeric;
+  using decimalXX  = TypeParam;
+  using RepType    = cudf::device_storage_type_t<decimalXX>;
+  using fp_wrapper = cudf::test::fixed_point_column_wrapper<RepType>;
+
+  // This test is designed to protect against floating point conversion
+  // introducing errors in fixed-point arithmetic. The rescaling that occurs
+  // during casting to different scales should only use fixed-precision math.
+  // Realistically, we are only able to show precision failures due to floating
+  // conversion in a few very specific circumstances where dividing by specific
+  // powers of 10 works against us. Some examples: 10^23, 10^25, 10^26, 10^27,
+  // 10^30, 10^32, 10^36. See https://godbolt.org/z/cP1MddP8P for a derivation.
+  // For completeness and to ensure that we are not missing any other cases, we
+  // test casting to/from all scales in the range of each decimal type. Values
+  // that are powers of ten show this error more readily than non-powers of 10
+  // because the rescaling factor is a power of 10, meaning that errors in
+  // division are more visible.
+  constexpr auto min_scale = -cuda::std::numeric_limits<RepType>::digits10;
+  for (int input_scale = 0; input_scale >= min_scale; --input_scale) {
+    for (int result_scale = 0; result_scale >= min_scale; --result_scale) {
+      RepType input_value = 1;
+      for (int k = 0; k > input_scale; --k) {
+        input_value *= 10;
+      }
+      RepType result_value = 1;
+      for (int k = 0; k > result_scale; --k) {
+        result_value *= 10;
+      }
+      auto const input    = fp_wrapper{{input_value}, scale_type{input_scale}};
+      auto const expected = fp_wrapper{{result_value}, scale_type{result_scale}};
+      auto const result   = cudf::cast(input, make_fixed_point_data_type<decimalXX>(result_scale));
+      CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+    }
+  }
+}
+
 TYPED_TEST(FixedPointTests, Decimal32ToDecimalXXWithLargerScaleAndNullMask)
 {
   using namespace numeric;
diff --git a/cpp/tests/unary/unary_ops_test.cpp b/cpp/tests/unary/unary_ops_test.cpp
index 49764f22373..76d1f769856 100644
--- a/cpp/tests/unary/unary_ops_test.cpp
+++ b/cpp/tests/unary/unary_ops_test.cpp
@@ -24,6 +24,8 @@
 
 #include <thrust/iterator/counting_iterator.h>
+
+#include <cuda/std/limits>
 
 template <typename T>
 cudf::test::fixed_width_column_wrapper<T> create_fixed_columns(cudf::size_type start,
                                                                cudf::size_type size,
@@ -372,4 +374,35 @@ TYPED_TEST(FixedPointUnaryTests, FixedPointUnaryFloorLarge)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
+TYPED_TEST(FixedPointUnaryTests, ValidateCeilFloorPrecision)
+{
+  using namespace numeric;
+  using decimalXX  = TypeParam;
+  using RepType    = cudf::device_storage_type_t<decimalXX>;
+  using fp_wrapper = cudf::test::fixed_point_column_wrapper<RepType>;
+
+  // This test is designed to protect against floating point conversion
+  // introducing errors in fixed-point arithmetic. The rounding that occurs
+  // during ceil/floor should only use fixed-precision math. Realistically,
+  // we are only able to show precision failures due to floating conversion in
+  // a few very specific circumstances where dividing by specific powers of 10
+  // works against us. Some examples: 10^23, 10^25, 10^26, 10^27, 10^30,
+  // 10^32, 10^36. See https://godbolt.org/z/cP1MddP8P for a derivation. For
+  // completeness and to ensure that we are not missing any other cases, we
+  // test all scales representable in the range of each decimal type.
+  constexpr auto min_scale = -cuda::std::numeric_limits<RepType>::digits10;
+  for (int input_scale = 0; input_scale >= min_scale; --input_scale) {
+    RepType input_value = 1;
+    for (int k = 0; k > input_scale; --k) {
+      input_value *= 10;
+    }
+    auto const input        = fp_wrapper{{input_value}, scale_type{input_scale}};
+    auto const expected     = fp_wrapper{{input_value}, scale_type{input_scale}};
+    auto const ceil_result  = cudf::unary_operation(input, cudf::unary_operator::CEIL);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, ceil_result->view());
+    auto const floor_result = cudf::unary_operation(input, cudf::unary_operator::FLOOR);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, floor_result->view());
+  }
+}
+
 CUDF_TEST_PROGRAM_MAIN()
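Both precision tests above turn on the same floating-point fact: large powers of ten are not exactly representable as 64-bit doubles, so a rescale that detours through floating point can corrupt the low digits. A minimal standalone check (GCC/Clang, using the same 128-bit integer extension the tests themselves rely on):

    #include <cstdio>

    int main()
    {
      // 10^23 = 2^23 * 5^23; the odd factor 5^23 needs 54 significand bits,
      // one more than a double's 53, so the conversion must round.
      __int128 pow10_23 = 1;
      for (int i = 0; i < 23; ++i) { pow10_23 *= 10; }

      double as_double = static_cast<double>(pow10_23);  // nearest double
      __int128 back    = static_cast<__int128>(as_double);

      // Prints 8388608 (= 2^23): double(10^23) is 99999999999999991611392.
      std::printf("error = %lld\n", static_cast<long long>(pow10_23 - back));
      return 0;
    }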
diff --git a/dependencies.yaml b/dependencies.yaml
index c8ee66bd99f..c19e8765be3 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -227,25 +227,9 @@ dependencies:
           # in sync with the version pinned in get_arrow.cmake.
           - libarrow==12.0.1.*
           - librdkafka>=1.9.0,<1.10.0a0
+          # Align nvcomp version with rapids-cmake
+          - nvcomp==2.6.1
           - spdlog>=1.11.0,<1.12
-    specific:
-      - output_types: conda
-        matrices:
-          - matrix:
-              arch: x86_64
-            packages:
-              # Align nvcomp version with rapids-cmake
-              # TODO: not yet available for aarch64 CUDA 12
-              - &nvcomp nvcomp==2.6.1
-          - matrix:
-              arch: aarch64
-              cuda: "11.8"
-            packages:
-              - *nvcomp
-          # TODO: Fallback matrix for aarch64 CUDA 12. After migrating to nvcomp 3,
-          # all CUDA/arch combinations should be supported by existing packages.
-          - matrix:
-            packages:
   build_wheels:
     common:
       - output_types: pyproject
diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt
index 947659c290a..1b543b94589 100644
--- a/python/cudf/cudf/_lib/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/CMakeLists.txt
@@ -107,8 +107,12 @@ if(${PYARROW_RESULT})
   message(FATAL_ERROR "Error while trying to obtain pyarrow include directory:\n${PYARROW_ERROR}")
 endif()
 
-set(targets_using_arrow_headers interop avro csv orc json parquet)
-foreach(target IN LISTS targets_using_arrow_headers)
+# TODO: Due to cudf's scalar.pyx needing to cimport pylibcudf's scalar.pyx (because there are parts
+# of cudf Cython that need to directly access the c_obj underlying the pylibcudf Scalar) the
+# requirement for arrow headers infects all of cudf. That in turn requires including numpy headers.
+# These requirements will go away once all scalar-related Cython code is removed from cudf.
+foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS)
+  target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}")
   target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}")
 endforeach()
 
diff --git a/python/cudf/cudf/_lib/datetime.pyx b/python/cudf/cudf/_lib/datetime.pyx
index 81949dbaa20..3d96f59c4d6 100644
--- a/python/cudf/cudf/_lib/datetime.pyx
+++ b/python/cudf/cudf/_lib/datetime.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.
 
 from cudf.core.buffer import acquire_spill_lock
 
@@ -10,6 +10,7 @@ from cudf._lib.column cimport Column
 from cudf._lib.cpp.column.column cimport column
 from cudf._lib.cpp.column.column_view cimport column_view
 from cudf._lib.cpp.filling cimport calendrical_month_sequence
+from cudf._lib.cpp.scalar.scalar cimport scalar
 from cudf._lib.cpp.types cimport size_type
 from cudf._lib.scalar cimport DeviceScalar
 
@@ -166,10 +167,11 @@ def date_range(DeviceScalar start, size_type n, offset):
             + offset.kwds.get("months", 0)
         )
 
+    cdef const scalar* c_start = start.c_value.get()
     with nogil:
         c_result = move(calendrical_month_sequence(
             n,
-            start.c_value.get()[0],
+            c_start[0],
            months
         ))
     return Column.from_unique_ptr(move(c_result))
diff --git a/python/cudf/cudf/_lib/interop.pyx b/python/cudf/cudf/_lib/interop.pyx
index 639754fc54f..8fd2a409d90 100644
--- a/python/cudf/cudf/_lib/interop.pyx
+++ b/python/cudf/cudf/_lib/interop.pyx
@@ -4,14 +4,7 @@
 from cpython cimport pycapsule
 from libcpp.memory cimport shared_ptr, unique_ptr
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
-from pyarrow.lib cimport (
-    CScalar,
-    CTable,
-    pyarrow_unwrap_scalar,
-    pyarrow_unwrap_table,
-    pyarrow_wrap_scalar,
-    pyarrow_wrap_table,
-)
+from pyarrow.lib cimport CTable, pyarrow_unwrap_table, pyarrow_wrap_table
 
 from cudf._lib.cpp.interop cimport (
     DLManagedTensor,
@@ -21,22 +14,12 @@ from cudf._lib.cpp.interop cimport (
     to_arrow as cpp_to_arrow,
     to_dlpack as cpp_to_dlpack,
 )
-from cudf._lib.cpp.scalar.scalar cimport fixed_point_scalar, scalar
 from cudf._lib.cpp.table.table cimport table
 from cudf._lib.cpp.table.table_view cimport table_view
-from cudf._lib.cpp.types cimport type_id
-from cudf._lib.cpp.wrappers.decimals cimport (
-    decimal32,
-    decimal64,
-    decimal128,
-    scale_type,
-)
-from cudf._lib.scalar cimport DeviceScalar
 from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns
 from cudf.api.types import is_list_dtype, is_struct_dtype
 from cudf.core.buffer import acquire_spill_lock
-from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype
 
 
 def from_dlpack(dlpack_capsule):
@@ -199,79 +182,3 @@ def from_arrow(object input_table):
         c_result = move(cpp_from_arrow(cpp_arrow_table.get()[0]))
 
     return columns_from_unique_ptr(move(c_result))
-
-
-@acquire_spill_lock()
-def to_arrow_scalar(DeviceScalar source_scalar):
-    """Convert a scalar to a PyArrow scalar.
-
-    Parameters
-    ----------
-    source_scalar : the scalar to convert
-
-    Returns
-    -------
-    pyarrow.lib.Scalar
-    """
-    cdef vector[column_metadata] cpp_metadata = gather_metadata(
-        [("", source_scalar.dtype)]
-    )
-    cdef const scalar* source_scalar_ptr = source_scalar.get_raw_ptr()
-
-    cdef shared_ptr[CScalar] cpp_arrow_scalar
-    with nogil:
-        cpp_arrow_scalar = cpp_to_arrow(
-            source_scalar_ptr[0], cpp_metadata[0]
-        )
-
-    return pyarrow_wrap_scalar(cpp_arrow_scalar)
-
-
-@acquire_spill_lock()
-def from_arrow_scalar(object input_scalar, output_dtype=None):
-    """Convert from PyArrow scalar to a cudf scalar.
-
-    Parameters
-    ----------
-    input_scalar : PyArrow scalar
-    output_dtype : output type to cast to, ignored except for decimals
-
-    Returns
-    -------
-    cudf._lib.DeviceScalar
-    """
-    cdef shared_ptr[CScalar] cpp_arrow_scalar = (
-        pyarrow_unwrap_scalar(input_scalar)
-    )
-    cdef unique_ptr[scalar] c_result
-
-    with nogil:
-        c_result = move(cpp_from_arrow(cpp_arrow_scalar.get()[0]))
-
-    cdef type_id ctype = c_result.get().type().id()
-    if ctype == type_id.DECIMAL128:
-        if output_dtype is None:
-            # Decimals must be cast to the cudf dtype of the right width
-            raise ValueError(
-                "Decimal scalars must be constructed with a dtype"
-            )
-
-        if isinstance(output_dtype, Decimal32Dtype):
-            c_result.reset(
-                new fixed_point_scalar[decimal32](
-                    (<fixed_point_scalar[decimal128]*> c_result.get()).value(),
-                    scale_type(-input_scalar.type.scale),
-                    c_result.get().is_valid()
-                )
-            )
-        elif isinstance(output_dtype, Decimal64Dtype):
-            c_result.reset(
-                new fixed_point_scalar[decimal64](
-                    (<fixed_point_scalar[decimal128]*> c_result.get()).value(),
-                    scale_type(-input_scalar.type.scale),
-                    c_result.get().is_valid()
-                )
-            )
-        # Decimal128Dtype is a no-op, no conversion needed.
-
-    return DeviceScalar.from_unique_ptr(move(c_result), output_dtype)
diff --git a/python/cudf/cudf/_lib/nvtext/CMakeLists.txt b/python/cudf/cudf/_lib/nvtext/CMakeLists.txt
index 515b9c1d6e4..d4e2392ee04 100644
--- a/python/cudf/cudf/_lib/nvtext/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/nvtext/CMakeLists.txt
@@ -22,3 +22,11 @@ rapids_cython_create_modules(
   SOURCE_FILES "${cython_sources}"
   LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX nvtext_ ASSOCIATED_TARGETS cudf
 )
+# TODO: Due to cudf's scalar.pyx needing to cimport pylibcudf's scalar.pyx (because there are parts
+# of cudf Cython that need to directly access the c_obj underlying the pylibcudf Scalar) the
+# requirement for arrow headers infects all of cudf. That in turn requires including numpy headers.
+# These requirements will go away once all scalar-related Cython code is removed from cudf.
+foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS)
+  target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}")
+  target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}")
+endforeach()
diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
index 0ce42dc43ff..5185b2d4bb5 100644
--- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
@@ -12,10 +12,33 @@
 # the License.
 # =============================================================================
 
-set(cython_sources column.pyx copying.pyx gpumemoryview.pyx table.pyx types.pyx utils.pyx)
+set(cython_sources column.pyx copying.pyx gpumemoryview.pyx interop.pyx scalar.pyx table.pyx
+    types.pyx utils.pyx
+)
 set(linked_libraries cudf::cudf)
 rapids_cython_create_modules(
   CXX
   SOURCE_FILES "${cython_sources}"
   LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_ ASSOCIATED_TARGETS cudf
 )
+
+find_package(Python 3.9 REQUIRED COMPONENTS Interpreter)
+
+execute_process(
+  COMMAND "${Python_EXECUTABLE}" -c "import pyarrow; print(pyarrow.get_include())"
+  OUTPUT_VARIABLE PYARROW_INCLUDE_DIR
+  OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+
+foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS)
+  target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}")
+endforeach()
+
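+# (pyarrow.get_include() is the supported way to locate the Arrow headers that
+# ship inside the pyarrow package, analogous to numpy.get_include(); querying
+# it through ${Python_EXECUTABLE} keeps the build tied to the active Python.)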
+# TODO: Clean up this include when switching to scikit-build-core. See cudf/_lib/CMakeLists.txt for
+# more info
+find_package(NumPy REQUIRED)
+foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS)
+  target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}")
+  # Switch to the line below when we switch back to FindPython.cmake in CMake 3.24.
+  # target_include_directories(${target} PRIVATE "${Python_NumPy_INCLUDE_DIRS}")
+endforeach()
diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd
index ba7822b0a54..7a35854392c 100644
--- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd
@@ -1,9 +1,10 @@
 # Copyright (c) 2023, NVIDIA CORPORATION.
 
 # TODO: Verify consistent usage of relative/absolute imports in pylibcudf.
-from . cimport copying
+from . cimport copying, interop
 from .column cimport Column
 from .gpumemoryview cimport gpumemoryview
+from .scalar cimport Scalar
 from .table cimport Table
 # TODO: cimport type_id once
 # https://github.com/cython/cython/issues/5609 is resolved
@@ -12,7 +13,9 @@ from .types cimport DataType
 __all__ = [
     "Column",
     "DataType",
+    "Scalar",
     "Table",
     "copying",
     "gpumemoryview",
+    "interop",
 ]
diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py
index 3edff9a53e8..72b74a57b87 100644
--- a/python/cudf/cudf/_lib/pylibcudf/__init__.py
+++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py
@@ -1,16 +1,19 @@
 # Copyright (c) 2023, NVIDIA CORPORATION.
 
-from . import copying
+from . import copying, interop
 from .column import Column
 from .gpumemoryview import gpumemoryview
+from .scalar import Scalar
 from .table import Table
 from .types import DataType, TypeId
 
 __all__ = [
     "Column",
     "DataType",
+    "Scalar",
     "Table",
     "TypeId",
     "copying",
     "gpumemoryview",
+    "interop",
 ]
diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pxd b/python/cudf/cudf/_lib/pylibcudf/interop.pxd
new file mode 100644
index 00000000000..3a79e5425d4
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/interop.pxd
@@ -0,0 +1,9 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+
+from cudf._lib.cpp.interop cimport column_metadata
+
+
+cdef class ColumnMetadata:
+    cdef public object name
+    cdef public object children_meta
+    cdef column_metadata to_libcudf(self)
diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pyx b/python/cudf/cudf/_lib/pylibcudf/interop.pyx
new file mode 100644
index 00000000000..0cdca275027
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/interop.pyx
@@ -0,0 +1,23 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+
+from cudf._lib.cpp.interop cimport column_metadata
+
+
+cdef class ColumnMetadata:
+    def __init__(self, name):
+        self.name = name
+        self.children_meta = []
+
+    cdef column_metadata to_libcudf(self):
+        """Convert to C++ column_metadata.
+
+        Since this class is mutable and cheap, it is easier to create the C++
+        object on the fly rather than have it directly backing the storage for
+        the Cython class.
+        """
+        cdef column_metadata c_metadata
+        cdef ColumnMetadata child_meta
+        c_metadata.name = self.name.encode()
+        for child_meta in self.children_meta:
+            c_metadata.children_meta.push_back(child_meta.to_libcudf())
+        return c_metadata
diff --git a/python/cudf/cudf/_lib/pylibcudf/scalar.pxd b/python/cudf/cudf/_lib/pylibcudf/scalar.pxd
new file mode 100644
index 00000000000..09d853d832f
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/scalar.pxd
@@ -0,0 +1,32 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+
+from libcpp cimport bool
+from libcpp.memory cimport unique_ptr
+from pyarrow cimport lib as pa
+
+from rmm._lib.memory_resource cimport DeviceMemoryResource
+
+from cudf._lib.cpp.scalar.scalar cimport scalar
+
+from .interop cimport ColumnMetadata
+from .types cimport DataType
+
+
+cdef class Scalar:
+    cdef unique_ptr[scalar] c_obj
+    cdef DataType _data_type
+
+    # Holds a reference to the DeviceMemoryResource used for allocation.
+    # Ensures the MR does not get destroyed before this Scalar. `mr` is
+    # needed for deallocation
+    cdef DeviceMemoryResource mr
+
+    cdef const scalar* get(self) except *
+
+    cpdef DataType type(self)
+    cpdef bool is_valid(self)
+
+    @staticmethod
+    cdef Scalar from_libcudf(unique_ptr[scalar] libcudf_scalar, dtype=*)
+
+    cpdef pa.Scalar to_arrow(self, ColumnMetadata metadata)
diff --git a/python/cudf/cudf/_lib/pylibcudf/scalar.pyx b/python/cudf/cudf/_lib/pylibcudf/scalar.pyx
new file mode 100644
index 00000000000..04f588bd3e6
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/scalar.pyx
@@ -0,0 +1,133 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+
+from cython cimport no_gc_clear
+from cython.operator cimport dereference
+from libcpp.memory cimport shared_ptr, unique_ptr
+from libcpp.utility cimport move
+from pyarrow cimport lib as pa
+
+from rmm._lib.memory_resource cimport get_current_device_resource
+
+from cudf._lib.cpp.interop cimport (
+    column_metadata,
+    from_arrow as cpp_from_arrow,
+    to_arrow as cpp_to_arrow,
+)
+from cudf._lib.cpp.scalar.scalar cimport fixed_point_scalar, scalar
+from cudf._lib.cpp.wrappers.decimals cimport (
+    decimal32,
+    decimal64,
+    decimal128,
+    scale_type,
+)
+
+from .interop cimport ColumnMetadata
+from .types cimport DataType, type_id
+
+
+# The DeviceMemoryResource attribute could be released prematurely
+# by the gc if the Scalar is in a reference cycle. Removing the tp_clear
+# function with the no_gc_clear decoration prevents that. See
+# https://github.com/rapidsai/rmm/pull/931 for details.
+@no_gc_clear
+cdef class Scalar:
+    """A scalar value in device memory."""
+    # Unlike for columns, libcudf does not support scalar views. All APIs that
+    # accept scalar values accept references to the owning object rather than a
+    # special view type. As a result, pylibcudf.Scalar has a simpler structure
+    # than pylibcudf.Column because it can be a true wrapper around a libcudf
+    # scalar
+
+    def __cinit__(self, *args, **kwargs):
+        self.mr = get_current_device_resource()
+
+    def __init__(self, pa.Scalar value=None):
+        # TODO: This case is not something we really want to
+        # support, but it here for now to ease the transition of
+        # DeviceScalar.
+        if value is not None:
+            raise ValueError("Scalar should be constructed with a factory")
+
+    @staticmethod
+    def from_arrow(pa.Scalar value, DataType data_type=None):
+        # Allow passing a dtype, but only for the purpose of decimals for now
+
+        cdef shared_ptr[pa.CScalar] cscalar = (
+            pa.pyarrow_unwrap_scalar(value)
+        )
+        cdef unique_ptr[scalar] c_result
+
+        with nogil:
+            c_result = move(cpp_from_arrow(cscalar.get()[0]))
+
+        cdef Scalar s = Scalar.from_libcudf(move(c_result))
+
+        if s.type().id() != type_id.DECIMAL128:
+            if data_type is not None:
+                raise ValueError(
+                    "dtype may not be passed for non-decimal types"
+                )
+            return s
+
+        if data_type is None:
+            raise ValueError(
+                "Decimal scalars must be constructed with a dtype"
+            )
+
+        cdef type_id tid = data_type.id()
+
+        if tid == type_id.DECIMAL32:
+            s.c_obj.reset(
+                new fixed_point_scalar[decimal32](
+                    (<fixed_point_scalar[decimal128]*> s.c_obj.get()).value(),
+                    scale_type(-value.type.scale),
+                    s.c_obj.get().is_valid()
+                )
+            )
+        elif tid == type_id.DECIMAL64:
+            s.c_obj.reset(
+                new fixed_point_scalar[decimal64](
+                    (<fixed_point_scalar[decimal128]*> s.c_obj.get()).value(),
+                    scale_type(-value.type.scale),
+                    s.c_obj.get().is_valid()
+                )
+            )
+        elif tid != type_id.DECIMAL128:
+            raise ValueError(
+                "Decimal scalars may only be cast to decimals"
+            )
+
+        return s
+
+    cpdef pa.Scalar to_arrow(self, ColumnMetadata metadata):
+        cdef shared_ptr[pa.CScalar] c_result
+        cdef column_metadata c_metadata = metadata.to_libcudf()
+
+        with nogil:
+            c_result = move(cpp_to_arrow(dereference(self.c_obj.get()), c_metadata))
+
+        return pa.pyarrow_wrap_scalar(c_result)
+
+    cdef const scalar* get(self) except *:
+        return self.c_obj.get()
+
+    cpdef DataType type(self):
+        """The type of data in the column."""
+        return self._data_type
+
+    cpdef bool is_valid(self):
+        """True if the scalar is valid, false if not"""
+        return self.get().is_valid()
+
+    @staticmethod
+    cdef Scalar from_libcudf(unique_ptr[scalar] libcudf_scalar, dtype=None):
+        """Construct a Scalar object from a libcudf scalar.
+
+        This method is for pylibcudf's functions to use to ingest outputs of
+        calling libcudf algorithms, and should generally not be needed by users
+        (even direct pylibcudf Cython users).
+        """
+        cdef Scalar s = Scalar.__new__(Scalar)
+        s.c_obj.swap(libcudf_scalar)
+        s._data_type = DataType.from_libcudf(s.get().type())
+        return s
diff --git a/python/cudf/cudf/_lib/pylibcudf/table.pxd b/python/cudf/cudf/_lib/pylibcudf/table.pxd
index 95f197b13eb..a9e2874232a 100644
--- a/python/cudf/cudf/_lib/pylibcudf/table.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/table.pxd
@@ -1,6 +1,7 @@
 # Copyright (c) 2023, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
+from pyarrow cimport lib as pa
 
 from cudf._lib.cpp.table.table cimport table
 from cudf._lib.cpp.table.table_view cimport table_view
@@ -16,3 +17,5 @@ cdef class Table:
     cdef Table from_libcudf(unique_ptr[table] libcudf_tbl)
 
     cpdef list columns(self)
+
+    cpdef pa.Table to_arrow(self, list metadata)
diff --git a/python/cudf/cudf/_lib/pylibcudf/table.pyx b/python/cudf/cudf/_lib/pylibcudf/table.pyx
index 720f9815bd6..c41eb82e4a1 100644
--- a/python/cudf/cudf/_lib/pylibcudf/table.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/table.pyx
@@ -1,15 +1,22 @@
 # Copyright (c) 2023, NVIDIA CORPORATION.
 
 from cython.operator cimport dereference
-from libcpp.memory cimport unique_ptr
+from libcpp.memory cimport shared_ptr, unique_ptr
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
+from pyarrow cimport lib as pa
 
 from cudf._lib.cpp.column.column cimport column
 from cudf._lib.cpp.column.column_view cimport column_view
+from cudf._lib.cpp.interop cimport (
+    column_metadata,
+    from_arrow as cpp_from_arrow,
+    to_arrow as cpp_to_arrow,
+)
 from cudf._lib.cpp.table.table cimport table
 
 from .column cimport Column
+from .interop cimport ColumnMetadata
 
 
 cdef class Table:
@@ -60,3 +67,27 @@ cdef class Table:
 
     cpdef list columns(self):
         return self._columns
+
+    @staticmethod
+    def from_arrow(pa.Table pyarrow_table):
+        cdef shared_ptr[pa.CTable] ctable = (
+            pa.pyarrow_unwrap_table(pyarrow_table)
+        )
+        cdef unique_ptr[table] c_result
+
+        with nogil:
+            c_result = move(cpp_from_arrow(ctable.get()[0]))
+
+        return Table.from_libcudf(move(c_result))
+
+    cpdef pa.Table to_arrow(self, list metadata):
+        cdef shared_ptr[pa.CTable] c_result
+        cdef vector[column_metadata] c_metadata
+        cdef ColumnMetadata meta
+        for meta in metadata:
+            c_metadata.push_back(meta.to_libcudf())
+
+        with nogil:
+            c_result = move(cpp_to_arrow(self.view(), c_metadata))
+
+        return pa.pyarrow_wrap_table(c_result)
diff --git a/python/cudf/cudf/_lib/scalar.pxd b/python/cudf/cudf/_lib/scalar.pxd
index 1deed60d67d..77733f59c3d 100644
--- a/python/cudf/cudf/_lib/scalar.pxd
+++ b/python/cudf/cudf/_lib/scalar.pxd
@@ -1,20 +1,19 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.
 
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 
 from rmm._lib.memory_resource cimport DeviceMemoryResource
 
+# TODO: Would like to remove this cimport, but it will require some more work
+# to excise all C code in scalar.pyx that relies on using the C API of the
+# pylibcudf Scalar underlying the DeviceScalar.
+from cudf._lib cimport pylibcudf
 from cudf._lib.cpp.scalar.scalar cimport scalar
 
 
 cdef class DeviceScalar:
-    cdef unique_ptr[scalar] c_value
-
-    # Holds a reference to the DeviceMemoryResource used for allocation.
-    # Ensures the MR does not get destroyed before this DeviceBuffer. `mr` is
-    # needed for deallocation
-    cdef DeviceMemoryResource mr
+    cdef pylibcudf.Scalar c_value
 
     cdef object _dtype
diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx
index 5ab286c5701..0b64c75f7b6 100644
--- a/python/cudf/cudf/_lib/scalar.pyx
+++ b/python/cudf/cudf/_lib/scalar.pyx
@@ -1,7 +1,5 @@
 # Copyright (c) 2020-2023, NVIDIA CORPORATION.
 
-cimport cython
-
 import copy
 
 import numpy as np
@@ -13,17 +11,17 @@ from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 
-from rmm._lib.memory_resource cimport get_current_device_resource
-
 import cudf
+from cudf._lib import pylibcudf
 from cudf._lib.types import LIBCUDF_TO_SUPPORTED_NUMPY_TYPES
-from cudf.core.dtypes import ListDtype, StructDtype
+from cudf.core.dtypes import (
+    ListDtype,
+    StructDtype,
+    is_list_dtype,
+    is_struct_dtype,
+)
 from cudf.core.missing import NA, NaT
 
-from cudf._lib.types cimport dtype_from_column_view, underlying_type_t_type_id
-
-from cudf._lib.interop import from_arrow_scalar, to_arrow_scalar
-
 cimport cudf._lib.cpp.types as libcudf_types
 from cudf._lib.cpp.scalar.scalar cimport (
     duration_scalar,
@@ -44,6 +42,7 @@ from cudf._lib.cpp.wrappers.timestamps cimport (
     timestamp_s,
     timestamp_us,
 )
+from cudf._lib.types cimport dtype_from_column_view, underlying_type_t_type_id
 
 
 def _replace_nested(obj, check, replacement):
@@ -61,15 +60,44 @@ def _replace_nested(obj, check, replacement):
             _replace_nested(v, check, replacement)
 
 
-# The DeviceMemoryResource attribute could be released prematurely
-# by the gc if the DeviceScalar is in a reference cycle. Removing
-# the tp_clear function with the no_gc_clear decoration prevents that.
-# See https://github.com/rapidsai/rmm/pull/931 for details.
-@cython.no_gc_clear
+def gather_metadata(dtypes):
+    """Convert a dict of dtypes to a list of ColumnMetadata objects.
+
+    The metadata is constructed recursively so that nested types are
+    represented as nested ColumnMetadata objects.
+
+    Parameters
+    ----------
+    dtypes : dict
+        A dict mapping column names to dtypes.
+
+    Returns
+    -------
+    List[ColumnMetadata]
+        A list of ColumnMetadata objects.
+    """
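+    # For example, {"s": StructDtype({"a": ..., "b": ...})} becomes a single
+    # ColumnMetadata("s") whose children_meta holds ColumnMetadata("a") and
+    # ColumnMetadata("b"); a list dtype contributes an unnamed offsets child
+    # followed by the metadata of its element type.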
+    out = []
+    for name, dtype in dtypes.items():
+        v = pylibcudf.interop.ColumnMetadata(name)
+        if is_struct_dtype(dtype):
+            v.children_meta = gather_metadata(dtype.fields)
+        elif is_list_dtype(dtype):
+            # Offsets column is unnamed and has no children
+            v.children_meta.append(pylibcudf.interop.ColumnMetadata(""))
+            v.children_meta.extend(
+                gather_metadata({"": dtype.element_type})
+            )
+        out.append(v)
+    return out
+
+
 cdef class DeviceScalar:
+    # TODO: I think this should be removable, except that currently the way
+    # that from_unique_ptr is implemented is probably dereferencing this in an
+    # invalid state. See what the best way to fix that is.
     def __cinit__(self, *args, **kwargs):
-        self.mr = get_current_device_resource()
+        self.c_value = pylibcudf.Scalar()
 
     def __init__(self, value, dtype):
         """
@@ -85,7 +113,7 @@ cdef class DeviceScalar:
         dtype : dtype
             A NumPy dtype.
         """
-        self._dtype = dtype if dtype.kind != 'U' else cudf.dtype('object')
+        dtype = dtype if dtype.kind != 'U' else cudf.dtype('object')
 
         if cudf.utils.utils.is_na_like(value):
             value = None
@@ -108,10 +136,17 @@ cdef class DeviceScalar:
 
         pa_scalar = pa.scalar(value, type=pa_type)
 
-        # Note: This factory-like behavior in __init__ will be removed when
-        # migrating to pylibcudf.
-        cdef DeviceScalar obj = from_arrow_scalar(pa_scalar, self._dtype)
-        self.c_value.swap(obj.c_value)
+        data_type = None
+        if isinstance(dtype, cudf.core.dtypes.DecimalDtype):
+            tid = pylibcudf.TypeId.DECIMAL128
+            if isinstance(dtype, cudf.core.dtypes.Decimal32Dtype):
+                tid = pylibcudf.TypeId.DECIMAL32
+            elif isinstance(dtype, cudf.core.dtypes.Decimal64Dtype):
+                tid = pylibcudf.TypeId.DECIMAL64
+            data_type = pylibcudf.DataType(tid, -dtype.scale)
+
+        self.c_value = pylibcudf.Scalar.from_arrow(pa_scalar, data_type)
+        self._dtype = dtype
 
     def _to_host_scalar(self):
         is_datetime = self.dtype.kind == "M"
@@ -119,7 +154,8 @@ cdef class DeviceScalar:
 
         null_type = NaT if is_datetime or is_timedelta else NA
 
-        ps = to_arrow_scalar(self)
+        metadata = gather_metadata({"": self.dtype})[0]
+        ps = self.c_value.to_arrow(metadata)
         if not ps.is_valid:
             return null_type
 
@@ -158,13 +194,13 @@ cdef class DeviceScalar:
         return self._to_host_scalar()
 
     cdef const scalar* get_raw_ptr(self) except *:
-        return self.c_value.get()
+        return self.c_value.c_obj.get()
 
     cpdef bool is_valid(self):
        """
        Returns if the Scalar is valid or not(i.e., <NA>).
        """
-        return self.get_raw_ptr()[0].is_valid()
+        return self.c_value.is_valid()
 
     def __repr__(self):
         if cudf.utils.utils.is_na_like(self.value):
@@ -183,7 +219,7 @@ cdef class DeviceScalar:
         cdef DeviceScalar s = DeviceScalar.__new__(DeviceScalar)
         cdef libcudf_types.data_type cdtype
 
-        s.c_value = move(ptr)
+        s.c_value = pylibcudf.Scalar.from_libcudf(move(ptr))
         cdtype = s.get_raw_ptr()[0].type()
 
         if dtype is not None:
@@ -310,9 +346,9 @@ def _create_proxy_nat_scalar(dtype):
     if dtype.char in 'mM':
         nat = dtype.type('NaT').astype(dtype)
         if dtype.type == np.datetime64:
-            _set_datetime64_from_np_scalar(result.c_value, nat, dtype, True)
+            _set_datetime64_from_np_scalar(result.c_value.c_obj, nat, dtype, True)
         elif dtype.type == np.timedelta64:
-            _set_timedelta64_from_np_scalar(result.c_value, nat, dtype, True)
+            _set_timedelta64_from_np_scalar(result.c_value.c_obj, nat, dtype, True)
         return result
     else:
         raise TypeError('NAT only valid for datetime and timedelta')
diff --git a/python/cudf/cudf/_lib/strings/CMakeLists.txt b/python/cudf/cudf/_lib/strings/CMakeLists.txt
index a5e87a456cb..fc11f047ab4 100644
--- a/python/cudf/cudf/_lib/strings/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/strings/CMakeLists.txt
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -40,6 +40,14 @@ rapids_cython_create_modules(
   SOURCE_FILES "${cython_sources}"
   LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX strings_ ASSOCIATED_TARGETS cudf
 )
+# TODO: Due to cudf's scalar.pyx needing to cimport pylibcudf's scalar.pyx (because there are parts
+# of cudf Cython that need to directly access the c_obj underlying the pylibcudf Scalar) the
+# requirement for arrow headers infects all of cudf. That requirement will go away once all
+# scalar-related Cython code is removed from cudf.
+foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS)
+  target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}")
+  target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}")
+endforeach()
 
 add_subdirectory(convert)
 add_subdirectory(split)
diff --git a/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt b/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt
index 434f79d3b5f..f55bb1fb780 100644
--- a/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -22,3 +22,11 @@ rapids_cython_create_modules(
   SOURCE_FILES "${cython_sources}"
   LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX strings_ ASSOCIATED_TARGETS cudf
 )
+# TODO: Due to cudf's scalar.pyx needing to cimport pylibcudf's scalar.pyx (because there are parts
+# of cudf Cython that need to directly access the c_obj underlying the pylibcudf Scalar) the
+# requirement for arrow headers infects all of cudf. That requirement will go away once all
+# scalar-related Cython code is removed from cudf.
+foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS)
+  target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}")
+  target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}")
+endforeach()
diff --git a/python/cudf/cudf/_lib/strings/split/CMakeLists.txt b/python/cudf/cudf/_lib/strings/split/CMakeLists.txt
index 59a22c06e85..2f2063482af 100644
--- a/python/cudf/cudf/_lib/strings/split/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/strings/split/CMakeLists.txt
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -20,3 +20,11 @@ rapids_cython_create_modules(
   SOURCE_FILES "${cython_sources}"
   LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX strings_ ASSOCIATED_TARGETS cudf
 )
+# TODO: Due to cudf's scalar.pyx needing to cimport pylibcudf's scalar.pyx (because there are parts
+# of cudf Cython that need to directly access the c_obj underlying the pylibcudf Scalar) the
+# requirement for arrow headers infects all of cudf. That requirement will go away once all
+# scalar-related Cython code is removed from cudf.
+foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS)
+  target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}")
+  target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}")
+endforeach()