From 8907a9020a4ff9e3361d6dcf84e4f12fb2f985c8 Mon Sep 17 00:00:00 2001 From: db Date: Fri, 6 Oct 2023 10:59:22 -0500 Subject: [PATCH 1/6] Cleanup of namespaces in parquet. The ::detail::parquet namespace has been changed to ::parquet::detail, ::parquet::gpu has been renamed to ::parquet::detail, and several detail-style files which were just using ::parquet have been moved into parquet::detail. --- cpp/include/cudf/io/detail/parquet.hpp | 8 +- cpp/include/cudf/io/parquet.hpp | 4 +- cpp/src/io/functions.cpp | 4 +- cpp/src/io/parquet/chunk_dict.cu | 12 +- .../io/parquet/compact_protocol_reader.cpp | 2 + .../io/parquet/compact_protocol_reader.hpp | 3 + .../io/parquet/compact_protocol_writer.cpp | 2 + .../io/parquet/compact_protocol_writer.hpp | 2 + cpp/src/io/parquet/decode_preprocess.cu | 4 +- cpp/src/io/parquet/delta_binary.cuh | 4 +- cpp/src/io/parquet/page_data.cu | 6 +- cpp/src/io/parquet/page_decode.cuh | 4 +- cpp/src/io/parquet/page_delta_decode.cu | 6 +- cpp/src/io/parquet/page_enc.cu | 16 +- cpp/src/io/parquet/page_hdr.cu | 7 +- cpp/src/io/parquet/page_string_decode.cu | 8 +- cpp/src/io/parquet/page_string_utils.cuh | 4 +- cpp/src/io/parquet/parquet.hpp | 3 + cpp/src/io/parquet/parquet_common.hpp | 3 + cpp/src/io/parquet/parquet_gpu.cuh | 4 +- cpp/src/io/parquet/parquet_gpu.hpp | 27 +- cpp/src/io/parquet/predicate_pushdown.cpp | 24 +- cpp/src/io/parquet/reader.cpp | 4 +- cpp/src/io/parquet/reader_impl.cpp | 36 +- cpp/src/io/parquet/reader_impl.hpp | 12 +- cpp/src/io/parquet/reader_impl_chunking.cu | 597 ++++++++++++++++++ cpp/src/io/parquet/reader_impl_helpers.cpp | 121 ++-- cpp/src/io/parquet/reader_impl_helpers.hpp | 21 +- cpp/src/io/parquet/reader_impl_preprocess.cu | 259 ++++---- cpp/src/io/parquet/rle_stream.cuh | 4 +- cpp/src/io/parquet/writer_impl.cu | 217 ++++--- cpp/src/io/parquet/writer_impl.hpp | 22 +- cpp/src/io/utilities/column_buffer.cpp | 10 +- cpp/tests/io/parquet_test.cpp | 207 +++--- 34 files changed, 1142 insertions(+), 525 deletions(-) create mode 100644 cpp/src/io/parquet/reader_impl_chunking.cu diff --git a/cpp/include/cudf/io/detail/parquet.hpp b/cpp/include/cudf/io/detail/parquet.hpp index 074f690d2c7..0b8ee9676de 100644 --- a/cpp/include/cudf/io/detail/parquet.hpp +++ b/cpp/include/cudf/io/detail/parquet.hpp @@ -38,7 +38,7 @@ class parquet_reader_options; class parquet_writer_options; class chunked_parquet_writer_options; -namespace detail::parquet { +namespace parquet::detail { /** * @brief Class to read Parquet dataset data into columns. @@ -186,7 +186,7 @@ class writer { */ explicit writer(std::vector> sinks, parquet_writer_options const& options, - single_write_mode mode, + cudf::io::detail::single_write_mode mode, rmm::cuda_stream_view stream); /** @@ -201,7 +201,7 @@ class writer { */ explicit writer(std::vector> sinks, chunked_parquet_writer_options const& options, - single_write_mode mode, + cudf::io::detail::single_write_mode mode, rmm::cuda_stream_view stream); /** @@ -250,5 +250,5 @@ class writer { * metadata. */ parquet_metadata read_parquet_metadata(host_span const> sources); -} // namespace detail::parquet +} // namespace parquet::detail } // namespace cudf::io diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index deaf23d405a..06ac9caac75 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -499,7 +499,7 @@ class chunked_parquet_reader { [[nodiscard]] table_with_metadata read_chunk() const; private: - std::unique_ptr reader; + std::unique_ptr reader; }; /** @} */ // end of group @@ -1750,7 +1750,7 @@ class parquet_chunked_writer { std::vector const& column_chunks_file_paths = {}); /// Unique pointer to impl writer class - std::unique_ptr writer; + std::unique_ptr writer; }; /** @} */ // end of group diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 392a7850886..726442d752e 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -470,8 +470,8 @@ void orc_chunked_writer::close() writer->close(); } -using namespace cudf::io::detail::parquet; -namespace detail_parquet = cudf::io::detail::parquet; +using namespace cudf::io::parquet::detail; +namespace detail_parquet = cudf::io::parquet::detail; table_with_metadata read_parquet(parquet_reader_options const& options, rmm::mr::device_memory_resource* mr) diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index 9ff1869edde..86678fe58d5 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -27,7 +27,7 @@ namespace cudf { namespace io { namespace parquet { -namespace gpu { +namespace detail { namespace { constexpr int DEFAULT_BLOCK_SIZE = 256; } @@ -101,7 +101,7 @@ struct map_find_fn { template __global__ void __launch_bounds__(block_size) - populate_chunk_hash_maps_kernel(cudf::detail::device_2dspan frags) + populate_chunk_hash_maps_kernel(cudf::detail::device_2dspan frags) { auto col_idx = blockIdx.y; auto block_x = blockIdx.x; @@ -226,7 +226,7 @@ __global__ void __launch_bounds__(block_size) template __global__ void __launch_bounds__(block_size) - get_dictionary_indices_kernel(cudf::detail::device_2dspan frags) + get_dictionary_indices_kernel(cudf::detail::device_2dspan frags) { auto col_idx = blockIdx.y; auto block_x = blockIdx.x; @@ -276,7 +276,7 @@ void initialize_chunk_hash_maps(device_span chunks, rmm::cuda_st <<>>(chunks); } -void populate_chunk_hash_maps(cudf::detail::device_2dspan frags, +void populate_chunk_hash_maps(cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream) { dim3 const dim_grid(frags.size().second, frags.size().first); @@ -290,14 +290,14 @@ void collect_map_entries(device_span chunks, rmm::cuda_stream_vi collect_map_entries_kernel<<>>(chunks); } -void get_dictionary_indices(cudf::detail::device_2dspan frags, +void get_dictionary_indices(cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream) { dim3 const dim_grid(frags.size().second, frags.size().first); get_dictionary_indices_kernel <<>>(frags); } -} // namespace gpu +} // namespace detail } // namespace parquet } // namespace io } // namespace cudf diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index 5c7b8ca3f8c..bf4bdb47cec 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -24,6 +24,7 @@ namespace cudf { namespace io { namespace parquet { +namespace detail { /** * @brief Base class for parquet field functors. @@ -870,6 +871,7 @@ int CompactProtocolReader::WalkSchema( } } +} // namespace detail } // namespace parquet } // namespace io } // namespace cudf diff --git a/cpp/src/io/parquet/compact_protocol_reader.hpp b/cpp/src/io/parquet/compact_protocol_reader.hpp index 619815db503..77f8232ab7d 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.hpp +++ b/cpp/src/io/parquet/compact_protocol_reader.hpp @@ -28,6 +28,8 @@ namespace cudf { namespace io { namespace parquet { +namespace detail { + /** * @brief Class for parsing Parquet's Thrift Compact Protocol encoded metadata * @@ -147,6 +149,7 @@ class CompactProtocolReader { friend class parquet_field_struct_blob; }; +} // namespace detail } // namespace parquet } // namespace io } // namespace cudf diff --git a/cpp/src/io/parquet/compact_protocol_writer.cpp b/cpp/src/io/parquet/compact_protocol_writer.cpp index 60bc8984d81..cca0ca83c25 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.cpp +++ b/cpp/src/io/parquet/compact_protocol_writer.cpp @@ -19,6 +19,7 @@ namespace cudf { namespace io { namespace parquet { +namespace detail { /** * @brief Parquet CompactProtocolWriter class @@ -391,6 +392,7 @@ inline void CompactProtocolFieldWriter::set_current_field(int const& field) current_field_value = field; } +} // namespace detail } // namespace parquet } // namespace io } // namespace cudf diff --git a/cpp/src/io/parquet/compact_protocol_writer.hpp b/cpp/src/io/parquet/compact_protocol_writer.hpp index 26d66527aa5..345783e0451 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.hpp +++ b/cpp/src/io/parquet/compact_protocol_writer.hpp @@ -28,6 +28,7 @@ namespace cudf { namespace io { namespace parquet { +namespace detail { /** * @brief Class for parsing Parquet's Thrift Compact Protocol encoded metadata @@ -115,6 +116,7 @@ class CompactProtocolFieldWriter { inline void set_current_field(int const& field); }; +} // namespace detail } // namespace parquet } // namespace io } // namespace cudf diff --git a/cpp/src/io/parquet/decode_preprocess.cu b/cpp/src/io/parquet/decode_preprocess.cu index 8de3702bc2e..6c2e435a1c3 100644 --- a/cpp/src/io/parquet/decode_preprocess.cu +++ b/cpp/src/io/parquet/decode_preprocess.cu @@ -26,7 +26,7 @@ namespace cudf { namespace io { namespace parquet { -namespace gpu { +namespace detail { namespace { @@ -411,7 +411,7 @@ void ComputePageSizes(cudf::detail::hostdevice_vector& pages, } } -} // namespace gpu +} // namespace detail } // namespace parquet } // namespace io } // namespace cudf diff --git a/cpp/src/io/parquet/delta_binary.cuh b/cpp/src/io/parquet/delta_binary.cuh index 2382e4aafdf..a513e6674b4 100644 --- a/cpp/src/io/parquet/delta_binary.cuh +++ b/cpp/src/io/parquet/delta_binary.cuh @@ -18,7 +18,7 @@ #include "page_decode.cuh" -namespace cudf::io::parquet::gpu { +namespace cudf::io::parquet::detail { // DELTA_XXX encoding support // @@ -291,4 +291,4 @@ struct delta_binary_decoder { } }; -} // namespace cudf::io::parquet::gpu +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 230834632dd..436b8177ced 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -26,7 +26,7 @@ namespace cudf { namespace io { namespace parquet { -namespace gpu { +namespace detail { namespace { @@ -624,7 +624,7 @@ uint32_t GetAggregatedDecodeKernelMask(cudf::detail::hostdevice_vector } /** - * @copydoc cudf::io::parquet::gpu::DecodePageData + * @copydoc cudf::io::parquet::detail::DecodePageData */ void __host__ DecodePageData(cudf::detail::hostdevice_vector& pages, cudf::detail::hostdevice_vector const& chunks, @@ -648,7 +648,7 @@ void __host__ DecodePageData(cudf::detail::hostdevice_vector& pages, } } -} // namespace gpu +} // namespace detail } // namespace parquet } // namespace io } // namespace cudf diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index cdc29197eb3..d794e14d98b 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -24,7 +24,7 @@ #include #include -namespace cudf::io::parquet::gpu { +namespace cudf::io::parquet::detail { struct page_state_s { constexpr page_state_s() noexcept {} @@ -1384,4 +1384,4 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s, return true; } -} // namespace cudf::io::parquet::gpu +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_delta_decode.cu b/cpp/src/io/parquet/page_delta_decode.cu index 2b78dead205..d25684a59f3 100644 --- a/cpp/src/io/parquet/page_delta_decode.cu +++ b/cpp/src/io/parquet/page_delta_decode.cu @@ -23,7 +23,7 @@ #include #include -namespace cudf::io::parquet::gpu { +namespace cudf::io::parquet::detail { namespace { @@ -160,7 +160,7 @@ __global__ void __launch_bounds__(96) } // anonymous namespace /** - * @copydoc cudf::io::parquet::gpu::DecodeDeltaBinary + * @copydoc cudf::io::parquet::detail::DecodeDeltaBinary */ void __host__ DecodeDeltaBinary(cudf::detail::hostdevice_vector& pages, cudf::detail::hostdevice_vector const& chunks, @@ -184,4 +184,4 @@ void __host__ DecodeDeltaBinary(cudf::detail::hostdevice_vector& pages } } -} // namespace cudf::io::parquet::gpu +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index fe0dbb85124..533c55ef41a 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -44,7 +44,7 @@ namespace cudf { namespace io { namespace parquet { -namespace gpu { +namespace detail { namespace { @@ -329,7 +329,7 @@ __global__ void __launch_bounds__(128) // blockDim {128,1,1} __global__ void __launch_bounds__(128) gpuInitPages(device_2dspan chunks, - device_span pages, + device_span pages, device_span page_sizes, device_span comp_page_sizes, device_span col_desc, @@ -998,7 +998,7 @@ __device__ auto julian_days_with_time(int64_t v) // blockDim(128, 1, 1) template __global__ void __launch_bounds__(128, 8) - gpuEncodePages(device_span pages, + gpuEncodePages(device_span pages, device_span> comp_in, device_span> comp_out, device_span comp_results, @@ -1988,7 +1988,7 @@ __global__ void __launch_bounds__(128) // blockDim(1024, 1, 1) __global__ void __launch_bounds__(1024) - gpuGatherPages(device_span chunks, device_span pages) + gpuGatherPages(device_span chunks, device_span pages) { __shared__ __align__(8) EncColumnChunk ck_g; __shared__ __align__(8) EncPage page_g; @@ -2265,7 +2265,7 @@ void InitFragmentStatistics(device_span groups, } void InitEncoderPages(device_2dspan chunks, - device_span pages, + device_span pages, device_span page_sizes, device_span comp_page_sizes, device_span col_desc, @@ -2294,7 +2294,7 @@ void InitEncoderPages(device_2dspan chunks, write_v2_headers); } -void EncodePages(device_span pages, +void EncodePages(device_span pages, bool write_v2_headers, device_span> comp_in, device_span> comp_out, @@ -2328,7 +2328,7 @@ void EncodePageHeaders(device_span pages, } void GatherPages(device_span chunks, - device_span pages, + device_span pages, rmm::cuda_stream_view stream) { gpuGatherPages<<>>(chunks, pages); @@ -2343,7 +2343,7 @@ void EncodeColumnIndexes(device_span chunks, chunks, column_stats, column_index_truncate_length); } -} // namespace gpu +} // namespace detail } // namespace parquet } // namespace io } // namespace cudf diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index 6f8b2f50443..839a75c31ff 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -23,7 +23,7 @@ namespace cudf { namespace io { namespace parquet { -namespace gpu { +namespace detail { // Minimal thrift implementation for parsing page headers // https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md @@ -161,8 +161,7 @@ __device__ void skip_struct_field(byte_stream_s* bs, int field_type) * @param chunk Column chunk the page belongs to * @return `kernel_mask_bits` value for the given page */ -__device__ uint32_t kernel_mask_for_page(gpu::PageInfo const& page, - gpu::ColumnChunkDesc const& chunk) +__device__ uint32_t kernel_mask_for_page(PageInfo const& page, ColumnChunkDesc const& chunk) { if (page.flags & PAGEINFO_FLAGS_DICTIONARY) { return 0; } @@ -528,7 +527,7 @@ void __host__ BuildStringDictionaryIndex(ColumnChunkDesc* chunks, gpuBuildStringDictionaryIndex<<>>(chunks, num_chunks); } -} // namespace gpu +} // namespace detail } // namespace parquet } // namespace io } // namespace cudf diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index d79abe4a6d2..cb9461dc9ce 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -23,7 +23,7 @@ namespace cudf { namespace io { namespace parquet { -namespace gpu { +namespace detail { namespace { @@ -757,7 +757,7 @@ __global__ void __launch_bounds__(decode_block_size) } // anonymous namespace /** - * @copydoc cudf::io::parquet::gpu::ComputePageStringSizes + * @copydoc cudf::io::parquet::detail::ComputePageStringSizes */ void ComputePageStringSizes(cudf::detail::hostdevice_vector& pages, cudf::detail::hostdevice_vector const& chunks, @@ -778,7 +778,7 @@ void ComputePageStringSizes(cudf::detail::hostdevice_vector& pages, } /** - * @copydoc cudf::io::parquet::gpu::DecodeStringPageData + * @copydoc cudf::io::parquet::detail::DecodeStringPageData */ void __host__ DecodeStringPageData(cudf::detail::hostdevice_vector& pages, cudf::detail::hostdevice_vector const& chunks, @@ -802,7 +802,7 @@ void __host__ DecodeStringPageData(cudf::detail::hostdevice_vector& pa } } -} // namespace gpu +} // namespace detail } // namespace parquet } // namespace io } // namespace cudf diff --git a/cpp/src/io/parquet/page_string_utils.cuh b/cpp/src/io/parquet/page_string_utils.cuh index 9395599b3ff..a81d0a64466 100644 --- a/cpp/src/io/parquet/page_string_utils.cuh +++ b/cpp/src/io/parquet/page_string_utils.cuh @@ -18,7 +18,7 @@ #include -namespace cudf::io::parquet::gpu { +namespace cudf::io::parquet::detail { // stole this from cudf/strings/detail/gather.cuh. modified to run on a single string on one warp. // copies from src to dst in 16B chunks per thread. @@ -107,4 +107,4 @@ __device__ void block_excl_sum(size_type* arr, size_type length, size_type initi } } -} // namespace cudf::io::parquet::gpu +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index c2affc774c2..5a3bec9a185 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -28,6 +28,8 @@ namespace cudf { namespace io { namespace parquet { +namespace detail { + constexpr uint32_t parquet_magic = (('P' << 0) | ('A' << 8) | ('R' << 16) | ('1' << 24)); /** @@ -405,6 +407,7 @@ static inline int CountLeadingZeros32(uint32_t value) #endif } +} // namespace detail } // namespace parquet } // namespace io } // namespace cudf diff --git a/cpp/src/io/parquet/parquet_common.hpp b/cpp/src/io/parquet/parquet_common.hpp index 5a1716bb547..2ac2c4388f3 100644 --- a/cpp/src/io/parquet/parquet_common.hpp +++ b/cpp/src/io/parquet/parquet_common.hpp @@ -21,6 +21,8 @@ namespace cudf { namespace io { namespace parquet { +namespace detail { + // Max decimal precisions according to the parquet spec: // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#decimal auto constexpr MAX_DECIMAL32_PRECISION = 9; @@ -156,6 +158,7 @@ enum FieldType { ST_FLD_STRUCT = 12, }; +} // namespace detail } // namespace parquet } // namespace io } // namespace cudf diff --git a/cpp/src/io/parquet/parquet_gpu.cuh b/cpp/src/io/parquet/parquet_gpu.cuh index dc74bee1536..10e12ebb782 100644 --- a/cpp/src/io/parquet/parquet_gpu.cuh +++ b/cpp/src/io/parquet/parquet_gpu.cuh @@ -23,7 +23,7 @@ #include -namespace cudf::io::parquet::gpu { +namespace cudf::io::parquet::detail { auto constexpr KEY_SENTINEL = size_type{-1}; auto constexpr VALUE_SENTINEL = size_type{-1}; @@ -81,4 +81,4 @@ inline size_type __device__ row_to_value_idx(size_type idx, return idx; } -} // namespace cudf::io::parquet::gpu +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 51c862b376b..767668cc65e 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -35,7 +35,7 @@ #include -namespace cudf::io::parquet { +namespace cudf::io::parquet::detail { using cudf::io::detail::string_index_pair; @@ -88,8 +88,6 @@ struct input_column_info { auto nesting_depth() const { return nesting.size(); } }; -namespace gpu { - /** * @brief Enums for the flags in the page header */ @@ -347,7 +345,7 @@ struct file_intermediate_data { // all chunks from the selected row groups. We may end up reading these chunks progressively // instead of all at once - std::vector chunks{}; + std::vector chunks{}; // skip_rows/num_rows values for the entire file. these need to be adjusted per-pass because we // may not be visiting every row group that contains these bounds @@ -372,16 +370,16 @@ struct pass_intermediate_data { // rowgroup, chunk and page information for the current pass. std::vector row_groups{}; - cudf::detail::hostdevice_vector chunks{}; - cudf::detail::hostdevice_vector pages_info{}; - cudf::detail::hostdevice_vector page_nesting_info{}; - cudf::detail::hostdevice_vector page_nesting_decode_info{}; + cudf::detail::hostdevice_vector chunks{}; + cudf::detail::hostdevice_vector pages_info{}; + cudf::detail::hostdevice_vector page_nesting_info{}; + cudf::detail::hostdevice_vector page_nesting_decode_info{}; rmm::device_uvector page_keys{0, rmm::cuda_stream_default}; rmm::device_uvector page_index{0, rmm::cuda_stream_default}; rmm::device_uvector str_dict_index{0, rmm::cuda_stream_default}; - std::vector output_chunk_read_info; + std::vector output_chunk_read_info; std::size_t current_output_chunk{0}; rmm::device_buffer level_decode_data{}; @@ -739,7 +737,7 @@ void initialize_chunk_hash_maps(device_span chunks, rmm::cuda_st * @param frags Column fragments * @param stream CUDA stream to use */ -void populate_chunk_hash_maps(cudf::detail::device_2dspan frags, +void populate_chunk_hash_maps(cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream); /** @@ -762,7 +760,7 @@ void collect_map_entries(device_span chunks, rmm::cuda_stream_vi * @param frags Column fragments * @param stream CUDA stream to use */ -void get_dictionary_indices(cudf::detail::device_2dspan frags, +void get_dictionary_indices(cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream); /** @@ -781,7 +779,7 @@ void get_dictionary_indices(cudf::detail::device_2dspan * @param[in] stream CUDA stream to use */ void InitEncoderPages(cudf::detail::device_2dspan chunks, - device_span pages, + device_span pages, device_span page_sizes, device_span comp_page_sizes, device_span col_desc, @@ -847,7 +845,7 @@ void EncodePageHeaders(device_span pages, * @param[in] stream CUDA stream to use */ void GatherPages(device_span chunks, - device_span pages, + device_span pages, rmm::cuda_stream_view stream); /** @@ -863,5 +861,4 @@ void EncodeColumnIndexes(device_span chunks, int32_t column_index_truncate_length, rmm::cuda_stream_view stream); -} // namespace gpu -} // namespace cudf::io::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp index 805d082c71e..ceb4c660dbc 100644 --- a/cpp/src/io/parquet/predicate_pushdown.cpp +++ b/cpp/src/io/parquet/predicate_pushdown.cpp @@ -35,7 +35,7 @@ #include #include -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { namespace { /** @@ -62,13 +62,17 @@ struct stats_caster { // uses storage type as T template () or cudf::is_nested())> - static T convert(uint8_t const* stats_val, size_t stats_size, cudf::io::parquet::Type const type) + static T convert(uint8_t const* stats_val, + size_t stats_size, + cudf::io::parquet::detail::Type const type) { CUDF_FAIL("unsupported type for stats casting"); } template ())> - static T convert(uint8_t const* stats_val, size_t stats_size, cudf::io::parquet::Type const type) + static T convert(uint8_t const* stats_val, + size_t stats_size, + cudf::io::parquet::detail::Type const type) { CUDF_EXPECTS(type == BOOLEAN, "Invalid type and stats combination"); return targetType(*reinterpret_cast(stats_val)); @@ -78,7 +82,9 @@ struct stats_caster { template () and !cudf::is_boolean()) or cudf::is_fixed_point() or cudf::is_chrono())> - static T convert(uint8_t const* stats_val, size_t stats_size, cudf::io::parquet::Type const type) + static T convert(uint8_t const* stats_val, + size_t stats_size, + cudf::io::parquet::detail::Type const type) { switch (type) { case INT32: return targetType(*reinterpret_cast(stats_val)); @@ -103,7 +109,9 @@ struct stats_caster { } template ())> - static T convert(uint8_t const* stats_val, size_t stats_size, cudf::io::parquet::Type const type) + static T convert(uint8_t const* stats_val, + size_t stats_size, + cudf::io::parquet::detail::Type const type) { switch (type) { case FLOAT: return targetType(*reinterpret_cast(stats_val)); @@ -113,7 +121,9 @@ struct stats_caster { } template )> - static T convert(uint8_t const* stats_val, size_t stats_size, cudf::io::parquet::Type const type) + static T convert(uint8_t const* stats_val, + size_t stats_size, + cudf::io::parquet::detail::Type const type) { switch (type) { case BYTE_ARRAY: [[fallthrough]]; @@ -527,4 +537,4 @@ named_to_reference_converter::visit_operands( return transformed_operands; } -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader.cpp b/cpp/src/io/parquet/reader.cpp index 1e87447006d..17d7c07bc91 100644 --- a/cpp/src/io/parquet/reader.cpp +++ b/cpp/src/io/parquet/reader.cpp @@ -16,7 +16,7 @@ #include "reader_impl.hpp" -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { reader::reader() = default; @@ -59,4 +59,4 @@ bool chunked_reader::has_next() const { return _impl->has_next(); } table_with_metadata chunked_reader::read_chunk() const { return _impl->read_chunk(); } -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index ea40f29a070..34aa4f2201f 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -25,7 +25,7 @@ #include #include -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) { @@ -38,7 +38,7 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) CUDF_EXPECTS(pages.size() > 0, "There is no page to decode"); size_t const sum_max_depths = std::accumulate( - chunks.begin(), chunks.end(), 0, [&](size_t cursum, gpu::ColumnChunkDesc const& chunk) { + chunks.begin(), chunks.end(), 0, [&](size_t cursum, ColumnChunkDesc const& chunk) { return cursum + _metadata->get_output_nesting_depth(chunk.src_col_schema); }); @@ -51,10 +51,10 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) // doing a gather operation later on. // TODO: This step is somewhat redundant if size info has already been calculated (nested schema, // chunked reader). - auto const has_strings = (kernel_mask & gpu::KERNEL_MASK_STRING) != 0; + auto const has_strings = (kernel_mask & KERNEL_MASK_STRING) != 0; std::vector col_sizes(_input_columns.size(), 0L); if (has_strings) { - gpu::ComputePageStringSizes( + ComputePageStringSizes( pages, chunks, skip_rows, num_rows, _pass_itm_data->level_type_size, _stream); col_sizes = calculate_page_string_offsets(); @@ -176,19 +176,19 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) if (has_strings) { auto& stream = streams[s_idx++]; chunk_nested_str_data.host_to_device_async(stream); - gpu::DecodeStringPageData( + DecodeStringPageData( pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), stream); } // launch delta binary decoder - if ((kernel_mask & gpu::KERNEL_MASK_DELTA_BINARY) != 0) { - gpu::DecodeDeltaBinary( + if ((kernel_mask & KERNEL_MASK_DELTA_BINARY) != 0) { + DecodeDeltaBinary( pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), streams[s_idx++]); } // launch the catch-all page decoder - if ((kernel_mask & gpu::KERNEL_MASK_GENERAL) != 0) { - gpu::DecodePageData( + if ((kernel_mask & KERNEL_MASK_GENERAL) != 0) { + DecodePageData( pages, chunks, num_rows, skip_rows, level_type_size, error_code.data(), streams[s_idx++]); } @@ -248,13 +248,13 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) // update null counts in the final column buffers for (size_t idx = 0; idx < pages.size(); idx++) { - gpu::PageInfo* pi = &pages[idx]; - if (pi->flags & gpu::PAGEINFO_FLAGS_DICTIONARY) { continue; } - gpu::ColumnChunkDesc* col = &chunks[pi->chunk_idx]; + PageInfo* pi = &pages[idx]; + if (pi->flags & PAGEINFO_FLAGS_DICTIONARY) { continue; } + ColumnChunkDesc* col = &chunks[pi->chunk_idx]; input_column_info const& input_col = _input_columns[col->src_col_index]; - int index = pi->nesting_decode - page_nesting_decode.device_ptr(); - gpu::PageNestingDecodeInfo* pndi = &page_nesting_decode[index]; + int index = pi->nesting_decode - page_nesting_decode.device_ptr(); + PageNestingDecodeInfo* pndi = &page_nesting_decode[index]; auto* cols = &_output_buffers; for (size_t l_idx = 0; l_idx < input_col.nesting_depth(); l_idx++) { @@ -320,7 +320,7 @@ reader::impl::impl(std::size_t chunk_read_limit, // Save the states of the output buffers for reuse in `chunk_read()`. for (auto const& buff : _output_buffers) { - _output_buffers_template.emplace_back(inline_column_buffer::empty_like(buff)); + _output_buffers_template.emplace_back(cudf::io::detail::inline_column_buffer::empty_like(buff)); } } @@ -368,7 +368,7 @@ void reader::impl::prepare_data(int64_t skip_rows, // always create the pass struct, even if we end up with no passes. // this will also cause the previous pass information to be deleted - _pass_itm_data = std::make_unique(); + _pass_itm_data = std::make_unique(); if (_file_itm_data.global_num_rows > 0 && not _file_itm_data.row_groups.empty() && not _input_columns.empty() && _current_input_pass < num_passes) { @@ -521,7 +521,7 @@ table_with_metadata reader::impl::read_chunk() if (_chunk_count > 0) { _output_buffers.resize(0); for (auto const& buff : _output_buffers_template) { - _output_buffers.emplace_back(inline_column_buffer::empty_like(buff)); + _output_buffers.emplace_back(cudf::io::detail::inline_column_buffer::empty_like(buff)); } } @@ -571,4 +571,4 @@ parquet_metadata read_parquet_metadata(host_span con metadata.get_key_value_metadata()[0]}; } -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index 9445e4d1648..03990f1a1f3 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -35,7 +35,7 @@ #include #include -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { /** * @brief Implementation for Parquet reader @@ -261,10 +261,10 @@ class reader::impl { std::vector _input_columns; // Buffers for generating output columns - std::vector _output_buffers; + std::vector _output_buffers; // Buffers copied from `_output_buffers` after construction for reuse - std::vector _output_buffers_template; + std::vector _output_buffers_template; // _output_buffers associated schema indices std::vector _output_column_schemas; @@ -285,8 +285,8 @@ class reader::impl { // Within a pass, we produce one or more chunks of output, whose maximum total // byte size is controlled by _output_chunk_read_limit. - cudf::io::parquet::gpu::file_intermediate_data _file_itm_data; - std::unique_ptr _pass_itm_data; + cudf::io::parquet::detail::file_intermediate_data _file_itm_data; + std::unique_ptr _pass_itm_data; // an array of offsets into _file_itm_data::global_chunks. Each pair of offsets represents // the start/end of the chunks to be loaded for a given pass. @@ -301,4 +301,4 @@ class reader::impl { bool _file_preprocessed{false}; }; -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu new file mode 100644 index 00000000000..2c1521e46db --- /dev/null +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -0,0 +1,597 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "reader_impl.hpp" + +#include +#include + +#include + +#include + +#include +#include +#include +#include + +namespace cudf::io::parquet::detail { + +namespace { + +struct cumulative_row_info { + size_t row_count; // cumulative row count + size_t size_bytes; // cumulative size in bytes + int key; // schema index +}; + +#if defined(PREPROCESS_DEBUG) +void print_cumulative_page_info(cudf::detail::hostdevice_vector& pages, + rmm::device_uvector const& page_index, + rmm::device_uvector const& c_info, + rmm::cuda_stream_view stream) +{ + pages.device_to_host_sync(stream); + + printf("------------\nCumulative sizes by page\n"); + + std::vector schemas(pages.size()); + std::vector h_page_index(pages.size()); + CUDF_CUDA_TRY(cudaMemcpy( + h_page_index.data(), page_index.data(), sizeof(int) * pages.size(), cudaMemcpyDefault)); + std::vector h_cinfo(pages.size()); + CUDF_CUDA_TRY(cudaMemcpy( + h_cinfo.data(), c_info.data(), sizeof(cumulative_row_info) * pages.size(), cudaMemcpyDefault)); + auto schema_iter = cudf::detail::make_counting_transform_iterator( + 0, [&](size_type i) { return pages[h_page_index[i]].src_col_schema; }); + thrust::copy(thrust::seq, schema_iter, schema_iter + pages.size(), schemas.begin()); + auto last = thrust::unique(thrust::seq, schemas.begin(), schemas.end()); + schemas.resize(last - schemas.begin()); + printf("Num schemas: %lu\n", schemas.size()); + + for (size_t idx = 0; idx < schemas.size(); idx++) { + printf("Schema %d\n", schemas[idx]); + for (size_t pidx = 0; pidx < pages.size(); pidx++) { + auto const& page = pages[h_page_index[pidx]]; + if (page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY || page.src_col_schema != schemas[idx]) { + continue; + } + printf("\tP: {%lu, %lu}\n", h_cinfo[pidx].row_count, h_cinfo[pidx].size_bytes); + } + } +} + +void print_cumulative_row_info( + host_span sizes, + std::string const& label, + std::optional> splits = std::nullopt) +{ + if (splits.has_value()) { + printf("------------\nSplits\n"); + for (size_t idx = 0; idx < splits->size(); idx++) { + printf("{%lu, %lu}\n", splits.value()[idx].skip_rows, splits.value()[idx].num_rows); + } + } + + printf("------------\nCumulative sizes %s\n", label.c_str()); + for (size_t idx = 0; idx < sizes.size(); idx++) { + printf("{%lu, %lu, %d}", sizes[idx].row_count, sizes[idx].size_bytes, sizes[idx].key); + if (splits.has_value()) { + // if we have a split at this row count and this is the last instance of this row count + auto start = thrust::make_transform_iterator( + splits->begin(), [](gpu::chunk_read_info const& i) { return i.skip_rows; }); + auto end = start + splits->size(); + auto split = std::find(start, end, sizes[idx].row_count); + auto const split_index = [&]() -> int { + if (split != end && + ((idx == sizes.size() - 1) || (sizes[idx + 1].row_count > sizes[idx].row_count))) { + return static_cast(std::distance(start, split)); + } + return idx == 0 ? 0 : -1; + }(); + if (split_index >= 0) { + printf(" <-- split {%lu, %lu}", + splits.value()[split_index].skip_rows, + splits.value()[split_index].num_rows); + } + } + printf("\n"); + } +} +#endif // PREPROCESS_DEBUG + +/** + * @brief Functor which reduces two cumulative_row_info structs of the same key. + */ +struct cumulative_row_sum { + cumulative_row_info operator() + __device__(cumulative_row_info const& a, cumulative_row_info const& b) const + { + return cumulative_row_info{a.row_count + b.row_count, a.size_bytes + b.size_bytes, a.key}; + } +}; + +/** + * @brief Functor which computes the total data size for a given type of cudf column. + * + * In the case of strings, the return size does not include the chars themselves. That + * information is tracked separately (see PageInfo::str_bytes). + */ +struct row_size_functor { + __device__ size_t validity_size(size_t num_rows, bool nullable) + { + return nullable ? (cudf::util::div_rounding_up_safe(num_rows, size_t{32}) * 4) : 0; + } + + template + __device__ size_t operator()(size_t num_rows, bool nullable) + { + auto const element_size = sizeof(device_storage_type_t); + return (element_size * num_rows) + validity_size(num_rows, nullable); + } +}; + +template <> +__device__ size_t row_size_functor::operator()(size_t num_rows, bool nullable) +{ + auto const offset_size = sizeof(size_type); + // NOTE: Adding the + 1 offset here isn't strictly correct. There will only be 1 extra offset + // for the entire column, whereas this is adding an extra offset per page. So we will get a + // small over-estimate of the real size of the order : # of pages * 4 bytes. It seems better + // to overestimate size somewhat than to underestimate it and potentially generate chunks + // that are too large. + return (offset_size * (num_rows + 1)) + validity_size(num_rows, nullable); +} + +template <> +__device__ size_t row_size_functor::operator()(size_t num_rows, bool nullable) +{ + return validity_size(num_rows, nullable); +} + +template <> +__device__ size_t row_size_functor::operator()(size_t num_rows, bool nullable) +{ + // only returns the size of offsets and validity. the size of the actual string chars + // is tracked separately. + auto const offset_size = sizeof(size_type); + // see note about offsets in the list_view template. + return (offset_size * (num_rows + 1)) + validity_size(num_rows, nullable); +} + +/** + * @brief Functor which computes the total output cudf data size for all of + * the data in this page. + * + * Sums across all nesting levels. + */ +struct get_cumulative_row_info { + gpu::PageInfo const* const pages; + + __device__ cumulative_row_info operator()(size_type index) + { + auto const& page = pages[index]; + if (page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) { + return cumulative_row_info{0, 0, page.src_col_schema}; + } + + // total nested size, not counting string data + auto iter = + cudf::detail::make_counting_transform_iterator(0, [page, index] __device__(size_type i) { + auto const& pni = page.nesting[i]; + return cudf::type_dispatcher( + data_type{pni.type}, row_size_functor{}, pni.size, pni.nullable); + }); + + size_t const row_count = static_cast(page.nesting[0].size); + return { + row_count, + thrust::reduce(thrust::seq, iter, iter + page.num_output_nesting_levels) + page.str_bytes, + page.src_col_schema}; + } +}; + +/** + * @brief Functor which computes the effective size of all input columns by page. + * + * For a given row, we want to find the cost of all pages for all columns involved + * in loading up to that row. The complication here is that not all pages are the + * same size between columns. Example: + * + * page row counts + * Column A: 0 <----> 100 <----> 200 + * Column B: 0 <---------------> 200 <--------> 400 + | + * if we decide to split at row 100, we don't really know the actual amount of bytes in column B + * at that point. So we have to proceed as if we are taking the bytes from all 200 rows of that + * page. Essentially, a conservative over-estimate of the real size. + */ +struct row_total_size { + cumulative_row_info const* c_info; + size_type const* key_offsets; + size_t num_keys; + + __device__ cumulative_row_info operator()(cumulative_row_info const& i) + { + // sum sizes for each input column at this row + size_t sum = 0; + for (int idx = 0; idx < num_keys; idx++) { + auto const start = key_offsets[idx]; + auto const end = key_offsets[idx + 1]; + auto iter = cudf::detail::make_counting_transform_iterator( + 0, [&] __device__(size_type i) { return c_info[i].row_count; }); + auto const page_index = + thrust::lower_bound(thrust::seq, iter + start, iter + end, i.row_count) - iter; + sum += c_info[page_index].size_bytes; + } + return {i.row_count, sum, i.key}; + } +}; + +/** + * @brief Given a vector of cumulative {row_count, byte_size} pairs and a chunk read + * limit, determine the set of splits. + * + * @param sizes Vector of cumulative {row_count, byte_size} pairs + * @param num_rows Total number of rows to read + * @param chunk_read_limit Limit on total number of bytes to be returned per read, for all columns + */ +std::vector find_splits(std::vector const& sizes, + size_t num_rows, + size_t chunk_read_limit) +{ + // now we have an array of {row_count, real output bytes}. just walk through it and generate + // splits. + // TODO: come up with a clever way to do this entirely in parallel. For now, as long as batch + // sizes are reasonably large, this shouldn't iterate too many times + std::vector splits; + { + size_t cur_pos = 0; + size_t cur_cumulative_size = 0; + size_t cur_row_count = 0; + auto start = thrust::make_transform_iterator(sizes.begin(), [&](cumulative_row_info const& i) { + return i.size_bytes - cur_cumulative_size; + }); + auto end = start + sizes.size(); + while (cur_row_count < num_rows) { + int64_t split_pos = + thrust::lower_bound(thrust::seq, start + cur_pos, end, chunk_read_limit) - start; + + // if we're past the end, or if the returned bucket is > than the chunk_read_limit, move back + // one. + if (static_cast(split_pos) >= sizes.size() || + (sizes[split_pos].size_bytes - cur_cumulative_size > chunk_read_limit)) { + split_pos--; + } + + // best-try. if we can't find something that'll fit, we have to go bigger. we're doing this in + // a loop because all of the cumulative sizes for all the pages are sorted into one big list. + // so if we had two columns, both of which had an entry {1000, 10000}, that entry would be in + // the list twice. so we have to iterate until we skip past all of them. The idea is that we + // either do this, or we have to call unique() on the input first. + while (split_pos < (static_cast(sizes.size()) - 1) && + (split_pos < 0 || sizes[split_pos].row_count == cur_row_count)) { + split_pos++; + } + + auto const start_row = cur_row_count; + cur_row_count = sizes[split_pos].row_count; + splits.push_back(gpu::chunk_read_info{start_row, cur_row_count - start_row}); + cur_pos = split_pos; + cur_cumulative_size = sizes[split_pos].size_bytes; + } + } + // print_cumulative_row_info(sizes, "adjusted", splits); + + return splits; +} + +/** + * @brief Converts cuDF units to Parquet units. + * + * @return A tuple of Parquet type width, Parquet clock rate and Parquet decimal type. + */ +[[nodiscard]] std::tuple conversion_info(type_id column_type_id, + type_id timestamp_type_id, + parquet::Type physical, + int8_t converted, + int32_t length) +{ + int32_t type_width = (physical == parquet::FIXED_LEN_BYTE_ARRAY) ? length : 0; + int32_t clock_rate = 0; + if (column_type_id == type_id::INT8 or column_type_id == type_id::UINT8) { + type_width = 1; // I32 -> I8 + } else if (column_type_id == type_id::INT16 or column_type_id == type_id::UINT16) { + type_width = 2; // I32 -> I16 + } else if (column_type_id == type_id::INT32) { + type_width = 4; // str -> hash32 + } else if (is_chrono(data_type{column_type_id})) { + clock_rate = to_clockrate(timestamp_type_id); + } + + int8_t converted_type = converted; + if (converted_type == parquet::DECIMAL && column_type_id != type_id::FLOAT64 && + not cudf::is_fixed_point(data_type{column_type_id})) { + converted_type = parquet::UNKNOWN; // Not converting to float64 or decimal + } + return std::make_tuple(type_width, clock_rate, converted_type); +} + +/** + * @brief Return the required number of bits to store a value. + */ +template +[[nodiscard]] T required_bits(uint32_t max_level) +{ + return static_cast(CompactProtocolReader::NumRequiredBits(max_level)); +} + +struct row_count_compare { + __device__ bool operator()(cumulative_row_info const& a, cumulative_row_info const& b) + { + return a.row_count < b.row_count; + } +}; + +} // anonymous namespace + +void reader::impl::create_global_chunk_info() +{ + auto const num_rows = _file_itm_data.global_num_rows; + auto const& row_groups_info = _file_itm_data.row_groups; + auto& chunks = _file_itm_data.chunks; + + // Descriptors for all the chunks that make up the selected columns + auto const num_input_columns = _input_columns.size(); + auto const num_chunks = row_groups_info.size() * num_input_columns; + + // Initialize column chunk information + auto remaining_rows = num_rows; + for (auto const& rg : row_groups_info) { + auto const& row_group = _metadata->get_row_group(rg.index, rg.source_index); + auto const row_group_start = rg.start_row; + auto const row_group_rows = std::min(remaining_rows, row_group.num_rows); + + // generate ColumnChunkDesc objects for everything to be decoded (all input columns) + for (size_t i = 0; i < num_input_columns; ++i) { + auto col = _input_columns[i]; + // look up metadata + auto& col_meta = _metadata->get_column_metadata(rg.index, rg.source_index, col.schema_idx); + auto& schema = _metadata->get_schema(col.schema_idx); + + auto [type_width, clock_rate, converted_type] = + conversion_info(to_type_id(schema, _strings_to_categorical, _timestamp_type.id()), + _timestamp_type.id(), + schema.type, + schema.converted_type, + schema.type_length); + + chunks.push_back(gpu::ColumnChunkDesc(col_meta.total_compressed_size, + nullptr, + col_meta.num_values, + schema.type, + type_width, + row_group_start, + row_group_rows, + schema.max_definition_level, + schema.max_repetition_level, + _metadata->get_output_nesting_depth(col.schema_idx), + required_bits(schema.max_definition_level), + required_bits(schema.max_repetition_level), + col_meta.codec, + converted_type, + schema.logical_type, + schema.decimal_precision, + clock_rate, + i, + col.schema_idx)); + } + + remaining_rows -= row_group_rows; + } +} + +void reader::impl::compute_input_passes() +{ + // at this point, row_groups has already been filtered down to just the row groups we need to + // handle optional skip_rows/num_rows parameters. + auto const& row_groups_info = _file_itm_data.row_groups; + + // if the user hasn't specified an input size limit, read everything in a single pass. + if (_input_pass_read_limit == 0) { + _file_itm_data.input_pass_row_group_offsets.push_back(0); + _file_itm_data.input_pass_row_group_offsets.push_back(row_groups_info.size()); + return; + } + + // generate passes. make sure to account for the case where a single row group doesn't fit within + // + std::size_t const read_limit = + _input_pass_read_limit > 0 ? _input_pass_read_limit : std::numeric_limits::max(); + std::size_t cur_pass_byte_size = 0; + std::size_t cur_rg_start = 0; + std::size_t cur_row_count = 0; + _file_itm_data.input_pass_row_group_offsets.push_back(0); + _file_itm_data.input_pass_row_count.push_back(0); + + for (size_t cur_rg_index = 0; cur_rg_index < row_groups_info.size(); cur_rg_index++) { + auto const& rgi = row_groups_info[cur_rg_index]; + auto const& row_group = _metadata->get_row_group(rgi.index, rgi.source_index); + + // can we add this row group + if (cur_pass_byte_size + row_group.total_byte_size >= read_limit) { + // A single row group (the current one) is larger than the read limit: + // We always need to include at least one row group, so end the pass at the end of the current + // row group + if (cur_rg_start == cur_rg_index) { + _file_itm_data.input_pass_row_group_offsets.push_back(cur_rg_index + 1); + _file_itm_data.input_pass_row_count.push_back(cur_row_count + row_group.num_rows); + cur_rg_start = cur_rg_index + 1; + cur_pass_byte_size = 0; + } + // End the pass at the end of the previous row group + else { + _file_itm_data.input_pass_row_group_offsets.push_back(cur_rg_index); + _file_itm_data.input_pass_row_count.push_back(cur_row_count); + cur_rg_start = cur_rg_index; + cur_pass_byte_size = row_group.total_byte_size; + } + } else { + cur_pass_byte_size += row_group.total_byte_size; + } + cur_row_count += row_group.num_rows; + } + // add the last pass if necessary + if (_file_itm_data.input_pass_row_group_offsets.back() != row_groups_info.size()) { + _file_itm_data.input_pass_row_group_offsets.push_back(row_groups_info.size()); + _file_itm_data.input_pass_row_count.push_back(cur_row_count); + } +} + +void reader::impl::setup_next_pass() +{ + // this will also cause the previous pass information to be deleted + _pass_itm_data = std::make_unique(); + + // setup row groups to be loaded for this pass + auto const row_group_start = _file_itm_data.input_pass_row_group_offsets[_current_input_pass]; + auto const row_group_end = _file_itm_data.input_pass_row_group_offsets[_current_input_pass + 1]; + auto const num_row_groups = row_group_end - row_group_start; + _pass_itm_data->row_groups.resize(num_row_groups); + std::copy(_file_itm_data.row_groups.begin() + row_group_start, + _file_itm_data.row_groups.begin() + row_group_end, + _pass_itm_data->row_groups.begin()); + + auto const num_passes = _file_itm_data.input_pass_row_group_offsets.size() - 1; + CUDF_EXPECTS(_current_input_pass < num_passes, "Encountered an invalid read pass index"); + + auto const chunks_per_rowgroup = _input_columns.size(); + auto const num_chunks = chunks_per_rowgroup * num_row_groups; + + auto chunk_start = _file_itm_data.chunks.begin() + (row_group_start * chunks_per_rowgroup); + auto chunk_end = _file_itm_data.chunks.begin() + (row_group_end * chunks_per_rowgroup); + + _pass_itm_data->chunks = + cudf::detail::hostdevice_vector(num_chunks, _stream); + std::copy(chunk_start, chunk_end, _pass_itm_data->chunks.begin()); + + // adjust skip_rows and num_rows by what's available in the row groups we are processing + if (num_passes == 1) { + _pass_itm_data->skip_rows = _file_itm_data.global_skip_rows; + _pass_itm_data->num_rows = _file_itm_data.global_num_rows; + } else { + auto const global_start_row = _file_itm_data.global_skip_rows; + auto const global_end_row = global_start_row + _file_itm_data.global_num_rows; + auto const start_row = std::max(_file_itm_data.input_pass_row_count[_current_input_pass], global_start_row); + auto const end_row = std::min(_file_itm_data.input_pass_row_count[_current_input_pass + 1], global_end_row); + + // skip_rows is always global in the sense that it is relative to the first row of + // everything we will be reading, regardless of what pass we are on. + // num_rows is how many rows we are reading this pass. + _pass_itm_data->skip_rows = global_start_row + _file_itm_data.input_pass_row_count[_current_input_pass]; + _pass_itm_data->num_rows = end_row - start_row; + } +} + +void reader::impl::compute_splits_for_pass() +{ + auto const skip_rows = _pass_itm_data->skip_rows; + auto const num_rows = _pass_itm_data->num_rows; + + // simple case : no chunk size, no splits + if(_output_chunk_read_limit <= 0){ + _pass_itm_data->output_chunk_read_info = std::vector{{skip_rows, num_rows}}; + return; + } + + auto& pages = _pass_itm_data->pages_info; + + auto const& page_keys = _pass_itm_data->page_keys; + auto const& page_index = _pass_itm_data->page_index; + + // generate cumulative row counts and sizes + rmm::device_uvector c_info(page_keys.size(), _stream); + // convert PageInfo to cumulative_row_info + auto page_input = thrust::make_transform_iterator(page_index.begin(), + get_cumulative_row_info{pages.device_ptr()}); + thrust::inclusive_scan_by_key(rmm::exec_policy(_stream), + page_keys.begin(), + page_keys.end(), + page_input, + c_info.begin(), + thrust::equal_to{}, + cumulative_row_sum{}); + // print_cumulative_page_info(pages, page_index, c_info, stream); + + // sort by row count + rmm::device_uvector c_info_sorted{c_info, _stream}; + thrust::sort(rmm::exec_policy(_stream), + c_info_sorted.begin(), + c_info_sorted.end(), + row_count_compare{}); + + // std::vector h_c_info_sorted(c_info_sorted.size()); + // CUDF_CUDA_TRY(cudaMemcpy(h_c_info_sorted.data(), + // c_info_sorted.data(), + // sizeof(cumulative_row_info) * c_info_sorted.size(), + // cudaMemcpyDefault)); + // print_cumulative_row_info(h_c_info_sorted, "raw"); + + // generate key offsets (offsets to the start of each partition of keys). worst case is 1 page per + // key + rmm::device_uvector key_offsets(page_keys.size() + 1, _stream); + auto const key_offsets_end = thrust::reduce_by_key(rmm::exec_policy(_stream), + page_keys.begin(), + page_keys.end(), + thrust::make_constant_iterator(1), + thrust::make_discard_iterator(), + key_offsets.begin()) + .second; + size_t const num_unique_keys = key_offsets_end - key_offsets.begin(); + thrust::exclusive_scan( + rmm::exec_policy(_stream), key_offsets.begin(), key_offsets.end(), key_offsets.begin()); + + // adjust the cumulative info such that for each row count, the size includes any pages that span + // that row count. this is so that if we have this case: + // page row counts + // Column A: 0 <----> 100 <----> 200 + // Column B: 0 <---------------> 200 <--------> 400 + // | + // if we decide to split at row 100, we don't really know the actual amount of bytes in column B + // at that point. So we have to proceed as if we are taking the bytes from all 200 rows of that + // page. + // + rmm::device_uvector aggregated_info(c_info.size(), _stream); + thrust::transform(rmm::exec_policy(_stream), + c_info_sorted.begin(), + c_info_sorted.end(), + aggregated_info.begin(), + row_total_size{c_info.data(), key_offsets.data(), num_unique_keys}); + + // bring back to the cpu + std::vector h_aggregated_info(aggregated_info.size()); + CUDF_CUDA_TRY(cudaMemcpyAsync(h_aggregated_info.data(), + aggregated_info.data(), + sizeof(cumulative_row_info) * c_info.size(), + cudaMemcpyDefault, + _stream.value())); + _stream.synchronize(); + + // generate the actual splits + _pass_itm_data->output_chunk_read_info = find_splits(h_aggregated_info, num_rows, _output_chunk_read_limit); +} + +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index fcaa610fbb7..05158c3d299 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -21,34 +21,34 @@ #include #include -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { namespace { ConvertedType logical_type_to_converted_type(LogicalType const& logical) { if (logical.isset.STRING) { - return parquet::UTF8; + return UTF8; } else if (logical.isset.MAP) { - return parquet::MAP; + return MAP; } else if (logical.isset.LIST) { - return parquet::LIST; + return LIST; } else if (logical.isset.ENUM) { - return parquet::ENUM; + return ENUM; } else if (logical.isset.DECIMAL) { - return parquet::DECIMAL; // TODO set decimal values + return DECIMAL; // TODO set decimal values } else if (logical.isset.DATE) { - return parquet::DATE; + return DATE; } else if (logical.isset.TIME) { if (logical.TIME.unit.isset.MILLIS) - return parquet::TIME_MILLIS; + return TIME_MILLIS; else if (logical.TIME.unit.isset.MICROS) - return parquet::TIME_MICROS; + return TIME_MICROS; } else if (logical.isset.TIMESTAMP) { if (logical.TIMESTAMP.unit.isset.MILLIS) - return parquet::TIMESTAMP_MILLIS; + return TIMESTAMP_MILLIS; else if (logical.TIMESTAMP.unit.isset.MICROS) - return parquet::TIMESTAMP_MICROS; + return TIMESTAMP_MICROS; } else if (logical.isset.INTEGER) { switch (logical.INTEGER.bitWidth) { case 8: return logical.INTEGER.isSigned ? INT_8 : UINT_8; @@ -58,13 +58,13 @@ ConvertedType logical_type_to_converted_type(LogicalType const& logical) default: break; } } else if (logical.isset.UNKNOWN) { - return parquet::NA; + return NA; } else if (logical.isset.JSON) { - return parquet::JSON; + return JSON; } else if (logical.isset.BSON) { - return parquet::BSON; + return BSON; } - return parquet::UNKNOWN; + return UNKNOWN; } } // namespace @@ -76,39 +76,39 @@ type_id to_type_id(SchemaElement const& schema, bool strings_to_categorical, type_id timestamp_type_id) { - parquet::Type const physical = schema.type; - parquet::LogicalType const logical_type = schema.logical_type; - parquet::ConvertedType converted_type = schema.converted_type; - int32_t decimal_precision = schema.decimal_precision; + Type const physical = schema.type; + LogicalType const logical_type = schema.logical_type; + ConvertedType converted_type = schema.converted_type; + int32_t decimal_precision = schema.decimal_precision; // Logical type used for actual data interpretation; the legacy converted type // is superseded by 'logical' type whenever available. auto const inferred_converted_type = logical_type_to_converted_type(logical_type); - if (inferred_converted_type != parquet::UNKNOWN) { converted_type = inferred_converted_type; } - if (inferred_converted_type == parquet::DECIMAL) { + if (inferred_converted_type != UNKNOWN) { converted_type = inferred_converted_type; } + if (inferred_converted_type == DECIMAL) { decimal_precision = schema.logical_type.DECIMAL.precision; } switch (converted_type) { - case parquet::UINT_8: return type_id::UINT8; - case parquet::INT_8: return type_id::INT8; - case parquet::UINT_16: return type_id::UINT16; - case parquet::INT_16: return type_id::INT16; - case parquet::UINT_32: return type_id::UINT32; - case parquet::UINT_64: return type_id::UINT64; - case parquet::DATE: return type_id::TIMESTAMP_DAYS; - case parquet::TIME_MILLIS: return type_id::DURATION_MILLISECONDS; - case parquet::TIME_MICROS: return type_id::DURATION_MICROSECONDS; - case parquet::TIMESTAMP_MILLIS: + case UINT_8: return type_id::UINT8; + case INT_8: return type_id::INT8; + case UINT_16: return type_id::UINT16; + case INT_16: return type_id::INT16; + case UINT_32: return type_id::UINT32; + case UINT_64: return type_id::UINT64; + case DATE: return type_id::TIMESTAMP_DAYS; + case TIME_MILLIS: return type_id::DURATION_MILLISECONDS; + case TIME_MICROS: return type_id::DURATION_MICROSECONDS; + case TIMESTAMP_MILLIS: return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id : type_id::TIMESTAMP_MILLISECONDS; - case parquet::TIMESTAMP_MICROS: + case TIMESTAMP_MICROS: return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id : type_id::TIMESTAMP_MICROSECONDS; - case parquet::DECIMAL: - if (physical == parquet::INT32) { return type_id::DECIMAL32; } - if (physical == parquet::INT64) { return type_id::DECIMAL64; } - if (physical == parquet::FIXED_LEN_BYTE_ARRAY) { + case DECIMAL: + if (physical == INT32) { return type_id::DECIMAL32; } + if (physical == INT64) { return type_id::DECIMAL64; } + if (physical == FIXED_LEN_BYTE_ARRAY) { if (schema.type_length <= static_cast(sizeof(int32_t))) { return type_id::DECIMAL32; } @@ -119,7 +119,7 @@ type_id to_type_id(SchemaElement const& schema, return type_id::DECIMAL128; } } - if (physical == parquet::BYTE_ARRAY) { + if (physical == BYTE_ARRAY) { CUDF_EXPECTS(decimal_precision <= MAX_DECIMAL128_PRECISION, "Invalid decimal precision"); if (decimal_precision <= MAX_DECIMAL32_PRECISION) { return type_id::DECIMAL32; @@ -133,20 +133,20 @@ type_id to_type_id(SchemaElement const& schema, break; // maps are just List>. - case parquet::MAP: - case parquet::LIST: return type_id::LIST; - case parquet::NA: return type_id::STRING; + case MAP: + case LIST: return type_id::LIST; + case NA: return type_id::STRING; // return type_id::EMPTY; //TODO(kn): enable after Null/Empty column support default: break; } - if (inferred_converted_type == parquet::UNKNOWN and physical == parquet::INT64 and + if (inferred_converted_type == UNKNOWN and physical == INT64 and logical_type.TIMESTAMP.unit.isset.NANOS) { return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id : type_id::TIMESTAMP_NANOSECONDS; } - if (inferred_converted_type == parquet::UNKNOWN and physical == parquet::INT64 and + if (inferred_converted_type == UNKNOWN and physical == INT64 and logical_type.TIME.unit.isset.NANOS) { return type_id::DURATION_NANOSECONDS; } @@ -157,16 +157,16 @@ type_id to_type_id(SchemaElement const& schema, // Physical storage type supported by Parquet; controls the on-disk storage // format in combination with the encoding type. switch (physical) { - case parquet::BOOLEAN: return type_id::BOOL8; - case parquet::INT32: return type_id::INT32; - case parquet::INT64: return type_id::INT64; - case parquet::FLOAT: return type_id::FLOAT32; - case parquet::DOUBLE: return type_id::FLOAT64; - case parquet::BYTE_ARRAY: - case parquet::FIXED_LEN_BYTE_ARRAY: + case BOOLEAN: return type_id::BOOL8; + case INT32: return type_id::INT32; + case INT64: return type_id::INT64; + case FLOAT: return type_id::FLOAT32; + case DOUBLE: return type_id::FLOAT64; + case BYTE_ARRAY: + case FIXED_LEN_BYTE_ARRAY: // Can be mapped to INT32 (32-bit hash) or STRING return strings_to_categorical ? type_id::INT32 : type_id::STRING; - case parquet::INT96: + case INT96: return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id : type_id::TIMESTAMP_NANOSECONDS; default: break; @@ -344,7 +344,7 @@ std::vector aggregate_reader_metadata::get_pandas_index_names() con return names; } -std::tuple> +std::tuple> aggregate_reader_metadata::select_row_groups( host_span const> row_group_indices, int64_t skip_rows_opt, @@ -362,7 +362,7 @@ aggregate_reader_metadata::select_row_groups( host_span const>(filtered_row_group_indices.value()); } } - std::vector selection; + std::vector selection; auto [rows_to_skip, rows_to_read] = [&]() { if (not row_group_indices.empty()) { return std::pair{}; } auto const from_opts = cudf::io::detail::skip_rows_num_rows_from_options( @@ -402,7 +402,7 @@ aggregate_reader_metadata::select_row_groups( } std::tuple, - std::vector, + std::vector, std::vector> aggregate_reader_metadata::select_columns(std::optional> const& use_names, bool include_index, @@ -420,17 +420,18 @@ aggregate_reader_metadata::select_columns(std::optional : -1; }; - std::vector output_columns; + std::vector output_columns; std::vector input_columns; std::vector nesting; // Return true if column path is valid. e.g. if the path is {"struct1", "child1"}, then it is // valid if "struct1.child1" exists in this file's schema. If "struct1" exists but "child1" is // not a child of "struct1" then the function will return false for "struct1" - std::function&, bool)> + std::function&, bool)> build_column = [&](column_name_info const* col_name_info, int schema_idx, - std::vector& out_col_array, + std::vector& out_col_array, bool has_list_parent) { if (schema_idx < 0) { return false; } auto const& schema_elem = get_schema(schema_idx); @@ -451,7 +452,8 @@ aggregate_reader_metadata::select_columns(std::optional : to_type_id(schema_elem, strings_to_categorical, timestamp_type_id); auto const dtype = to_data_type(col_type, schema_elem); - inline_column_buffer output_col(dtype, schema_elem.repetition_type == OPTIONAL); + cudf::io::detail::inline_column_buffer output_col(dtype, + schema_elem.repetition_type == OPTIONAL); if (has_list_parent) { output_col.user_data |= PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT; } // store the index of this element if inserted in out_col_array nesting.push_back(static_cast(out_col_array.size())); @@ -491,7 +493,8 @@ aggregate_reader_metadata::select_columns(std::optional to_type_id(schema_elem, strings_to_categorical, timestamp_type_id); auto const element_dtype = to_data_type(element_type, schema_elem); - inline_column_buffer element_col(element_dtype, schema_elem.repetition_type == OPTIONAL); + cudf::io::detail::inline_column_buffer element_col( + element_dtype, schema_elem.repetition_type == OPTIONAL); if (has_list_parent || col_type == type_id::LIST) { element_col.user_data |= PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT; } @@ -656,4 +659,4 @@ aggregate_reader_metadata::select_columns(std::optional std::move(input_columns), std::move(output_columns), std::move(output_column_schemas)); } -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp index 61e4f94df0f..2ff18bfbe7e 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.hpp +++ b/cpp/src/io/parquet/reader_impl_helpers.hpp @@ -32,9 +32,7 @@ #include #include -namespace cudf::io::detail::parquet { - -using namespace cudf::io::parquet; +namespace cudf::io::parquet::detail { /** * @brief Function that translates Parquet datatype to cuDF type enum @@ -181,7 +179,7 @@ class aggregate_reader_metadata { * @return A tuple of corrected row_start, row_count and list of row group indexes and its * starting row */ - [[nodiscard]] std::tuple> select_row_groups( + [[nodiscard]] std::tuple> select_row_groups( host_span const> row_group_indices, int64_t row_start, std::optional const& row_count, @@ -201,12 +199,13 @@ class aggregate_reader_metadata { * @return input column information, output column information, list of output column schema * indices */ - [[nodiscard]] std:: - tuple, std::vector, std::vector> - select_columns(std::optional> const& use_names, - bool include_index, - bool strings_to_categorical, - type_id timestamp_type_id) const; + [[nodiscard]] std::tuple, + std::vector, + std::vector> + select_columns(std::optional> const& use_names, + bool include_index, + bool strings_to_categorical, + type_id timestamp_type_id) const; }; /** @@ -275,4 +274,4 @@ class named_to_reference_converter : public ast::detail::expression_transformer std::list _operators; }; -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index c731c467f2c..4bc6bb6f43b 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -43,7 +43,8 @@ #include -namespace cudf::io::detail::parquet { +namespace cudf::io::parquet::detail { + namespace { /** @@ -185,11 +186,11 @@ template */ [[nodiscard]] std::tuple conversion_info(type_id column_type_id, type_id timestamp_type_id, - parquet::Type physical, + Type physical, int8_t converted, int32_t length) { - int32_t type_width = (physical == parquet::FIXED_LEN_BYTE_ARRAY) ? length : 0; + int32_t type_width = (physical == FIXED_LEN_BYTE_ARRAY) ? length : 0; int32_t clock_rate = 0; if (column_type_id == type_id::INT8 or column_type_id == type_id::UINT8) { type_width = 1; // I32 -> I8 @@ -202,9 +203,9 @@ template } int8_t converted_type = converted; - if (converted_type == parquet::DECIMAL && column_type_id != type_id::FLOAT64 && + if (converted_type == DECIMAL && column_type_id != type_id::FLOAT64 && not cudf::is_fixed_point(data_type{column_type_id})) { - converted_type = parquet::UNKNOWN; // Not converting to float64 or decimal + converted_type = UNKNOWN; // Not converting to float64 or decimal } return std::make_tuple(type_width, clock_rate, converted_type); } @@ -226,7 +227,7 @@ template [[nodiscard]] std::future read_column_chunks_async( std::vector> const& sources, std::vector>& page_data, - cudf::detail::hostdevice_vector& chunks, + cudf::detail::hostdevice_vector& chunks, size_t begin_chunk, size_t end_chunk, std::vector const& column_chunk_offsets, @@ -239,11 +240,10 @@ template size_t const io_offset = column_chunk_offsets[chunk]; size_t io_size = chunks[chunk].compressed_size; size_t next_chunk = chunk + 1; - bool const is_compressed = (chunks[chunk].codec != parquet::Compression::UNCOMPRESSED); + bool const is_compressed = (chunks[chunk].codec != Compression::UNCOMPRESSED); while (next_chunk < end_chunk) { - size_t const next_offset = column_chunk_offsets[next_chunk]; - bool const is_next_compressed = - (chunks[next_chunk].codec != parquet::Compression::UNCOMPRESSED); + size_t const next_offset = column_chunk_offsets[next_chunk]; + bool const is_next_compressed = (chunks[next_chunk].codec != Compression::UNCOMPRESSED); if (next_offset != io_offset + io_size || is_next_compressed != is_compressed || chunk_source_map[chunk] != chunk_source_map[next_chunk]) { // Can't merge if not contiguous or mixing compressed and uncompressed @@ -300,13 +300,13 @@ template * * @return The total number of pages */ -[[nodiscard]] size_t count_page_headers( - cudf::detail::hostdevice_vector& chunks, rmm::cuda_stream_view stream) +[[nodiscard]] size_t count_page_headers(cudf::detail::hostdevice_vector& chunks, + rmm::cuda_stream_view stream) { size_t total_pages = 0; chunks.host_to_device_async(stream); - gpu::DecodePageHeaders(chunks.device_ptr(), chunks.size(), stream); + DecodePageHeaders(chunks.device_ptr(), chunks.size(), stream); chunks.device_to_host_sync(stream); for (size_t c = 0; c < chunks.size(); c++) { @@ -337,8 +337,8 @@ constexpr bool is_supported_encoding(Encoding enc) * @param stream CUDA stream used for device memory operations and kernel launches * @returns The size in bytes of level type data required */ -int decode_page_headers(cudf::detail::hostdevice_vector& chunks, - cudf::detail::hostdevice_vector& pages, +int decode_page_headers(cudf::detail::hostdevice_vector& chunks, + cudf::detail::hostdevice_vector& pages, rmm::cuda_stream_view stream) { // IMPORTANT : if you change how pages are stored within a chunk (dist pages, then data pages), @@ -350,14 +350,14 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c } chunks.host_to_device_async(stream); - gpu::DecodePageHeaders(chunks.device_ptr(), chunks.size(), stream); + DecodePageHeaders(chunks.device_ptr(), chunks.size(), stream); // compute max bytes needed for level data auto level_bit_size = cudf::detail::make_counting_transform_iterator(0, [chunks = chunks.begin()] __device__(int i) { auto c = chunks[i]; return static_cast( - max(c.level_bits[gpu::level_type::REPETITION], c.level_bits[gpu::level_type::DEFINITION])); + max(c.level_bits[level_type::REPETITION], c.level_bits[level_type::DEFINITION])); }); // max level data bit size. int const max_level_bits = thrust::reduce(rmm::exec_policy(stream), @@ -388,11 +388,11 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c * @return Device buffer to decompressed page data */ [[nodiscard]] rmm::device_buffer decompress_page_data( - cudf::detail::hostdevice_vector& chunks, - cudf::detail::hostdevice_vector& pages, + cudf::detail::hostdevice_vector& chunks, + cudf::detail::hostdevice_vector& pages, rmm::cuda_stream_view stream) { - auto for_each_codec_page = [&](parquet::Compression codec, std::function const& f) { + auto for_each_codec_page = [&](Compression codec, std::function const& f) { for (size_t c = 0, page_count = 0; c < chunks.size(); c++) { const auto page_stride = chunks[c].max_num_pages; if (chunks[c].codec == codec) { @@ -412,19 +412,16 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c size_t total_decomp_size = 0; struct codec_stats { - parquet::Compression compression_type = UNCOMPRESSED; - size_t num_pages = 0; - int32_t max_decompressed_size = 0; - size_t total_decomp_size = 0; + Compression compression_type = UNCOMPRESSED; + size_t num_pages = 0; + int32_t max_decompressed_size = 0; + size_t total_decomp_size = 0; }; - std::array codecs{codec_stats{parquet::GZIP}, - codec_stats{parquet::SNAPPY}, - codec_stats{parquet::BROTLI}, - codec_stats{parquet::ZSTD}}; + std::array codecs{codec_stats{GZIP}, codec_stats{SNAPPY}, codec_stats{BROTLI}, codec_stats{ZSTD}}; auto is_codec_supported = [&codecs](int8_t codec) { - if (codec == parquet::UNCOMPRESSED) return true; + if (codec == UNCOMPRESSED) return true; return std::find_if(codecs.begin(), codecs.end(), [codec](auto& cstats) { return codec == cstats.compression_type; }) != codecs.end(); @@ -445,7 +442,7 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c codec.num_pages++; num_comp_pages++; }); - if (codec.compression_type == parquet::BROTLI && codec.num_pages > 0) { + if (codec.compression_type == BROTLI && codec.num_pages > 0) { debrotli_scratch.resize(get_gpu_debrotli_scratch_size(codec.num_pages), stream); } } @@ -482,7 +479,7 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c auto& page = pages[page_idx]; // offset will only be non-zero for V2 pages auto const offset = - page.lvl_bytes[gpu::level_type::DEFINITION] + page.lvl_bytes[gpu::level_type::REPETITION]; + page.lvl_bytes[level_type::DEFINITION] + page.lvl_bytes[level_type::REPETITION]; // for V2 need to copy def and rep level info into place, and then offset the // input and output buffers. otherwise we'd have to keep both the compressed // and decompressed data. @@ -509,11 +506,11 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c device_span d_comp_res_view(comp_res.data() + start_pos, codec.num_pages); switch (codec.compression_type) { - case parquet::GZIP: + case GZIP: gpuinflate(d_comp_in, d_comp_out, d_comp_res_view, gzip_header_included::YES, stream); break; - case parquet::SNAPPY: - if (nvcomp_integration::is_stable_enabled()) { + case SNAPPY: + if (cudf::io::detail::nvcomp_integration::is_stable_enabled()) { nvcomp::batched_decompress(nvcomp::compression_type::SNAPPY, d_comp_in, d_comp_out, @@ -525,7 +522,7 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c gpu_unsnap(d_comp_in, d_comp_out, d_comp_res_view, stream); } break; - case parquet::ZSTD: + case ZSTD: nvcomp::batched_decompress(nvcomp::compression_type::ZSTD, d_comp_in, d_comp_out, @@ -534,7 +531,7 @@ int decode_page_headers(cudf::detail::hostdevice_vector& c codec.total_decomp_size, stream); break; - case parquet::BROTLI: + case BROTLI: gpu_debrotli(d_comp_in, d_comp_out, d_comp_res_view, @@ -594,9 +591,9 @@ void reader::impl::allocate_nesting_info() }); page_nesting_info = - cudf::detail::hostdevice_vector{total_page_nesting_infos, _stream}; + cudf::detail::hostdevice_vector{total_page_nesting_infos, _stream}; page_nesting_decode_info = - cudf::detail::hostdevice_vector{total_page_nesting_infos, _stream}; + cudf::detail::hostdevice_vector{total_page_nesting_infos, _stream}; // update pointers in the PageInfos int target_page_index = 0; @@ -653,10 +650,10 @@ void reader::impl::allocate_nesting_info() if (!cur_schema.is_stub()) { // initialize each page within the chunk for (int p_idx = 0; p_idx < chunks[idx].num_data_pages; p_idx++) { - gpu::PageNestingInfo* pni = + PageNestingInfo* pni = &page_nesting_info[nesting_info_index + (p_idx * per_page_nesting_info_size)]; - gpu::PageNestingDecodeInfo* nesting_info = + PageNestingDecodeInfo* nesting_info = &page_nesting_decode_info[nesting_info_index + (p_idx * per_page_nesting_info_size)]; // if we have lists, set our start and end depth remappings @@ -717,9 +714,9 @@ void reader::impl::allocate_level_decode_space() for (size_t idx = 0; idx < pages.size(); idx++) { auto& p = pages[idx]; - p.lvl_decode_buf[gpu::level_type::DEFINITION] = buf; + p.lvl_decode_buf[level_type::DEFINITION] = buf; buf += (LEVEL_DECODE_BUF_SIZE * _pass_itm_data->level_type_size); - p.lvl_decode_buf[gpu::level_type::REPETITION] = buf; + p.lvl_decode_buf[level_type::REPETITION] = buf; buf += (LEVEL_DECODE_BUF_SIZE * _pass_itm_data->level_type_size); } } @@ -824,25 +821,25 @@ void reader::impl::load_global_chunk_info() schema.converted_type, schema.type_length); - chunks.push_back(gpu::ColumnChunkDesc(col_meta.total_compressed_size, - nullptr, - col_meta.num_values, - schema.type, - type_width, - row_group_start, - row_group_rows, - schema.max_definition_level, - schema.max_repetition_level, - _metadata->get_output_nesting_depth(col.schema_idx), - required_bits(schema.max_definition_level), - required_bits(schema.max_repetition_level), - col_meta.codec, - converted_type, - schema.logical_type, - schema.decimal_precision, - clock_rate, - i, - col.schema_idx)); + chunks.push_back(ColumnChunkDesc(col_meta.total_compressed_size, + nullptr, + col_meta.num_values, + schema.type, + type_width, + row_group_start, + row_group_rows, + schema.max_definition_level, + schema.max_repetition_level, + _metadata->get_output_nesting_depth(col.schema_idx), + required_bits(schema.max_definition_level), + required_bits(schema.max_repetition_level), + col_meta.codec, + converted_type, + schema.logical_type, + schema.decimal_precision, + clock_rate, + i, + col.schema_idx)); } remaining_rows -= row_group_rows; @@ -909,7 +906,7 @@ void reader::impl::compute_input_pass_row_group_info() void reader::impl::setup_pass() { // this will also cause the previous pass information to be deleted - _pass_itm_data = std::make_unique(); + _pass_itm_data = std::make_unique(); // setup row groups to be loaded for this pass auto const row_group_start = _input_pass_row_group_offsets[_current_input_pass]; @@ -929,8 +926,7 @@ void reader::impl::setup_pass() auto chunk_start = _file_itm_data.chunks.begin() + (row_group_start * chunks_per_rowgroup); auto chunk_end = _file_itm_data.chunks.begin() + (row_group_end * chunks_per_rowgroup); - _pass_itm_data->chunks = - cudf::detail::hostdevice_vector(num_chunks, _stream); + _pass_itm_data->chunks = cudf::detail::hostdevice_vector(num_chunks, _stream); std::copy(chunk_start, chunk_end, _pass_itm_data->chunks.begin()); // adjust skip_rows and num_rows by what's available in the row groups we are processing @@ -970,7 +966,7 @@ void reader::impl::load_and_decompress_data() // Process dataset chunk pages into output columns auto const total_pages = count_page_headers(chunks, _stream); if (total_pages <= 0) { return; } - pages = cudf::detail::hostdevice_vector(total_pages, total_pages, _stream); + pages = cudf::detail::hostdevice_vector(total_pages, total_pages, _stream); // decoding of column/page information _pass_itm_data->level_type_size = decode_page_headers(chunks, pages, _stream); @@ -978,7 +974,7 @@ void reader::impl::load_and_decompress_data() decomp_page_data = decompress_page_data(chunks, pages, _stream); // Free compressed data for (size_t c = 0; c < chunks.size(); c++) { - if (chunks[c].codec != parquet::Compression::UNCOMPRESSED) { raw_page_data[c].reset(); } + if (chunks[c].codec != Compression::UNCOMPRESSED) { raw_page_data[c].reset(); } } } @@ -1019,14 +1015,13 @@ struct cumulative_row_info { }; #if defined(PREPROCESS_DEBUG) -void print_pages(cudf::detail::hostdevice_vector& pages, - rmm::cuda_stream_view _stream) +void print_pages(cudf::detail::hostdevice_vector& pages, rmm::cuda_stream_view _stream) { pages.device_to_host_sync(_stream); for (size_t idx = 0; idx < pages.size(); idx++) { auto const& p = pages[idx]; // skip dictionary pages - if (p.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) { continue; } + if (p.flags & PAGEINFO_FLAGS_DICTIONARY) { continue; } printf( "P(%lu, s:%d): chunk_row(%d), num_rows(%d), skipped_values(%d), skipped_leaf_values(%d), " "str_bytes(%d)\n", @@ -1040,7 +1035,7 @@ void print_pages(cudf::detail::hostdevice_vector& pages, } } -void print_cumulative_page_info(cudf::detail::hostdevice_vector& pages, +void print_cumulative_page_info(cudf::detail::hostdevice_vector& pages, rmm::device_uvector const& page_index, rmm::device_uvector const& c_info, rmm::cuda_stream_view stream) @@ -1067,7 +1062,7 @@ void print_cumulative_page_info(cudf::detail::hostdevice_vector& printf("Schema %d\n", schemas[idx]); for (size_t pidx = 0; pidx < pages.size(); pidx++) { auto const& page = pages[h_page_index[pidx]]; - if (page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY || page.src_col_schema != schemas[idx]) { + if (page.flags & PAGEINFO_FLAGS_DICTIONARY || page.src_col_schema != schemas[idx]) { continue; } printf("\tP: {%lu, %lu}\n", h_cinfo[pidx].row_count, h_cinfo[pidx].size_bytes); @@ -1075,10 +1070,9 @@ void print_cumulative_page_info(cudf::detail::hostdevice_vector& } } -void print_cumulative_row_info( - host_span sizes, - std::string const& label, - std::optional> splits = std::nullopt) +void print_cumulative_row_info(host_span sizes, + std::string const& label, + std::optional> splits = std::nullopt) { if (splits.has_value()) { printf("------------\nSplits\n"); @@ -1093,7 +1087,7 @@ void print_cumulative_row_info( if (splits.has_value()) { // if we have a split at this row count and this is the last instance of this row count auto start = thrust::make_transform_iterator( - splits->begin(), [](gpu::chunk_read_info const& i) { return i.skip_rows; }); + splits->begin(), [](chunk_read_info const& i) { return i.skip_rows; }); auto end = start + splits->size(); auto split = std::find(start, end, sizes[idx].row_count); auto const split_index = [&]() -> int { @@ -1180,12 +1174,12 @@ __device__ size_t row_size_functor::operator()(size_t num_rows, boo * Sums across all nesting levels. */ struct get_cumulative_row_info { - gpu::PageInfo const* const pages; + PageInfo const* const pages; __device__ cumulative_row_info operator()(size_type index) { auto const& page = pages[index]; - if (page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) { + if (page.flags & PAGEINFO_FLAGS_DICTIONARY) { return cumulative_row_info{0, 0, page.src_col_schema}; } @@ -1250,15 +1244,15 @@ struct row_total_size { * @param num_rows Total number of rows to read * @param chunk_read_limit Limit on total number of bytes to be returned per read, for all columns */ -std::vector find_splits(std::vector const& sizes, - size_t num_rows, - size_t chunk_read_limit) +std::vector find_splits(std::vector const& sizes, + size_t num_rows, + size_t chunk_read_limit) { // now we have an array of {row_count, real output bytes}. just walk through it and generate // splits. // TODO: come up with a clever way to do this entirely in parallel. For now, as long as batch // sizes are reasonably large, this shouldn't iterate too many times - std::vector splits; + std::vector splits; { size_t cur_pos = 0; size_t cur_cumulative_size = 0; @@ -1290,7 +1284,7 @@ std::vector find_splits(std::vector c auto const start_row = cur_row_count; cur_row_count = sizes[split_pos].row_count; - splits.push_back(gpu::chunk_read_info{start_row, cur_row_count - start_row}); + splits.push_back(chunk_read_info{start_row, cur_row_count - start_row}); cur_pos = split_pos; cur_cumulative_size = sizes[split_pos].size_bytes; } @@ -1311,12 +1305,11 @@ std::vector find_splits(std::vector c * @param chunk_read_limit Limit on total number of bytes to be returned per read, for all columns * @param stream CUDA stream to use */ -std::vector compute_splits( - cudf::detail::hostdevice_vector& pages, - gpu::pass_intermediate_data const& id, - size_t num_rows, - size_t chunk_read_limit, - rmm::cuda_stream_view stream) +std::vector compute_splits(cudf::detail::hostdevice_vector& pages, + pass_intermediate_data const& id, + size_t num_rows, + size_t chunk_read_limit, + rmm::cuda_stream_view stream) { auto const& page_keys = id.page_keys; auto const& page_index = id.page_index; @@ -1395,16 +1388,16 @@ std::vector compute_splits( } struct get_page_chunk_idx { - __device__ size_type operator()(gpu::PageInfo const& page) { return page.chunk_idx; } + __device__ size_type operator()(PageInfo const& page) { return page.chunk_idx; } }; struct get_page_num_rows { - __device__ size_type operator()(gpu::PageInfo const& page) { return page.num_rows; } + __device__ size_type operator()(PageInfo const& page) { return page.num_rows; } }; struct get_page_column_index { - gpu::ColumnChunkDesc const* chunks; - __device__ size_type operator()(gpu::PageInfo const& page) + ColumnChunkDesc const* chunks; + __device__ size_type operator()(PageInfo const& page) { return chunks[page.chunk_idx].src_col_index; } @@ -1441,7 +1434,7 @@ struct get_page_nesting_size { input_col_info const* const input_cols; size_type const max_depth; size_t const num_pages; - gpu::PageInfo const* const pages; + PageInfo const* const pages; int const* page_indices; __device__ size_type operator()(size_t index) const @@ -1450,7 +1443,7 @@ struct get_page_nesting_size { auto const& page = pages[page_indices[indices.page_idx]]; if (page.src_col_schema != input_cols[indices.col_idx].schema_idx || - page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY || + page.flags & PAGEINFO_FLAGS_DICTIONARY || indices.depth_idx >= input_cols[indices.col_idx].nesting_depth) { return 0; } @@ -1468,7 +1461,7 @@ struct get_reduction_key { * @brief Writes to the chunk_row field of the PageInfo struct. */ struct chunk_row_output_iter { - gpu::PageInfo* p; + PageInfo* p; using value_type = size_type; using difference_type = size_type; using pointer = size_type*; @@ -1490,7 +1483,7 @@ struct chunk_row_output_iter { * @brief Writes to the page_start_value field of the PageNestingInfo struct, keyed by schema. */ struct start_offset_output_iterator { - gpu::PageInfo const* pages; + PageInfo const* pages; int const* page_indices; size_t cur_index; input_col_info const* input_cols; @@ -1529,9 +1522,9 @@ struct start_offset_output_iterator { { auto const indices = reduction_indices{index, max_depth, num_pages}; - gpu::PageInfo const& p = pages[page_indices[indices.page_idx]]; + PageInfo const& p = pages[page_indices[indices.page_idx]]; if (p.src_col_schema != input_cols[indices.col_idx].schema_idx || - p.flags & gpu::PAGEINFO_FLAGS_DICTIONARY || + p.flags & PAGEINFO_FLAGS_DICTIONARY || indices.depth_idx >= input_cols[indices.col_idx].nesting_depth) { return empty; } @@ -1540,15 +1533,15 @@ struct start_offset_output_iterator { }; struct flat_column_num_rows { - gpu::PageInfo const* pages; - gpu::ColumnChunkDesc const* chunks; + PageInfo const* pages; + ColumnChunkDesc const* chunks; __device__ size_type operator()(size_type pindex) const { - gpu::PageInfo const& page = pages[pindex]; + PageInfo const& page = pages[pindex]; // ignore dictionary pages and pages belonging to any column containing repetition (lists) - if ((page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) || - (chunks[page.chunk_idx].max_level[gpu::level_type::REPETITION] > 0)) { + if ((page.flags & PAGEINFO_FLAGS_DICTIONARY) || + (chunks[page.chunk_idx].max_level[level_type::REPETITION] > 0)) { return 0; } return page.num_rows; @@ -1581,8 +1574,8 @@ struct row_counts_different { * @param expected_row_count Expected row count, if applicable * @param stream CUDA stream used for device memory operations and kernel launches */ -void detect_malformed_pages(cudf::detail::hostdevice_vector& pages, - cudf::detail::hostdevice_vector const& chunks, +void detect_malformed_pages(cudf::detail::hostdevice_vector& pages, + cudf::detail::hostdevice_vector const& chunks, device_span page_keys, device_span page_index, std::optional expected_row_count, @@ -1631,23 +1624,21 @@ void detect_malformed_pages(cudf::detail::hostdevice_vector& page } struct page_to_string_size { - gpu::PageInfo* pages; - gpu::ColumnChunkDesc const* chunks; + PageInfo* pages; + ColumnChunkDesc const* chunks; __device__ size_t operator()(size_type page_idx) const { auto const page = pages[page_idx]; auto const chunk = chunks[page.chunk_idx]; - if (not is_string_col(chunk) || (page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) != 0) { - return 0; - } + if (not is_string_col(chunk) || (page.flags & PAGEINFO_FLAGS_DICTIONARY) != 0) { return 0; } return pages[page_idx].str_bytes; } }; struct page_offset_output_iter { - gpu::PageInfo* p; + PageInfo* p; size_type const* index; using value_type = size_type; @@ -1738,7 +1729,7 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re cols = &out_buf.children; // if this has a list parent, we have to get column sizes from the - // data computed during gpu::ComputePageSizes + // data computed during ComputePageSizes if (out_buf.user_data & PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT) { has_lists = true; break; @@ -1749,7 +1740,7 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re // generate string dict indices if necessary { - auto is_dict_chunk = [](gpu::ColumnChunkDesc const& chunk) { + auto is_dict_chunk = [](ColumnChunkDesc const& chunk) { return (chunk.data_type & 0x7) == BYTE_ARRAY && chunk.num_dict_pages > 0; }; @@ -1785,7 +1776,7 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re if (total_str_dict_indexes > 0) { chunks.host_to_device_async(_stream); - gpu::BuildStringDictionaryIndex(chunks.device_ptr(), chunks.size(), _stream); + BuildStringDictionaryIndex(chunks.device_ptr(), chunks.size(), _stream); } } @@ -1800,14 +1791,14 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re // if: // - user has passed custom row bounds // - we will be doing a chunked read - gpu::ComputePageSizes(pages, - chunks, - 0, // 0-max size_t. process all possible rows - std::numeric_limits::max(), - true, // compute num_rows - chunk_read_limit > 0, // compute string sizes - _pass_itm_data->level_type_size, - _stream); + ComputePageSizes(pages, + chunks, + 0, // 0-max size_t. process all possible rows + std::numeric_limits::max(), + true, // compute num_rows + chunk_read_limit > 0, // compute string sizes + _pass_itm_data->level_type_size, + _stream); // computes: // PageInfo::chunk_row (the absolute start row index) for all pages @@ -1836,7 +1827,7 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re _pass_itm_data->output_chunk_read_info = _output_chunk_read_limit > 0 ? compute_splits(pages, *_pass_itm_data, num_rows, chunk_read_limit, _stream) - : std::vector{{skip_rows, num_rows}}; + : std::vector{{skip_rows, num_rows}}; } void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses_custom_row_bounds) @@ -1853,14 +1844,14 @@ void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses // respect the user bounds. It is only necessary to do this second pass if uses_custom_row_bounds // is set (if the user has specified artificial bounds). if (uses_custom_row_bounds) { - gpu::ComputePageSizes(pages, - chunks, - skip_rows, - num_rows, - false, // num_rows is already computed - false, // no need to compute string sizes - _pass_itm_data->level_type_size, - _stream); + ComputePageSizes(pages, + chunks, + skip_rows, + num_rows, + false, // num_rows is already computed + false, // no need to compute string sizes + _pass_itm_data->level_type_size, + _stream); // print_pages(pages, _stream); } @@ -1879,7 +1870,7 @@ void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses cols = &out_buf.children; // if this has a list parent, we have to get column sizes from the - // data computed during gpu::ComputePageSizes + // data computed during ComputePageSizes if (out_buf.user_data & PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT) { has_lists = true; } @@ -2014,4 +2005,4 @@ std::vector reader::impl::calculate_page_string_offsets() return col_sizes; } -} // namespace cudf::io::detail::parquet +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/rle_stream.cuh b/cpp/src/io/parquet/rle_stream.cuh index 2545a074a38..799d6d9fd64 100644 --- a/cpp/src/io/parquet/rle_stream.cuh +++ b/cpp/src/io/parquet/rle_stream.cuh @@ -20,7 +20,7 @@ #include #include -namespace cudf::io::parquet::gpu { +namespace cudf::io::parquet::detail { template constexpr int rle_stream_required_run_buffer_size() @@ -362,4 +362,4 @@ struct rle_stream { } }; -} // namespace cudf::io::parquet::gpu +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index a124f352ee4..a021aa89714 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -56,10 +56,10 @@ namespace cudf { namespace io { -namespace detail { namespace parquet { -using namespace cudf::io::parquet; -using namespace cudf::io; +namespace detail { + +using namespace cudf::io::detail; struct aggregate_writer_metadata { aggregate_writer_metadata(host_span partitions, @@ -185,13 +185,13 @@ namespace { * @param compression The compression type * @return The supported Parquet compression */ -parquet::Compression to_parquet_compression(compression_type compression) +Compression to_parquet_compression(compression_type compression) { switch (compression) { case compression_type::AUTO: - case compression_type::SNAPPY: return parquet::Compression::SNAPPY; - case compression_type::ZSTD: return parquet::Compression::ZSTD; - case compression_type::NONE: return parquet::Compression::UNCOMPRESSED; + case compression_type::SNAPPY: return Compression::SNAPPY; + case compression_type::ZSTD: return Compression::ZSTD; + case compression_type::NONE: return Compression::UNCOMPRESSED; default: CUDF_FAIL("Unsupported compression type"); } } @@ -206,7 +206,7 @@ void update_chunk_encodings(std::vector& encodings, uint32_t enc_mask) { for (uint8_t enc = 0; enc < static_cast(Encoding::NUM_ENCODINGS); enc++) { auto const enc_enum = static_cast(enc); - if ((enc_mask & gpu::encoding_to_mask(enc_enum)) != 0) { encodings.push_back(enc_enum); } + if ((enc_mask & encoding_to_mask(enc_enum)) != 0) { encodings.push_back(enc_enum); } } } @@ -761,11 +761,11 @@ struct parquet_column_view { std::vector const& schema_tree, rmm::cuda_stream_view stream); - [[nodiscard]] gpu::parquet_column_device_view get_device_view(rmm::cuda_stream_view stream) const; + [[nodiscard]] parquet_column_device_view get_device_view(rmm::cuda_stream_view stream) const; [[nodiscard]] column_view cudf_column_view() const { return cudf_col; } - [[nodiscard]] parquet::Type physical_type() const { return schema_node.type; } - [[nodiscard]] parquet::ConvertedType converted_type() const { return schema_node.converted_type; } + [[nodiscard]] Type physical_type() const { return schema_node.type; } + [[nodiscard]] ConvertedType converted_type() const { return schema_node.converted_type; } std::vector const& get_path_in_schema() { return path_in_schema; } @@ -846,11 +846,11 @@ parquet_column_view::parquet_column_view(schema_tree_node const& schema_node, uint16_t max_rep_level = 0; curr_schema_node = schema_node; while (curr_schema_node.parent_idx != -1) { - if (curr_schema_node.repetition_type == parquet::REPEATED or - curr_schema_node.repetition_type == parquet::OPTIONAL) { + if (curr_schema_node.repetition_type == REPEATED or + curr_schema_node.repetition_type == OPTIONAL) { ++max_def_level; } - if (curr_schema_node.repetition_type == parquet::REPEATED) { ++max_rep_level; } + if (curr_schema_node.repetition_type == REPEATED) { ++max_rep_level; } curr_schema_node = schema_tree[curr_schema_node.parent_idx]; } CUDF_EXPECTS(max_def_level < 256, "Definition levels above 255 are not supported"); @@ -897,9 +897,9 @@ parquet_column_view::parquet_column_view(schema_tree_node const& schema_node, } } -gpu::parquet_column_device_view parquet_column_view::get_device_view(rmm::cuda_stream_view) const +parquet_column_device_view parquet_column_view::get_device_view(rmm::cuda_stream_view) const { - auto desc = gpu::parquet_column_device_view{}; // Zero out all fields + auto desc = parquet_column_device_view{}; // Zero out all fields desc.stats_dtype = schema_node.stats_dtype; desc.ts_scale = schema_node.ts_scale; @@ -931,8 +931,8 @@ gpu::parquet_column_device_view parquet_column_view::get_device_view(rmm::cuda_s * @param fragment_size Number of rows per fragment * @param stream CUDA stream used for device memory operations and kernel launches */ -void init_row_group_fragments(cudf::detail::hostdevice_2dvector& frag, - device_span col_desc, +void init_row_group_fragments(cudf::detail::hostdevice_2dvector& frag, + device_span col_desc, host_span partitions, device_span part_frag_offset, uint32_t fragment_size, @@ -940,7 +940,7 @@ void init_row_group_fragments(cudf::detail::hostdevice_2dvector frag, +void calculate_page_fragments(device_span frag, host_span frag_sizes, rmm::cuda_stream_view stream) { auto d_frag_sz = cudf::detail::make_device_uvector_async( frag_sizes, stream, rmm::mr::get_current_device_resource()); - gpu::CalculatePageFragments(frag, d_frag_sz, stream); + CalculatePageFragments(frag, d_frag_sz, stream); } /** @@ -972,13 +972,13 @@ void calculate_page_fragments(device_span frag, * @param stream CUDA stream used for device memory operations and kernel launches */ void gather_fragment_statistics(device_span frag_stats, - device_span frags, + device_span frags, bool int96_timestamps, rmm::cuda_stream_view stream) { rmm::device_uvector frag_stats_group(frag_stats.size(), stream); - gpu::InitFragmentStatistics(frag_stats_group, frags, stream); + InitFragmentStatistics(frag_stats_group, frags, stream); detail::calculate_group_statistics( frag_stats.data(), frag_stats_group.data(), frag_stats.size(), stream, int96_timestamps); stream.synchronize(); @@ -1008,8 +1008,8 @@ size_t max_compression_output_size(Compression codec, uint32_t compression_block return compress_max_output_chunk_size(to_nvcomp_compression_type(codec), compression_blocksize); } -auto init_page_sizes(hostdevice_2dvector& chunks, - device_span col_desc, +auto init_page_sizes(hostdevice_2dvector& chunks, + device_span col_desc, uint32_t num_columns, size_t max_page_size_bytes, size_type max_page_size_rows, @@ -1021,19 +1021,19 @@ auto init_page_sizes(hostdevice_2dvector& chunks, chunks.host_to_device_async(stream); // Calculate number of pages and store in respective chunks - gpu::InitEncoderPages(chunks, - {}, - {}, - {}, - col_desc, - num_columns, - max_page_size_bytes, - max_page_size_rows, - page_alignment(compression_codec), - write_v2_headers, - nullptr, - nullptr, - stream); + InitEncoderPages(chunks, + {}, + {}, + {}, + col_desc, + num_columns, + max_page_size_bytes, + max_page_size_rows, + page_alignment(compression_codec), + write_v2_headers, + nullptr, + nullptr, + stream); chunks.device_to_host_sync(stream); int num_pages = 0; @@ -1046,19 +1046,19 @@ auto init_page_sizes(hostdevice_2dvector& chunks, // Now that we know the number of pages, allocate an array to hold per page size and get it // populated cudf::detail::hostdevice_vector page_sizes(num_pages, stream); - gpu::InitEncoderPages(chunks, - {}, - page_sizes, - {}, - col_desc, - num_columns, - max_page_size_bytes, - max_page_size_rows, - page_alignment(compression_codec), - write_v2_headers, - nullptr, - nullptr, - stream); + InitEncoderPages(chunks, + {}, + page_sizes, + {}, + col_desc, + num_columns, + max_page_size_bytes, + max_page_size_rows, + page_alignment(compression_codec), + write_v2_headers, + nullptr, + nullptr, + stream); page_sizes.device_to_host_sync(stream); // Get per-page max compressed size @@ -1072,26 +1072,26 @@ auto init_page_sizes(hostdevice_2dvector& chunks, comp_page_sizes.host_to_device_async(stream); // Use per-page max compressed size to calculate chunk.compressed_size - gpu::InitEncoderPages(chunks, - {}, - {}, - comp_page_sizes, - col_desc, - num_columns, - max_page_size_bytes, - max_page_size_rows, - page_alignment(compression_codec), - write_v2_headers, - nullptr, - nullptr, - stream); + InitEncoderPages(chunks, + {}, + {}, + comp_page_sizes, + col_desc, + num_columns, + max_page_size_bytes, + max_page_size_rows, + page_alignment(compression_codec), + write_v2_headers, + nullptr, + nullptr, + stream); chunks.device_to_host_sync(stream); return comp_page_sizes; } size_t max_page_bytes(Compression compression, size_t max_page_size_bytes) { - if (compression == parquet::Compression::UNCOMPRESSED) { return max_page_size_bytes; } + if (compression == Compression::UNCOMPRESSED) { return max_page_size_bytes; } auto const ncomp_type = to_nvcomp_compression_type(compression); auto const nvcomp_limit = nvcomp::is_compression_disabled(ncomp_type) @@ -1104,9 +1104,9 @@ size_t max_page_bytes(Compression compression, size_t max_page_size_bytes) } std::pair>, std::vector>> -build_chunk_dictionaries(hostdevice_2dvector& chunks, - host_span col_desc, - device_2dspan frags, +build_chunk_dictionaries(hostdevice_2dvector& chunks, + host_span col_desc, + device_2dspan frags, Compression compression, dictionary_policy dict_policy, size_t max_dict_size, @@ -1130,7 +1130,7 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, } // Allocate slots for each chunk - std::vector> hash_maps_storage; + std::vector> hash_maps_storage; hash_maps_storage.reserve(h_chunks.size()); for (auto& chunk : h_chunks) { if (col_desc[chunk.col_desc_id].physical_type == Type::BOOLEAN || @@ -1149,8 +1149,8 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, chunks.host_to_device_async(stream); - gpu::initialize_chunk_hash_maps(chunks.device_view().flat_view(), stream); - gpu::populate_chunk_hash_maps(frags, stream); + initialize_chunk_hash_maps(chunks.device_view().flat_view(), stream); + populate_chunk_hash_maps(frags, stream); chunks.device_to_host_sync(stream); @@ -1197,8 +1197,8 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, chunk.dict_index = inserted_dict_index.data(); } chunks.host_to_device_async(stream); - gpu::collect_map_entries(chunks.device_view().flat_view(), stream); - gpu::get_dictionary_indices(frags, stream); + collect_map_entries(chunks.device_view().flat_view(), stream); + get_dictionary_indices(frags, stream); return std::pair(std::move(dict_data), std::move(dict_index)); } @@ -1221,9 +1221,9 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, * @param write_v2_headers True if version 2 page headers are to be written * @param stream CUDA stream used for device memory operations and kernel launches */ -void init_encoder_pages(hostdevice_2dvector& chunks, - device_span col_desc, - device_span pages, +void init_encoder_pages(hostdevice_2dvector& chunks, + device_span col_desc, + device_span pages, cudf::detail::hostdevice_vector& comp_page_sizes, statistics_chunk* page_stats, statistics_chunk* frag_stats, @@ -1286,8 +1286,8 @@ void init_encoder_pages(hostdevice_2dvector& chunks, * @param write_v2_headers True if V2 page headers should be written * @param stream CUDA stream used for device memory operations and kernel launches */ -void encode_pages(hostdevice_2dvector& chunks, - device_span pages, +void encode_pages(hostdevice_2dvector& chunks, + device_span pages, uint32_t pages_in_batch, uint32_t first_page_in_batch, uint32_t rowgroups_in_batch, @@ -1308,8 +1308,7 @@ void encode_pages(hostdevice_2dvector& chunks, ? device_span(page_stats + first_page_in_batch, pages_in_batch) : device_span(); - uint32_t max_comp_pages = - (compression != parquet::Compression::UNCOMPRESSED) ? pages_in_batch : 0; + uint32_t max_comp_pages = (compression != Compression::UNCOMPRESSED) ? pages_in_batch : 0; rmm::device_uvector> comp_in(max_comp_pages, stream); rmm::device_uvector> comp_out(max_comp_pages, stream); @@ -1319,9 +1318,9 @@ void encode_pages(hostdevice_2dvector& chunks, comp_res.end(), compression_result{0, compression_status::FAILURE}); - gpu::EncodePages(batch_pages, write_v2_headers, comp_in, comp_out, comp_res, stream); + EncodePages(batch_pages, write_v2_headers, comp_in, comp_out, comp_res, stream); switch (compression) { - case parquet::Compression::SNAPPY: + case Compression::SNAPPY: if (nvcomp::is_compression_disabled(nvcomp::compression_type::SNAPPY)) { gpu_snap(comp_in, comp_out, comp_res, stream); } else { @@ -1329,7 +1328,7 @@ void encode_pages(hostdevice_2dvector& chunks, nvcomp::compression_type::SNAPPY, comp_in, comp_out, comp_res, stream); } break; - case parquet::Compression::ZSTD: { + case Compression::ZSTD: { if (auto const reason = nvcomp::is_compression_disabled(nvcomp::compression_type::ZSTD); reason) { CUDF_FAIL("Compression error: " + reason.value()); @@ -1338,7 +1337,7 @@ void encode_pages(hostdevice_2dvector& chunks, break; } - case parquet::Compression::UNCOMPRESSED: break; + case Compression::UNCOMPRESSED: break; default: CUDF_FAIL("invalid compression type"); } @@ -1378,7 +1377,7 @@ void encode_pages(hostdevice_2dvector& chunks, * @param column_index_truncate_length maximum length of min or max values in column index, in bytes * @return Computed buffer size needed to encode the column index */ -size_t column_index_buffer_size(gpu::EncColumnChunk* ck, int32_t column_index_truncate_length) +size_t column_index_buffer_size(EncColumnChunk* ck, int32_t column_index_truncate_length) { // encoding the column index for a given chunk requires: // each list (4 of them) requires 6 bytes of overhead @@ -1499,8 +1498,8 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, std::vector this_table_schema(schema_tree.begin(), schema_tree.end()); // Initialize column description - cudf::detail::hostdevice_vector col_desc(parquet_columns.size(), - stream); + cudf::detail::hostdevice_vector col_desc(parquet_columns.size(), + stream); std::transform( parquet_columns.begin(), parquet_columns.end(), col_desc.host_ptr(), [&](auto const& pcol) { return pcol.get_device_view(stream); @@ -1576,7 +1575,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, auto d_part_frag_offset = cudf::detail::make_device_uvector_async( part_frag_offset, stream, rmm::mr::get_current_device_resource()); - cudf::detail::hostdevice_2dvector row_group_fragments( + cudf::detail::hostdevice_2dvector row_group_fragments( num_columns, num_fragments, stream); // Create table_device_view so that corresponding column_device_view data @@ -1588,7 +1587,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, if (num_fragments != 0) { // Move column info to device col_desc.host_to_device_async(stream); - leaf_column_views = create_leaf_column_device_views( + leaf_column_views = create_leaf_column_device_views( col_desc, *parent_column_table_device_view, stream); init_row_group_fragments(row_group_fragments, @@ -1662,7 +1661,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, // Initialize row groups and column chunks auto const num_chunks = num_rowgroups * num_columns; - hostdevice_2dvector chunks(num_rowgroups, num_columns, stream); + hostdevice_2dvector chunks(num_rowgroups, num_columns, stream); // total fragments per column (in case they are non-uniform) std::vector frags_per_column(num_columns, 0); @@ -1678,7 +1677,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, row_group.total_byte_size = 0; row_group.columns.resize(num_columns); for (int c = 0; c < num_columns; c++) { - gpu::EncColumnChunk& ck = chunks[r + first_rg_in_part[p]][c]; + EncColumnChunk& ck = chunks[r + first_rg_in_part[p]][c]; ck = {}; ck.col_desc = col_desc.device_ptr() + c; @@ -1700,7 +1699,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, return l + r.num_values; }); ck.plain_data_size = std::accumulate( - chunk_fragments.begin(), chunk_fragments.end(), 0, [](int sum, gpu::PageFragment frag) { + chunk_fragments.begin(), chunk_fragments.end(), 0, [](int sum, PageFragment frag) { return sum + frag.fragment_data_size; }); auto& column_chunk_meta = row_group.columns[c].meta_data; @@ -1731,7 +1730,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, frags_per_column.empty() ? 0 : frag_offsets.back() + frags_per_column.back(); rmm::device_uvector frag_stats(0, stream); - cudf::detail::hostdevice_vector page_fragments(total_frags, stream); + cudf::detail::hostdevice_vector page_fragments(total_frags, stream); // update fragments and/or prepare for fragment statistics calculation if necessary if (total_frags != 0) { @@ -1749,9 +1748,9 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, auto const& row_group = agg_meta->file(p).row_groups[global_r]; uint32_t const fragments_in_chunk = util::div_rounding_up_unsafe(row_group.num_rows, frag_size); - gpu::EncColumnChunk& ck = chunks[r + first_rg_in_part[p]][c]; - ck.fragments = page_fragments.device_ptr(frag_offset); - ck.first_fragment = frag_offset; + EncColumnChunk& ck = chunks[r + first_rg_in_part[p]][c]; + ck.fragments = page_fragments.device_ptr(frag_offset); + ck.first_fragment = frag_offset; // update the chunk pointer here for each fragment in chunk.fragments for (uint32_t i = 0; i < fragments_in_chunk; i++) { @@ -1817,8 +1816,8 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, size_t comp_rowgroup_size = 0; if (r < num_rowgroups) { for (int i = 0; i < num_columns; i++) { - gpu::EncColumnChunk* ck = &chunks[r][i]; - ck->first_page = num_pages; + EncColumnChunk* ck = &chunks[r][i]; + ck->first_page = num_pages; num_pages += ck->num_pages; pages_in_batch += ck->num_pages; rowgroup_size += ck->bfr_size; @@ -1850,7 +1849,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, } // Clear compressed buffer size if compression has been turned off - if (compression == parquet::Compression::UNCOMPRESSED) { max_comp_bfr_size = 0; } + if (compression == Compression::UNCOMPRESSED) { max_comp_bfr_size = 0; } // Initialize data pointers in batch uint32_t const num_stats_bfr = @@ -1864,7 +1863,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, stream); rmm::device_buffer col_idx_bfr(column_index_bfr_size, stream); - rmm::device_uvector pages(num_pages, stream); + rmm::device_uvector pages(num_pages, stream); // This contains stats for both the pages and the rowgroups. TODO: make them separate. rmm::device_uvector page_stats(num_stats_bfr, stream); @@ -1874,10 +1873,10 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, auto bfr_c = static_cast(comp_bfr.data()); for (auto j = 0; j < batch_list[b]; j++, r++) { for (auto i = 0; i < num_columns; i++) { - gpu::EncColumnChunk& ck = chunks[r][i]; - ck.uncompressed_bfr = bfr; - ck.compressed_bfr = bfr_c; - ck.column_index_blob = bfr_i; + EncColumnChunk& ck = chunks[r][i]; + ck.uncompressed_bfr = bfr; + ck.compressed_bfr = bfr_c; + ck.column_index_blob = bfr_i; bfr += ck.bfr_size; bfr_c += ck.compressed_size; if (stats_granularity == statistics_freq::STATISTICS_COLUMN) { @@ -1960,7 +1959,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, if (ck.ck_stat_size != 0) { std::vector const stats_blob = cudf::detail::make_std_vector_sync( device_span(dev_bfr, ck.ck_stat_size), stream); - cudf::io::parquet::CompactProtocolReader cp(stats_blob.data(), stats_blob.size()); + CompactProtocolReader cp(stats_blob.data(), stats_blob.size()); cp.read(&column_chunk_meta.statistics); need_sync = true; } @@ -2142,8 +2141,8 @@ void writer::impl::write(table_view const& input, std::vector co void writer::impl::write_parquet_data_to_sink( std::unique_ptr& updated_agg_meta, - device_span pages, - host_2dspan chunks, + device_span pages, + host_2dspan chunks, host_span global_rowgroup_base, host_span first_rg_in_part, host_span batch_list, @@ -2209,7 +2208,7 @@ void writer::impl::write_parquet_data_to_sink( int const global_r = global_rowgroup_base[p] + r - first_rg_in_part[p]; auto const& row_group = _agg_meta->file(p).row_groups[global_r]; for (std::size_t i = 0; i < num_columns; i++) { - gpu::EncColumnChunk const& ck = chunks[r][i]; + EncColumnChunk const& ck = chunks[r][i]; auto const& column_chunk_meta = row_group.columns[i].meta_data; // start transfer of the column index @@ -2392,7 +2391,7 @@ std::unique_ptr> writer::merge_row_group_metadata( return std::make_unique>(std::move(output)); } -} // namespace parquet } // namespace detail +} // namespace parquet } // namespace io } // namespace cudf diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp index 89ef85ba2bd..e0f38ed362c 100644 --- a/cpp/src/io/parquet/writer_impl.hpp +++ b/cpp/src/io/parquet/writer_impl.hpp @@ -40,13 +40,12 @@ namespace cudf { namespace io { -namespace detail { namespace parquet { +namespace detail { + // Forward internal classes struct aggregate_writer_metadata; -using namespace cudf::io::parquet; -using namespace cudf::io; using cudf::detail::device_2dspan; using cudf::detail::host_2dspan; using cudf::detail::hostdevice_2dvector; @@ -66,7 +65,7 @@ class writer::impl { */ explicit impl(std::vector> sinks, parquet_writer_options const& options, - single_write_mode mode, + cudf::io::detail::single_write_mode mode, rmm::cuda_stream_view stream); /** @@ -79,7 +78,7 @@ class writer::impl { */ explicit impl(std::vector> sinks, chunked_parquet_writer_options const& options, - single_write_mode mode, + cudf::io::detail::single_write_mode mode, rmm::cuda_stream_view stream); /** @@ -139,8 +138,8 @@ class writer::impl { * @param[out] bounce_buffer Temporary host output buffer */ void write_parquet_data_to_sink(std::unique_ptr& updated_agg_meta, - device_span pages, - host_2dspan chunks, + device_span pages, + host_2dspan chunks, host_span global_rowgroup_base, host_span first_rg_in_part, host_span batch_list, @@ -164,9 +163,10 @@ class writer::impl { bool const _write_v2_headers; int32_t const _column_index_truncate_length; std::vector> const _kv_meta; // Optional user metadata. - single_write_mode const _single_write_mode; // Special parameter only used by `write()` to - // indicate that we are guaranteeing a single table - // write. This enables some internal optimizations. + cudf::io::detail::single_write_mode const + _single_write_mode; // Special parameter only used by `write()` to + // indicate that we are guaranteeing a single table + // write. This enables some internal optimizations. std::vector> const _out_sink; // Internal states, filled during `write()` and written to sink during `write` and `close()`. @@ -180,7 +180,7 @@ class writer::impl { bool _closed = false; // To track if the output has been written to sink. }; -} // namespace parquet } // namespace detail +} // namespace parquet } // namespace io } // namespace cudf diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp index f3a43cbc63c..dd049d401cf 100644 --- a/cpp/src/io/utilities/column_buffer.cpp +++ b/cpp/src/io/utilities/column_buffer.cpp @@ -51,19 +51,21 @@ std::unique_ptr gather_column_buffer::make_string_column_impl(rmm::cuda_ return make_strings_column(*_strings, stream, _mr); } -void inline_column_buffer::allocate_strings_data(rmm::cuda_stream_view stream) +void cudf::io::detail::inline_column_buffer::allocate_strings_data(rmm::cuda_stream_view stream) { CUDF_EXPECTS(type.id() == type_id::STRING, "allocate_strings_data called for non-string column"); // size + 1 for final offset. _string_data will be initialized later. _data = create_data(data_type{type_id::INT32}, size + 1, stream, _mr); } -void inline_column_buffer::create_string_data(size_t num_bytes, rmm::cuda_stream_view stream) +void cudf::io::detail::inline_column_buffer::create_string_data(size_t num_bytes, + rmm::cuda_stream_view stream) { _string_data = rmm::device_buffer(num_bytes, stream, _mr); } -std::unique_ptr inline_column_buffer::make_string_column_impl(rmm::cuda_stream_view stream) +std::unique_ptr cudf::io::detail::inline_column_buffer::make_string_column_impl( + rmm::cuda_stream_view stream) { // no need for copies, just transfer ownership of the data_buffers to the columns auto const state = mask_state::UNALLOCATED; @@ -324,7 +326,7 @@ std::unique_ptr empty_like(column_buffer_base& buffer, } using pointer_type = gather_column_buffer; -using string_type = inline_column_buffer; +using string_type = cudf::io::detail::inline_column_buffer; using pointer_column_buffer = column_buffer_base; using string_column_buffer = column_buffer_base; diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 81e0e12eeb9..217bb891a2b 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -200,29 +200,30 @@ std::unique_ptr make_parquet_list_list_col( // of the file to populate the FileMetaData pointed to by file_meta_data. // throws cudf::logic_error if the file or metadata is invalid. void read_footer(std::unique_ptr const& source, - cudf::io::parquet::FileMetaData* file_meta_data) + cudf::io::parquet::detail::FileMetaData* file_meta_data) { - constexpr auto header_len = sizeof(cudf::io::parquet::file_header_s); - constexpr auto ender_len = sizeof(cudf::io::parquet::file_ender_s); + constexpr auto header_len = sizeof(cudf::io::parquet::detail::file_header_s); + constexpr auto ender_len = sizeof(cudf::io::parquet::detail::file_ender_s); auto const len = source->size(); auto const header_buffer = source->host_read(0, header_len); auto const header = - reinterpret_cast(header_buffer->data()); + reinterpret_cast(header_buffer->data()); auto const ender_buffer = source->host_read(len - ender_len, ender_len); - auto const ender = reinterpret_cast(ender_buffer->data()); + auto const ender = + reinterpret_cast(ender_buffer->data()); // checks for valid header, footer, and file length ASSERT_GT(len, header_len + ender_len); - ASSERT_TRUE(header->magic == cudf::io::parquet::parquet_magic && - ender->magic == cudf::io::parquet::parquet_magic); + ASSERT_TRUE(header->magic == cudf::io::parquet::detail::parquet_magic && + ender->magic == cudf::io::parquet::detail::parquet_magic); ASSERT_TRUE(ender->footer_len != 0 && ender->footer_len <= (len - header_len - ender_len)); // parquet files end with 4-byte footer_length and 4-byte magic == "PAR1" // seek backwards from the end of the file (footer_length + 8 bytes of ender) auto const footer_buffer = source->host_read(len - ender->footer_len - ender_len, ender->footer_len); - cudf::io::parquet::CompactProtocolReader cp(footer_buffer->data(), ender->footer_len); + cudf::io::parquet::detail::CompactProtocolReader cp(footer_buffer->data(), ender->footer_len); // returns true on success bool res = cp.read(file_meta_data); @@ -233,14 +234,14 @@ void read_footer(std::unique_ptr const& source, // this assumes the data is uncompressed. // throws cudf::logic_error if the page_loc data is invalid. int read_dict_bits(std::unique_ptr const& source, - cudf::io::parquet::PageLocation const& page_loc) + cudf::io::parquet::detail::PageLocation const& page_loc) { CUDF_EXPECTS(page_loc.offset > 0, "Cannot find page header"); CUDF_EXPECTS(page_loc.compressed_page_size > 0, "Invalid page header length"); - cudf::io::parquet::PageHeader page_hdr; + cudf::io::parquet::detail::PageHeader page_hdr; auto const page_buf = source->host_read(page_loc.offset, page_loc.compressed_page_size); - cudf::io::parquet::CompactProtocolReader cp(page_buf->data(), page_buf->size()); + cudf::io::parquet::detail::CompactProtocolReader cp(page_buf->data(), page_buf->size()); bool res = cp.read(&page_hdr); CUDF_EXPECTS(res, "Cannot parse page header"); @@ -252,15 +253,16 @@ int read_dict_bits(std::unique_ptr const& source, // read column index from datasource at location indicated by chunk, // parse and return as a ColumnIndex struct. // throws cudf::logic_error if the chunk data is invalid. -cudf::io::parquet::ColumnIndex read_column_index( - std::unique_ptr const& source, cudf::io::parquet::ColumnChunk const& chunk) +cudf::io::parquet::detail::ColumnIndex read_column_index( + std::unique_ptr const& source, + cudf::io::parquet::detail::ColumnChunk const& chunk) { CUDF_EXPECTS(chunk.column_index_offset > 0, "Cannot find column index"); CUDF_EXPECTS(chunk.column_index_length > 0, "Invalid column index length"); - cudf::io::parquet::ColumnIndex colidx; + cudf::io::parquet::detail::ColumnIndex colidx; auto const ci_buf = source->host_read(chunk.column_index_offset, chunk.column_index_length); - cudf::io::parquet::CompactProtocolReader cp(ci_buf->data(), ci_buf->size()); + cudf::io::parquet::detail::CompactProtocolReader cp(ci_buf->data(), ci_buf->size()); bool res = cp.read(&colidx); CUDF_EXPECTS(res, "Cannot parse column index"); return colidx; @@ -269,22 +271,24 @@ cudf::io::parquet::ColumnIndex read_column_index( // read offset index from datasource at location indicated by chunk, // parse and return as an OffsetIndex struct. // throws cudf::logic_error if the chunk data is invalid. -cudf::io::parquet::OffsetIndex read_offset_index( - std::unique_ptr const& source, cudf::io::parquet::ColumnChunk const& chunk) +cudf::io::parquet::detail::OffsetIndex read_offset_index( + std::unique_ptr const& source, + cudf::io::parquet::detail::ColumnChunk const& chunk) { CUDF_EXPECTS(chunk.offset_index_offset > 0, "Cannot find offset index"); CUDF_EXPECTS(chunk.offset_index_length > 0, "Invalid offset index length"); - cudf::io::parquet::OffsetIndex offidx; + cudf::io::parquet::detail::OffsetIndex offidx; auto const oi_buf = source->host_read(chunk.offset_index_offset, chunk.offset_index_length); - cudf::io::parquet::CompactProtocolReader cp(oi_buf->data(), oi_buf->size()); + cudf::io::parquet::detail::CompactProtocolReader cp(oi_buf->data(), oi_buf->size()); bool res = cp.read(&offidx); CUDF_EXPECTS(res, "Cannot parse offset index"); return offidx; } // Return as a Statistics from the column chunk -cudf::io::parquet::Statistics const& get_statistics(cudf::io::parquet::ColumnChunk const& chunk) +cudf::io::parquet::detail::Statistics const& get_statistics( + cudf::io::parquet::detail::ColumnChunk const& chunk) { return chunk.meta_data.statistics; } @@ -292,15 +296,16 @@ cudf::io::parquet::Statistics const& get_statistics(cudf::io::parquet::ColumnChu // read page header from datasource at location indicated by page_loc, // parse and return as a PageHeader struct. // throws cudf::logic_error if the page_loc data is invalid. -cudf::io::parquet::PageHeader read_page_header(std::unique_ptr const& source, - cudf::io::parquet::PageLocation const& page_loc) +cudf::io::parquet::detail::PageHeader read_page_header( + std::unique_ptr const& source, + cudf::io::parquet::detail::PageLocation const& page_loc) { CUDF_EXPECTS(page_loc.offset > 0, "Cannot find page header"); CUDF_EXPECTS(page_loc.compressed_page_size > 0, "Invalid page header length"); - cudf::io::parquet::PageHeader page_hdr; + cudf::io::parquet::detail::PageHeader page_hdr; auto const page_buf = source->host_read(page_loc.offset, page_loc.compressed_page_size); - cudf::io::parquet::CompactProtocolReader cp(page_buf->data(), page_buf->size()); + cudf::io::parquet::detail::CompactProtocolReader cp(page_buf->data(), page_buf->size()); bool res = cp.read(&page_hdr); CUDF_EXPECTS(res, "Cannot parse page header"); return page_hdr; @@ -3686,7 +3691,7 @@ TEST_F(ParquetWriterTest, CheckPageRows) // check first page header and make sure it has only page_rows values auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); ASSERT_GT(fmd.row_groups.size(), 0); @@ -3697,7 +3702,7 @@ TEST_F(ParquetWriterTest, CheckPageRows) // read first data page header. sizeof(PageHeader) is not exact, but the thrift encoded // version should be smaller than size of the struct. auto const ph = read_page_header( - source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::PageHeader), 0}); + source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::detail::PageHeader), 0}); EXPECT_EQ(ph.data_page_header.num_values, page_rows); } @@ -3722,7 +3727,7 @@ TEST_F(ParquetWriterTest, CheckPageRowsAdjusted) // check first page header and make sure it has only page_rows values auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); ASSERT_GT(fmd.row_groups.size(), 0); @@ -3733,7 +3738,7 @@ TEST_F(ParquetWriterTest, CheckPageRowsAdjusted) // read first data page header. sizeof(PageHeader) is not exact, but the thrift encoded // version should be smaller than size of the struct. auto const ph = read_page_header( - source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::PageHeader), 0}); + source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::detail::PageHeader), 0}); EXPECT_LE(ph.data_page_header.num_values, rows_per_page); } @@ -3759,7 +3764,7 @@ TEST_F(ParquetWriterTest, CheckPageRowsTooSmall) // check that file is written correctly when rows/page < fragment size auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); ASSERT_TRUE(fmd.row_groups.size() > 0); @@ -3770,7 +3775,7 @@ TEST_F(ParquetWriterTest, CheckPageRowsTooSmall) // read first data page header. sizeof(PageHeader) is not exact, but the thrift encoded // version should be smaller than size of the struct. auto const ph = read_page_header( - source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::PageHeader), 0}); + source, {first_chunk.data_page_offset, sizeof(cudf::io::parquet::detail::PageHeader), 0}); // there should be only one page since the fragment size is larger than rows_per_page EXPECT_EQ(ph.data_page_header.num_values, num_rows); @@ -3798,7 +3803,7 @@ TEST_F(ParquetWriterTest, Decimal128Stats) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4031,7 +4036,7 @@ TYPED_TEST(ParquetWriterComparableTypeTest, ThreeColumnSorted) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); ASSERT_GT(fmd.row_groups.size(), 0); @@ -4041,10 +4046,10 @@ TYPED_TEST(ParquetWriterComparableTypeTest, ThreeColumnSorted) // now check that the boundary order for chunk 1 is ascending, // chunk 2 is descending, and chunk 3 is unordered - cudf::io::parquet::BoundaryOrder expected_orders[] = { - cudf::io::parquet::BoundaryOrder::ASCENDING, - cudf::io::parquet::BoundaryOrder::DESCENDING, - cudf::io::parquet::BoundaryOrder::UNORDERED}; + cudf::io::parquet::detail::BoundaryOrder expected_orders[] = { + cudf::io::parquet::detail::BoundaryOrder::ASCENDING, + cudf::io::parquet::detail::BoundaryOrder::DESCENDING, + cudf::io::parquet::detail::BoundaryOrder::UNORDERED}; for (std::size_t i = 0; i < columns.size(); i++) { auto const ci = read_column_index(source, columns[i]); @@ -4067,15 +4072,15 @@ int32_t compare(T& v1, T& v2) // 1 if v1 > v2. int32_t compare_binary(std::vector const& v1, std::vector const& v2, - cudf::io::parquet::Type ptype, - cudf::io::parquet::ConvertedType ctype) + cudf::io::parquet::detail::Type ptype, + cudf::io::parquet::detail::ConvertedType ctype) { switch (ptype) { - case cudf::io::parquet::INT32: + case cudf::io::parquet::detail::INT32: switch (ctype) { - case cudf::io::parquet::UINT_8: - case cudf::io::parquet::UINT_16: - case cudf::io::parquet::UINT_32: + case cudf::io::parquet::detail::UINT_8: + case cudf::io::parquet::detail::UINT_16: + case cudf::io::parquet::detail::UINT_32: return compare(*(reinterpret_cast(v1.data())), *(reinterpret_cast(v2.data()))); default: @@ -4083,23 +4088,23 @@ int32_t compare_binary(std::vector const& v1, *(reinterpret_cast(v2.data()))); } - case cudf::io::parquet::INT64: - if (ctype == cudf::io::parquet::UINT_64) { + case cudf::io::parquet::detail::INT64: + if (ctype == cudf::io::parquet::detail::UINT_64) { return compare(*(reinterpret_cast(v1.data())), *(reinterpret_cast(v2.data()))); } return compare(*(reinterpret_cast(v1.data())), *(reinterpret_cast(v2.data()))); - case cudf::io::parquet::FLOAT: + case cudf::io::parquet::detail::FLOAT: return compare(*(reinterpret_cast(v1.data())), *(reinterpret_cast(v2.data()))); - case cudf::io::parquet::DOUBLE: + case cudf::io::parquet::detail::DOUBLE: return compare(*(reinterpret_cast(v1.data())), *(reinterpret_cast(v2.data()))); - case cudf::io::parquet::BYTE_ARRAY: { + case cudf::io::parquet::detail::BYTE_ARRAY: { int32_t v1sz = v1.size(); int32_t v2sz = v2.size(); int32_t ret = memcmp(v1.data(), v2.data(), std::min(v1sz, v2sz)); @@ -4142,7 +4147,7 @@ TEST_P(ParquetV2Test, LargeColumnIndex) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4164,10 +4169,10 @@ TEST_P(ParquetV2Test, LargeColumnIndex) TEST_P(ParquetV2Test, CheckColumnOffsetIndex) { - constexpr auto num_rows = 100000; - auto const is_v2 = GetParam(); - auto const expected_hdr_type = - is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + constexpr auto num_rows = 100000; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; // fixed length strings auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { @@ -4210,7 +4215,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndex) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4255,10 +4260,10 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndex) TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls) { - constexpr auto num_rows = 100000; - auto const is_v2 = GetParam(); - auto const expected_hdr_type = - is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + constexpr auto num_rows = 100000; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; // fixed length strings auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { @@ -4311,7 +4316,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4362,10 +4367,10 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls) TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn) { - constexpr auto num_rows = 100000; - auto const is_v2 = GetParam(); - auto const expected_hdr_type = - is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + constexpr auto num_rows = 100000; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; // fixed length strings auto str1_elements = cudf::detail::make_counting_transform_iterator(0, [](auto i) { @@ -4403,7 +4408,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4458,9 +4463,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn) TEST_P(ParquetV2Test, CheckColumnOffsetIndexStruct) { - auto const is_v2 = GetParam(); - auto const expected_hdr_type = - is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; auto c0 = testdata::ascending(); @@ -4495,7 +4500,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStruct) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4542,9 +4547,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStruct) TEST_P(ParquetV2Test, CheckColumnOffsetIndexStructNulls) { - auto const is_v2 = GetParam(); - auto const expected_hdr_type = - is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; auto validity2 = cudf::detail::make_counting_transform_iterator(0, [](cudf::size_type i) { return i % 2; }); @@ -4586,7 +4591,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStructNulls) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4616,9 +4621,9 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStructNulls) TEST_P(ParquetV2Test, CheckColumnIndexListWithNulls) { - auto const is_v2 = GetParam(); - auto const expected_hdr_type = - is_v2 ? cudf::io::parquet::PageType::DATA_PAGE_V2 : cudf::io::parquet::PageType::DATA_PAGE; + auto const is_v2 = GetParam(); + auto const expected_hdr_type = is_v2 ? cudf::io::parquet::detail::PageType::DATA_PAGE_V2 + : cudf::io::parquet::detail::PageType::DATA_PAGE; using cudf::test::iterators::null_at; using cudf::test::iterators::nulls_at; @@ -4711,7 +4716,7 @@ TEST_P(ParquetV2Test, CheckColumnIndexListWithNulls) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4812,7 +4817,7 @@ TEST_F(ParquetWriterTest, CheckColumnIndexTruncation) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -4870,7 +4875,7 @@ TEST_F(ParquetWriterTest, BinaryColumnIndexTruncation) cudf::io::write_parquet(out_opts); auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); @@ -5030,10 +5035,10 @@ TEST_F(ParquetReaderTest, NestedByteArray) cudf::io::write_parquet(out_opts); auto source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); - EXPECT_EQ(fmd.schema[5].type, cudf::io::parquet::Type::BYTE_ARRAY); + EXPECT_EQ(fmd.schema[5].type, cudf::io::parquet::detail::Type::BYTE_ARRAY); std::vector md{ {}, @@ -5081,12 +5086,12 @@ TEST_F(ParquetWriterTest, ByteArrayStats) auto result = cudf::io::read_parquet(in_opts); auto source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); - EXPECT_EQ(fmd.schema[1].type, cudf::io::parquet::Type::BYTE_ARRAY); - EXPECT_EQ(fmd.schema[2].type, cudf::io::parquet::Type::BYTE_ARRAY); + EXPECT_EQ(fmd.schema[1].type, cudf::io::parquet::detail::Type::BYTE_ARRAY); + EXPECT_EQ(fmd.schema[2].type, cudf::io::parquet::detail::Type::BYTE_ARRAY); auto const stats0 = get_statistics(fmd.row_groups[0].columns[0]); auto const stats1 = get_statistics(fmd.row_groups[0].columns[1]); @@ -5137,9 +5142,9 @@ TEST_F(ParquetReaderTest, StructByteArray) TEST_F(ParquetReaderTest, NestingOptimizationTest) { - // test nesting levels > cudf::io::parquet::gpu::max_cacheable_nesting_decode_info deep. + // test nesting levels > cudf::io::parquet::detail::max_cacheable_nesting_decode_info deep. constexpr cudf::size_type num_nesting_levels = 16; - static_assert(num_nesting_levels > cudf::io::parquet::gpu::max_cacheable_nesting_decode_info); + static_assert(num_nesting_levels > cudf::io::parquet::detail::max_cacheable_nesting_decode_info); constexpr cudf::size_type rows_per_level = 2; constexpr cudf::size_type num_values = (1 << num_nesting_levels) * rows_per_level; @@ -5206,13 +5211,13 @@ TEST_F(ParquetWriterTest, SingleValueDictionaryTest) // make sure dictionary was used auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); auto used_dict = [&fmd]() { for (auto enc : fmd.row_groups[0].columns[0].meta_data.encodings) { - if (enc == cudf::io::parquet::Encoding::PLAIN_DICTIONARY or - enc == cudf::io::parquet::Encoding::RLE_DICTIONARY) { + if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or + enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) { return true; } } @@ -5252,13 +5257,13 @@ TEST_F(ParquetWriterTest, DictionaryNeverTest) // make sure dictionary was not used auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); auto used_dict = [&fmd]() { for (auto enc : fmd.row_groups[0].columns[0].meta_data.encodings) { - if (enc == cudf::io::parquet::Encoding::PLAIN_DICTIONARY or - enc == cudf::io::parquet::Encoding::RLE_DICTIONARY) { + if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or + enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) { return true; } } @@ -5303,13 +5308,13 @@ TEST_F(ParquetWriterTest, DictionaryAdaptiveTest) // make sure dictionary was used as expected. col0 should use one, // col1 should not. auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); auto used_dict = [&fmd](int col) { for (auto enc : fmd.row_groups[0].columns[col].meta_data.encodings) { - if (enc == cudf::io::parquet::Encoding::PLAIN_DICTIONARY or - enc == cudf::io::parquet::Encoding::RLE_DICTIONARY) { + if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or + enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) { return true; } } @@ -5354,13 +5359,13 @@ TEST_F(ParquetWriterTest, DictionaryAlwaysTest) // make sure dictionary was used for both columns auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); auto used_dict = [&fmd](int col) { for (auto enc : fmd.row_groups[0].columns[col].meta_data.encodings) { - if (enc == cudf::io::parquet::Encoding::PLAIN_DICTIONARY or - enc == cudf::io::parquet::Encoding::RLE_DICTIONARY) { + if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or + enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) { return true; } } @@ -5438,13 +5443,13 @@ TEST_P(ParquetSizedTest, DictionaryTest) // make sure dictionary was used auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); auto used_dict = [&fmd]() { for (auto enc : fmd.row_groups[0].columns[0].meta_data.encodings) { - if (enc == cudf::io::parquet::Encoding::PLAIN_DICTIONARY or - enc == cudf::io::parquet::Encoding::RLE_DICTIONARY) { + if (enc == cudf::io::parquet::detail::Encoding::PLAIN_DICTIONARY or + enc == cudf::io::parquet::detail::Encoding::RLE_DICTIONARY) { return true; } } @@ -6664,7 +6669,7 @@ TEST_F(ParquetWriterTest, PreserveNullability) TEST_P(ParquetV2Test, CheckEncodings) { - using cudf::io::parquet::Encoding; + using cudf::io::parquet::detail::Encoding; constexpr auto num_rows = 100'000; auto const is_v2 = GetParam(); @@ -6697,7 +6702,7 @@ TEST_P(ParquetV2Test, CheckEncodings) }; auto const source = cudf::io::datasource::create(filepath); - cudf::io::parquet::FileMetaData fmd; + cudf::io::parquet::detail::FileMetaData fmd; read_footer(source, &fmd); auto const& chunk0_enc = fmd.row_groups[0].columns[0].meta_data.encodings; From cb74b7e1c9307eb262ece26a626763d504a9d577 Mon Sep 17 00:00:00 2001 From: db Date: Fri, 6 Oct 2023 11:16:52 -0500 Subject: [PATCH 2/6] Remove reader_impl_chunking.cu, which was accidentally included. --- cpp/src/io/parquet/reader_impl_chunking.cu | 597 --------------------- 1 file changed, 597 deletions(-) delete mode 100644 cpp/src/io/parquet/reader_impl_chunking.cu diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu deleted file mode 100644 index 2c1521e46db..00000000000 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ /dev/null @@ -1,597 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "reader_impl.hpp" - -#include -#include - -#include - -#include - -#include -#include -#include -#include - -namespace cudf::io::parquet::detail { - -namespace { - -struct cumulative_row_info { - size_t row_count; // cumulative row count - size_t size_bytes; // cumulative size in bytes - int key; // schema index -}; - -#if defined(PREPROCESS_DEBUG) -void print_cumulative_page_info(cudf::detail::hostdevice_vector& pages, - rmm::device_uvector const& page_index, - rmm::device_uvector const& c_info, - rmm::cuda_stream_view stream) -{ - pages.device_to_host_sync(stream); - - printf("------------\nCumulative sizes by page\n"); - - std::vector schemas(pages.size()); - std::vector h_page_index(pages.size()); - CUDF_CUDA_TRY(cudaMemcpy( - h_page_index.data(), page_index.data(), sizeof(int) * pages.size(), cudaMemcpyDefault)); - std::vector h_cinfo(pages.size()); - CUDF_CUDA_TRY(cudaMemcpy( - h_cinfo.data(), c_info.data(), sizeof(cumulative_row_info) * pages.size(), cudaMemcpyDefault)); - auto schema_iter = cudf::detail::make_counting_transform_iterator( - 0, [&](size_type i) { return pages[h_page_index[i]].src_col_schema; }); - thrust::copy(thrust::seq, schema_iter, schema_iter + pages.size(), schemas.begin()); - auto last = thrust::unique(thrust::seq, schemas.begin(), schemas.end()); - schemas.resize(last - schemas.begin()); - printf("Num schemas: %lu\n", schemas.size()); - - for (size_t idx = 0; idx < schemas.size(); idx++) { - printf("Schema %d\n", schemas[idx]); - for (size_t pidx = 0; pidx < pages.size(); pidx++) { - auto const& page = pages[h_page_index[pidx]]; - if (page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY || page.src_col_schema != schemas[idx]) { - continue; - } - printf("\tP: {%lu, %lu}\n", h_cinfo[pidx].row_count, h_cinfo[pidx].size_bytes); - } - } -} - -void print_cumulative_row_info( - host_span sizes, - std::string const& label, - std::optional> splits = std::nullopt) -{ - if (splits.has_value()) { - printf("------------\nSplits\n"); - for (size_t idx = 0; idx < splits->size(); idx++) { - printf("{%lu, %lu}\n", splits.value()[idx].skip_rows, splits.value()[idx].num_rows); - } - } - - printf("------------\nCumulative sizes %s\n", label.c_str()); - for (size_t idx = 0; idx < sizes.size(); idx++) { - printf("{%lu, %lu, %d}", sizes[idx].row_count, sizes[idx].size_bytes, sizes[idx].key); - if (splits.has_value()) { - // if we have a split at this row count and this is the last instance of this row count - auto start = thrust::make_transform_iterator( - splits->begin(), [](gpu::chunk_read_info const& i) { return i.skip_rows; }); - auto end = start + splits->size(); - auto split = std::find(start, end, sizes[idx].row_count); - auto const split_index = [&]() -> int { - if (split != end && - ((idx == sizes.size() - 1) || (sizes[idx + 1].row_count > sizes[idx].row_count))) { - return static_cast(std::distance(start, split)); - } - return idx == 0 ? 0 : -1; - }(); - if (split_index >= 0) { - printf(" <-- split {%lu, %lu}", - splits.value()[split_index].skip_rows, - splits.value()[split_index].num_rows); - } - } - printf("\n"); - } -} -#endif // PREPROCESS_DEBUG - -/** - * @brief Functor which reduces two cumulative_row_info structs of the same key. - */ -struct cumulative_row_sum { - cumulative_row_info operator() - __device__(cumulative_row_info const& a, cumulative_row_info const& b) const - { - return cumulative_row_info{a.row_count + b.row_count, a.size_bytes + b.size_bytes, a.key}; - } -}; - -/** - * @brief Functor which computes the total data size for a given type of cudf column. - * - * In the case of strings, the return size does not include the chars themselves. That - * information is tracked separately (see PageInfo::str_bytes). - */ -struct row_size_functor { - __device__ size_t validity_size(size_t num_rows, bool nullable) - { - return nullable ? (cudf::util::div_rounding_up_safe(num_rows, size_t{32}) * 4) : 0; - } - - template - __device__ size_t operator()(size_t num_rows, bool nullable) - { - auto const element_size = sizeof(device_storage_type_t); - return (element_size * num_rows) + validity_size(num_rows, nullable); - } -}; - -template <> -__device__ size_t row_size_functor::operator()(size_t num_rows, bool nullable) -{ - auto const offset_size = sizeof(size_type); - // NOTE: Adding the + 1 offset here isn't strictly correct. There will only be 1 extra offset - // for the entire column, whereas this is adding an extra offset per page. So we will get a - // small over-estimate of the real size of the order : # of pages * 4 bytes. It seems better - // to overestimate size somewhat than to underestimate it and potentially generate chunks - // that are too large. - return (offset_size * (num_rows + 1)) + validity_size(num_rows, nullable); -} - -template <> -__device__ size_t row_size_functor::operator()(size_t num_rows, bool nullable) -{ - return validity_size(num_rows, nullable); -} - -template <> -__device__ size_t row_size_functor::operator()(size_t num_rows, bool nullable) -{ - // only returns the size of offsets and validity. the size of the actual string chars - // is tracked separately. - auto const offset_size = sizeof(size_type); - // see note about offsets in the list_view template. - return (offset_size * (num_rows + 1)) + validity_size(num_rows, nullable); -} - -/** - * @brief Functor which computes the total output cudf data size for all of - * the data in this page. - * - * Sums across all nesting levels. - */ -struct get_cumulative_row_info { - gpu::PageInfo const* const pages; - - __device__ cumulative_row_info operator()(size_type index) - { - auto const& page = pages[index]; - if (page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) { - return cumulative_row_info{0, 0, page.src_col_schema}; - } - - // total nested size, not counting string data - auto iter = - cudf::detail::make_counting_transform_iterator(0, [page, index] __device__(size_type i) { - auto const& pni = page.nesting[i]; - return cudf::type_dispatcher( - data_type{pni.type}, row_size_functor{}, pni.size, pni.nullable); - }); - - size_t const row_count = static_cast(page.nesting[0].size); - return { - row_count, - thrust::reduce(thrust::seq, iter, iter + page.num_output_nesting_levels) + page.str_bytes, - page.src_col_schema}; - } -}; - -/** - * @brief Functor which computes the effective size of all input columns by page. - * - * For a given row, we want to find the cost of all pages for all columns involved - * in loading up to that row. The complication here is that not all pages are the - * same size between columns. Example: - * - * page row counts - * Column A: 0 <----> 100 <----> 200 - * Column B: 0 <---------------> 200 <--------> 400 - | - * if we decide to split at row 100, we don't really know the actual amount of bytes in column B - * at that point. So we have to proceed as if we are taking the bytes from all 200 rows of that - * page. Essentially, a conservative over-estimate of the real size. - */ -struct row_total_size { - cumulative_row_info const* c_info; - size_type const* key_offsets; - size_t num_keys; - - __device__ cumulative_row_info operator()(cumulative_row_info const& i) - { - // sum sizes for each input column at this row - size_t sum = 0; - for (int idx = 0; idx < num_keys; idx++) { - auto const start = key_offsets[idx]; - auto const end = key_offsets[idx + 1]; - auto iter = cudf::detail::make_counting_transform_iterator( - 0, [&] __device__(size_type i) { return c_info[i].row_count; }); - auto const page_index = - thrust::lower_bound(thrust::seq, iter + start, iter + end, i.row_count) - iter; - sum += c_info[page_index].size_bytes; - } - return {i.row_count, sum, i.key}; - } -}; - -/** - * @brief Given a vector of cumulative {row_count, byte_size} pairs and a chunk read - * limit, determine the set of splits. - * - * @param sizes Vector of cumulative {row_count, byte_size} pairs - * @param num_rows Total number of rows to read - * @param chunk_read_limit Limit on total number of bytes to be returned per read, for all columns - */ -std::vector find_splits(std::vector const& sizes, - size_t num_rows, - size_t chunk_read_limit) -{ - // now we have an array of {row_count, real output bytes}. just walk through it and generate - // splits. - // TODO: come up with a clever way to do this entirely in parallel. For now, as long as batch - // sizes are reasonably large, this shouldn't iterate too many times - std::vector splits; - { - size_t cur_pos = 0; - size_t cur_cumulative_size = 0; - size_t cur_row_count = 0; - auto start = thrust::make_transform_iterator(sizes.begin(), [&](cumulative_row_info const& i) { - return i.size_bytes - cur_cumulative_size; - }); - auto end = start + sizes.size(); - while (cur_row_count < num_rows) { - int64_t split_pos = - thrust::lower_bound(thrust::seq, start + cur_pos, end, chunk_read_limit) - start; - - // if we're past the end, or if the returned bucket is > than the chunk_read_limit, move back - // one. - if (static_cast(split_pos) >= sizes.size() || - (sizes[split_pos].size_bytes - cur_cumulative_size > chunk_read_limit)) { - split_pos--; - } - - // best-try. if we can't find something that'll fit, we have to go bigger. we're doing this in - // a loop because all of the cumulative sizes for all the pages are sorted into one big list. - // so if we had two columns, both of which had an entry {1000, 10000}, that entry would be in - // the list twice. so we have to iterate until we skip past all of them. The idea is that we - // either do this, or we have to call unique() on the input first. - while (split_pos < (static_cast(sizes.size()) - 1) && - (split_pos < 0 || sizes[split_pos].row_count == cur_row_count)) { - split_pos++; - } - - auto const start_row = cur_row_count; - cur_row_count = sizes[split_pos].row_count; - splits.push_back(gpu::chunk_read_info{start_row, cur_row_count - start_row}); - cur_pos = split_pos; - cur_cumulative_size = sizes[split_pos].size_bytes; - } - } - // print_cumulative_row_info(sizes, "adjusted", splits); - - return splits; -} - -/** - * @brief Converts cuDF units to Parquet units. - * - * @return A tuple of Parquet type width, Parquet clock rate and Parquet decimal type. - */ -[[nodiscard]] std::tuple conversion_info(type_id column_type_id, - type_id timestamp_type_id, - parquet::Type physical, - int8_t converted, - int32_t length) -{ - int32_t type_width = (physical == parquet::FIXED_LEN_BYTE_ARRAY) ? length : 0; - int32_t clock_rate = 0; - if (column_type_id == type_id::INT8 or column_type_id == type_id::UINT8) { - type_width = 1; // I32 -> I8 - } else if (column_type_id == type_id::INT16 or column_type_id == type_id::UINT16) { - type_width = 2; // I32 -> I16 - } else if (column_type_id == type_id::INT32) { - type_width = 4; // str -> hash32 - } else if (is_chrono(data_type{column_type_id})) { - clock_rate = to_clockrate(timestamp_type_id); - } - - int8_t converted_type = converted; - if (converted_type == parquet::DECIMAL && column_type_id != type_id::FLOAT64 && - not cudf::is_fixed_point(data_type{column_type_id})) { - converted_type = parquet::UNKNOWN; // Not converting to float64 or decimal - } - return std::make_tuple(type_width, clock_rate, converted_type); -} - -/** - * @brief Return the required number of bits to store a value. - */ -template -[[nodiscard]] T required_bits(uint32_t max_level) -{ - return static_cast(CompactProtocolReader::NumRequiredBits(max_level)); -} - -struct row_count_compare { - __device__ bool operator()(cumulative_row_info const& a, cumulative_row_info const& b) - { - return a.row_count < b.row_count; - } -}; - -} // anonymous namespace - -void reader::impl::create_global_chunk_info() -{ - auto const num_rows = _file_itm_data.global_num_rows; - auto const& row_groups_info = _file_itm_data.row_groups; - auto& chunks = _file_itm_data.chunks; - - // Descriptors for all the chunks that make up the selected columns - auto const num_input_columns = _input_columns.size(); - auto const num_chunks = row_groups_info.size() * num_input_columns; - - // Initialize column chunk information - auto remaining_rows = num_rows; - for (auto const& rg : row_groups_info) { - auto const& row_group = _metadata->get_row_group(rg.index, rg.source_index); - auto const row_group_start = rg.start_row; - auto const row_group_rows = std::min(remaining_rows, row_group.num_rows); - - // generate ColumnChunkDesc objects for everything to be decoded (all input columns) - for (size_t i = 0; i < num_input_columns; ++i) { - auto col = _input_columns[i]; - // look up metadata - auto& col_meta = _metadata->get_column_metadata(rg.index, rg.source_index, col.schema_idx); - auto& schema = _metadata->get_schema(col.schema_idx); - - auto [type_width, clock_rate, converted_type] = - conversion_info(to_type_id(schema, _strings_to_categorical, _timestamp_type.id()), - _timestamp_type.id(), - schema.type, - schema.converted_type, - schema.type_length); - - chunks.push_back(gpu::ColumnChunkDesc(col_meta.total_compressed_size, - nullptr, - col_meta.num_values, - schema.type, - type_width, - row_group_start, - row_group_rows, - schema.max_definition_level, - schema.max_repetition_level, - _metadata->get_output_nesting_depth(col.schema_idx), - required_bits(schema.max_definition_level), - required_bits(schema.max_repetition_level), - col_meta.codec, - converted_type, - schema.logical_type, - schema.decimal_precision, - clock_rate, - i, - col.schema_idx)); - } - - remaining_rows -= row_group_rows; - } -} - -void reader::impl::compute_input_passes() -{ - // at this point, row_groups has already been filtered down to just the row groups we need to - // handle optional skip_rows/num_rows parameters. - auto const& row_groups_info = _file_itm_data.row_groups; - - // if the user hasn't specified an input size limit, read everything in a single pass. - if (_input_pass_read_limit == 0) { - _file_itm_data.input_pass_row_group_offsets.push_back(0); - _file_itm_data.input_pass_row_group_offsets.push_back(row_groups_info.size()); - return; - } - - // generate passes. make sure to account for the case where a single row group doesn't fit within - // - std::size_t const read_limit = - _input_pass_read_limit > 0 ? _input_pass_read_limit : std::numeric_limits::max(); - std::size_t cur_pass_byte_size = 0; - std::size_t cur_rg_start = 0; - std::size_t cur_row_count = 0; - _file_itm_data.input_pass_row_group_offsets.push_back(0); - _file_itm_data.input_pass_row_count.push_back(0); - - for (size_t cur_rg_index = 0; cur_rg_index < row_groups_info.size(); cur_rg_index++) { - auto const& rgi = row_groups_info[cur_rg_index]; - auto const& row_group = _metadata->get_row_group(rgi.index, rgi.source_index); - - // can we add this row group - if (cur_pass_byte_size + row_group.total_byte_size >= read_limit) { - // A single row group (the current one) is larger than the read limit: - // We always need to include at least one row group, so end the pass at the end of the current - // row group - if (cur_rg_start == cur_rg_index) { - _file_itm_data.input_pass_row_group_offsets.push_back(cur_rg_index + 1); - _file_itm_data.input_pass_row_count.push_back(cur_row_count + row_group.num_rows); - cur_rg_start = cur_rg_index + 1; - cur_pass_byte_size = 0; - } - // End the pass at the end of the previous row group - else { - _file_itm_data.input_pass_row_group_offsets.push_back(cur_rg_index); - _file_itm_data.input_pass_row_count.push_back(cur_row_count); - cur_rg_start = cur_rg_index; - cur_pass_byte_size = row_group.total_byte_size; - } - } else { - cur_pass_byte_size += row_group.total_byte_size; - } - cur_row_count += row_group.num_rows; - } - // add the last pass if necessary - if (_file_itm_data.input_pass_row_group_offsets.back() != row_groups_info.size()) { - _file_itm_data.input_pass_row_group_offsets.push_back(row_groups_info.size()); - _file_itm_data.input_pass_row_count.push_back(cur_row_count); - } -} - -void reader::impl::setup_next_pass() -{ - // this will also cause the previous pass information to be deleted - _pass_itm_data = std::make_unique(); - - // setup row groups to be loaded for this pass - auto const row_group_start = _file_itm_data.input_pass_row_group_offsets[_current_input_pass]; - auto const row_group_end = _file_itm_data.input_pass_row_group_offsets[_current_input_pass + 1]; - auto const num_row_groups = row_group_end - row_group_start; - _pass_itm_data->row_groups.resize(num_row_groups); - std::copy(_file_itm_data.row_groups.begin() + row_group_start, - _file_itm_data.row_groups.begin() + row_group_end, - _pass_itm_data->row_groups.begin()); - - auto const num_passes = _file_itm_data.input_pass_row_group_offsets.size() - 1; - CUDF_EXPECTS(_current_input_pass < num_passes, "Encountered an invalid read pass index"); - - auto const chunks_per_rowgroup = _input_columns.size(); - auto const num_chunks = chunks_per_rowgroup * num_row_groups; - - auto chunk_start = _file_itm_data.chunks.begin() + (row_group_start * chunks_per_rowgroup); - auto chunk_end = _file_itm_data.chunks.begin() + (row_group_end * chunks_per_rowgroup); - - _pass_itm_data->chunks = - cudf::detail::hostdevice_vector(num_chunks, _stream); - std::copy(chunk_start, chunk_end, _pass_itm_data->chunks.begin()); - - // adjust skip_rows and num_rows by what's available in the row groups we are processing - if (num_passes == 1) { - _pass_itm_data->skip_rows = _file_itm_data.global_skip_rows; - _pass_itm_data->num_rows = _file_itm_data.global_num_rows; - } else { - auto const global_start_row = _file_itm_data.global_skip_rows; - auto const global_end_row = global_start_row + _file_itm_data.global_num_rows; - auto const start_row = std::max(_file_itm_data.input_pass_row_count[_current_input_pass], global_start_row); - auto const end_row = std::min(_file_itm_data.input_pass_row_count[_current_input_pass + 1], global_end_row); - - // skip_rows is always global in the sense that it is relative to the first row of - // everything we will be reading, regardless of what pass we are on. - // num_rows is how many rows we are reading this pass. - _pass_itm_data->skip_rows = global_start_row + _file_itm_data.input_pass_row_count[_current_input_pass]; - _pass_itm_data->num_rows = end_row - start_row; - } -} - -void reader::impl::compute_splits_for_pass() -{ - auto const skip_rows = _pass_itm_data->skip_rows; - auto const num_rows = _pass_itm_data->num_rows; - - // simple case : no chunk size, no splits - if(_output_chunk_read_limit <= 0){ - _pass_itm_data->output_chunk_read_info = std::vector{{skip_rows, num_rows}}; - return; - } - - auto& pages = _pass_itm_data->pages_info; - - auto const& page_keys = _pass_itm_data->page_keys; - auto const& page_index = _pass_itm_data->page_index; - - // generate cumulative row counts and sizes - rmm::device_uvector c_info(page_keys.size(), _stream); - // convert PageInfo to cumulative_row_info - auto page_input = thrust::make_transform_iterator(page_index.begin(), - get_cumulative_row_info{pages.device_ptr()}); - thrust::inclusive_scan_by_key(rmm::exec_policy(_stream), - page_keys.begin(), - page_keys.end(), - page_input, - c_info.begin(), - thrust::equal_to{}, - cumulative_row_sum{}); - // print_cumulative_page_info(pages, page_index, c_info, stream); - - // sort by row count - rmm::device_uvector c_info_sorted{c_info, _stream}; - thrust::sort(rmm::exec_policy(_stream), - c_info_sorted.begin(), - c_info_sorted.end(), - row_count_compare{}); - - // std::vector h_c_info_sorted(c_info_sorted.size()); - // CUDF_CUDA_TRY(cudaMemcpy(h_c_info_sorted.data(), - // c_info_sorted.data(), - // sizeof(cumulative_row_info) * c_info_sorted.size(), - // cudaMemcpyDefault)); - // print_cumulative_row_info(h_c_info_sorted, "raw"); - - // generate key offsets (offsets to the start of each partition of keys). worst case is 1 page per - // key - rmm::device_uvector key_offsets(page_keys.size() + 1, _stream); - auto const key_offsets_end = thrust::reduce_by_key(rmm::exec_policy(_stream), - page_keys.begin(), - page_keys.end(), - thrust::make_constant_iterator(1), - thrust::make_discard_iterator(), - key_offsets.begin()) - .second; - size_t const num_unique_keys = key_offsets_end - key_offsets.begin(); - thrust::exclusive_scan( - rmm::exec_policy(_stream), key_offsets.begin(), key_offsets.end(), key_offsets.begin()); - - // adjust the cumulative info such that for each row count, the size includes any pages that span - // that row count. this is so that if we have this case: - // page row counts - // Column A: 0 <----> 100 <----> 200 - // Column B: 0 <---------------> 200 <--------> 400 - // | - // if we decide to split at row 100, we don't really know the actual amount of bytes in column B - // at that point. So we have to proceed as if we are taking the bytes from all 200 rows of that - // page. - // - rmm::device_uvector aggregated_info(c_info.size(), _stream); - thrust::transform(rmm::exec_policy(_stream), - c_info_sorted.begin(), - c_info_sorted.end(), - aggregated_info.begin(), - row_total_size{c_info.data(), key_offsets.data(), num_unique_keys}); - - // bring back to the cpu - std::vector h_aggregated_info(aggregated_info.size()); - CUDF_CUDA_TRY(cudaMemcpyAsync(h_aggregated_info.data(), - aggregated_info.data(), - sizeof(cumulative_row_info) * c_info.size(), - cudaMemcpyDefault, - _stream.value())); - _stream.synchronize(); - - // generate the actual splits - _pass_itm_data->output_chunk_read_info = find_splits(h_aggregated_info, num_rows, _output_chunk_read_limit); -} - -} // namespace cudf::io::parquet::detail From 227e1f08533ac0bab256d3a5f26a42e9fc0db11f Mon Sep 17 00:00:00 2001 From: db Date: Fri, 6 Oct 2023 16:37:03 -0500 Subject: [PATCH 3/6] Centralize all pass/chunk related code into reader_impl_chunking.cu --- cpp/CMakeLists.txt | 1 + cpp/src/io/parquet/parquet_gpu.hpp | 73 --- cpp/src/io/parquet/reader_impl.cpp | 12 +- cpp/src/io/parquet/reader_impl.hpp | 61 +- cpp/src/io/parquet/reader_impl_chunking.cu | 598 +++++++++++++++++++ cpp/src/io/parquet/reader_impl_chunking.hpp | 87 +++ cpp/src/io/parquet/reader_impl_helpers.hpp | 17 + cpp/src/io/parquet/reader_impl_preprocess.cu | 558 +---------------- 8 files changed, 755 insertions(+), 652 deletions(-) create mode 100644 cpp/src/io/parquet/reader_impl_chunking.cu create mode 100644 cpp/src/io/parquet/reader_impl_chunking.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 000f80065ab..f8b9762f1d4 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -401,6 +401,7 @@ add_library( src/io/parquet/predicate_pushdown.cpp src/io/parquet/reader.cpp src/io/parquet/reader_impl.cpp + src/io/parquet/reader_impl_chunking.cu src/io/parquet/reader_impl_helpers.cpp src/io/parquet/reader_impl_preprocess.cu src/io/parquet/writer_impl.cu diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 767668cc65e..6a93fec0c46 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -318,79 +318,6 @@ struct ColumnChunkDesc { int32_t src_col_schema{}; // my schema index in the file }; -/** - * @brief The row_group_info class - */ -struct row_group_info { - size_type index; // row group index within a file. aggregate_reader_metadata::get_row_group() is - // called with index and source_index - size_t start_row; - size_type source_index; // file index. - - row_group_info() = default; - - row_group_info(size_type index, size_t start_row, size_type source_index) - : index{index}, start_row{start_row}, source_index{source_index} - { - } -}; - -/** - * @brief Struct to store file-level data that remains constant for - * all passes/chunks for the file. - */ -struct file_intermediate_data { - // all row groups to read - std::vector row_groups{}; - - // all chunks from the selected row groups. We may end up reading these chunks progressively - // instead of all at once - std::vector chunks{}; - - // skip_rows/num_rows values for the entire file. these need to be adjusted per-pass because we - // may not be visiting every row group that contains these bounds - size_t global_skip_rows; - size_t global_num_rows; -}; - -/** - * @brief Structs to identify the reading row range for each chunk of rows in chunked reading. - */ -struct chunk_read_info { - size_t skip_rows; - size_t num_rows; -}; - -/** - * @brief Struct to store pass-level data that remains constant for a single pass. - */ -struct pass_intermediate_data { - std::vector> raw_page_data; - rmm::device_buffer decomp_page_data; - - // rowgroup, chunk and page information for the current pass. - std::vector row_groups{}; - cudf::detail::hostdevice_vector chunks{}; - cudf::detail::hostdevice_vector pages_info{}; - cudf::detail::hostdevice_vector page_nesting_info{}; - cudf::detail::hostdevice_vector page_nesting_decode_info{}; - - rmm::device_uvector page_keys{0, rmm::cuda_stream_default}; - rmm::device_uvector page_index{0, rmm::cuda_stream_default}; - rmm::device_uvector str_dict_index{0, rmm::cuda_stream_default}; - - std::vector output_chunk_read_info; - std::size_t current_output_chunk{0}; - - rmm::device_buffer level_decode_data{}; - int level_type_size{0}; - - // skip_rows and num_rows values for this particular pass. these may be adjusted values from the - // global values stored in file_intermediate_data. - size_t skip_rows; - size_t num_rows; -}; - /** * @brief Struct describing an encoder column */ diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 34aa4f2201f..44f9c160c25 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -349,14 +349,14 @@ void reader::impl::prepare_data(int64_t skip_rows, not _input_columns.empty()) { // fills in chunk information without physically loading or decompressing // the associated data - load_global_chunk_info(); + create_global_chunk_info(); // compute schedule of input reads. Each rowgroup contains 1 chunk per column. For now // we will read an entire row group at a time. However, it is possible to do // sub-rowgroup reads if we made some estimates on individual chunk sizes (tricky) and // changed the high level structure such that we weren't always reading an entire table's // worth of columns at once. - compute_input_pass_row_group_info(); + compute_input_passes(); } _file_preprocessed = true; @@ -364,7 +364,7 @@ void reader::impl::prepare_data(int64_t skip_rows, // if we have to start a new pass, do that now if (!_pass_preprocessed) { - auto const num_passes = _input_pass_row_group_offsets.size() - 1; + auto const num_passes = _file_itm_data.input_pass_row_group_offsets.size() - 1; // always create the pass struct, even if we end up with no passes. // this will also cause the previous pass information to be deleted @@ -373,7 +373,7 @@ void reader::impl::prepare_data(int64_t skip_rows, if (_file_itm_data.global_num_rows > 0 && not _file_itm_data.row_groups.empty() && not _input_columns.empty() && _current_input_pass < num_passes) { // setup the pass_intermediate_info for this pass. - setup_pass(); + setup_next_pass(); load_and_decompress_data(); preprocess_pages(uses_custom_row_bounds, _output_chunk_read_limit); @@ -541,8 +541,8 @@ bool reader::impl::has_next() {} /*row_group_indices, empty means read all row groups*/, std::nullopt /*filter*/); - auto const num_input_passes = - _input_pass_row_group_offsets.size() == 0 ? 0 : _input_pass_row_group_offsets.size() - 1; + size_t const num_input_passes = std::max( + int64_t{0}, static_cast(_file_itm_data.input_pass_row_group_offsets.size()) - 1); return (_pass_itm_data->current_output_chunk < _pass_itm_data->output_chunk_read_info.size()) || (_current_input_pass < num_input_passes); } diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index 03990f1a1f3..22217b55411 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -22,6 +22,7 @@ #pragma once #include "parquet_gpu.hpp" +#include "reader_impl_chunking.hpp" #include "reader_impl_helpers.hpp" #include @@ -136,10 +137,6 @@ class reader::impl { host_span const> row_group_indices, std::optional> filter); - void load_global_chunk_info(); - void compute_input_pass_row_group_info(); - void setup_pass(); - /** * @brief Create chunk information and start file reads * @@ -250,6 +247,39 @@ class reader::impl { */ void decode_page_data(size_t skip_rows, size_t num_rows); + /* + * + * + Functions related to computing chunks and passes (reader_impl_chunking.cu) + * + * + */ + + /** + * @brief Creates file-wide parquet chunk information. + * + * Creates information about all chunks in the file, storing it in + * the file-wide _file_itm_data structure. + */ + void create_global_chunk_info(); + + /** + * @brief Computes all of the passes we will perform over the file. + */ + void compute_input_passes(); + + /** + * @brief Close out the existing pass (if any) and prepare for the next pass. + */ + void setup_next_pass(); + + /** + * @brief Given a set of pages that have had their sizes computed by nesting level and + * a limit on total read size, generate a set of {skip_rows, num_rows} pairs representing + * a set of reads that will generate output columns of total size <= `chunk_read_limit` bytes. + */ + void compute_splits_for_pass(); + private: rmm::cuda_stream_view _stream; rmm::mr::device_memory_resource* _mr = nullptr; @@ -278,27 +308,24 @@ class reader::impl { // chunked reading happens in 2 parts: // - // At the top level there is the "pass" in which we try and limit the + // At the top level, the entire file is divided up into "passes" omn which we try and limit the // total amount of temporary memory (compressed data, decompressed data) in use // via _input_pass_read_limit. // // Within a pass, we produce one or more chunks of output, whose maximum total // byte size is controlled by _output_chunk_read_limit. - cudf::io::parquet::detail::file_intermediate_data _file_itm_data; - std::unique_ptr _pass_itm_data; - - // an array of offsets into _file_itm_data::global_chunks. Each pair of offsets represents - // the start/end of the chunks to be loaded for a given pass. - std::vector _input_pass_row_group_offsets{}; - std::vector _input_pass_row_count{}; - std::size_t _current_input_pass{0}; - std::size_t _chunk_count{0}; + file_intermediate_data _file_itm_data; + bool _file_preprocessed{false}; - std::size_t _output_chunk_read_limit{0}; - std::size_t _input_pass_read_limit{0}; + std::unique_ptr _pass_itm_data; bool _pass_preprocessed{false}; - bool _file_preprocessed{false}; + + std::size_t _output_chunk_read_limit{0}; // output chunk size limit in bytes + std::size_t _input_pass_read_limit{0}; // input pass memory usage limit in bytes + + std::size_t _current_input_pass{0}; // current input pass index + std::size_t _chunk_count{0}; // how many output chunks we have produced }; } // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu new file mode 100644 index 00000000000..5d5b152e1aa --- /dev/null +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -0,0 +1,598 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "reader_impl.hpp" +#include "reader_impl_chunking.hpp" + +#include +#include + +#include + +#include + +#include +#include +#include +#include + +namespace cudf::io::parquet::detail { + +namespace { + +struct cumulative_row_info { + size_t row_count; // cumulative row count + size_t size_bytes; // cumulative size in bytes + int key; // schema index +}; + +#if defined(CHUNKING_DEBUG) +void print_cumulative_page_info(cudf::detail::hostdevice_vector& pages, + rmm::device_uvector const& page_index, + rmm::device_uvector const& c_info, + rmm::cuda_stream_view stream) +{ + pages.device_to_host_sync(stream); + + printf("------------\nCumulative sizes by page\n"); + + std::vector schemas(pages.size()); + std::vector h_page_index(pages.size()); + CUDF_CUDA_TRY(cudaMemcpy( + h_page_index.data(), page_index.data(), sizeof(int) * pages.size(), cudaMemcpyDefault)); + std::vector h_cinfo(pages.size()); + CUDF_CUDA_TRY(cudaMemcpy( + h_cinfo.data(), c_info.data(), sizeof(cumulative_row_info) * pages.size(), cudaMemcpyDefault)); + auto schema_iter = cudf::detail::make_counting_transform_iterator( + 0, [&](size_type i) { return pages[h_page_index[i]].src_col_schema; }); + thrust::copy(thrust::seq, schema_iter, schema_iter + pages.size(), schemas.begin()); + auto last = thrust::unique(thrust::seq, schemas.begin(), schemas.end()); + schemas.resize(last - schemas.begin()); + printf("Num schemas: %lu\n", schemas.size()); + + for (size_t idx = 0; idx < schemas.size(); idx++) { + printf("Schema %d\n", schemas[idx]); + for (size_t pidx = 0; pidx < pages.size(); pidx++) { + auto const& page = pages[h_page_index[pidx]]; + if (page.flags & PAGEINFO_FLAGS_DICTIONARY || page.src_col_schema != schemas[idx]) { + continue; + } + printf("\tP: {%lu, %lu}\n", h_cinfo[pidx].row_count, h_cinfo[pidx].size_bytes); + } + } +} + +void print_cumulative_row_info( + host_span sizes, + std::string const& label, + std::optional> splits = std::nullopt) +{ + if (splits.has_value()) { + printf("------------\nSplits\n"); + for (size_t idx = 0; idx < splits->size(); idx++) { + printf("{%lu, %lu}\n", splits.value()[idx].skip_rows, splits.value()[idx].num_rows); + } + } + + printf("------------\nCumulative sizes %s\n", label.c_str()); + for (size_t idx = 0; idx < sizes.size(); idx++) { + printf("{%lu, %lu, %d}", sizes[idx].row_count, sizes[idx].size_bytes, sizes[idx].key); + if (splits.has_value()) { + // if we have a split at this row count and this is the last instance of this row count + auto start = thrust::make_transform_iterator( + splits->begin(), [](chunk_read_info const& i) { return i.skip_rows; }); + auto end = start + splits->size(); + auto split = std::find(start, end, sizes[idx].row_count); + auto const split_index = [&]() -> int { + if (split != end && + ((idx == sizes.size() - 1) || (sizes[idx + 1].row_count > sizes[idx].row_count))) { + return static_cast(std::distance(start, split)); + } + return idx == 0 ? 0 : -1; + }(); + if (split_index >= 0) { + printf(" <-- split {%lu, %lu}", + splits.value()[split_index].skip_rows, + splits.value()[split_index].num_rows); + } + } + printf("\n"); + } +} +#endif // CHUNKING_DEBUG + +/** + * @brief Functor which reduces two cumulative_row_info structs of the same key. + */ +struct cumulative_row_sum { + cumulative_row_info operator() + __device__(cumulative_row_info const& a, cumulative_row_info const& b) const + { + return cumulative_row_info{a.row_count + b.row_count, a.size_bytes + b.size_bytes, a.key}; + } +}; + +/** + * @brief Functor which computes the total data size for a given type of cudf column. + * + * In the case of strings, the return size does not include the chars themselves. That + * information is tracked separately (see PageInfo::str_bytes). + */ +struct row_size_functor { + __device__ size_t validity_size(size_t num_rows, bool nullable) + { + return nullable ? (cudf::util::div_rounding_up_safe(num_rows, size_t{32}) * 4) : 0; + } + + template + __device__ size_t operator()(size_t num_rows, bool nullable) + { + auto const element_size = sizeof(device_storage_type_t); + return (element_size * num_rows) + validity_size(num_rows, nullable); + } +}; + +template <> +__device__ size_t row_size_functor::operator()(size_t num_rows, bool nullable) +{ + auto const offset_size = sizeof(size_type); + // NOTE: Adding the + 1 offset here isn't strictly correct. There will only be 1 extra offset + // for the entire column, whereas this is adding an extra offset per page. So we will get a + // small over-estimate of the real size of the order : # of pages * 4 bytes. It seems better + // to overestimate size somewhat than to underestimate it and potentially generate chunks + // that are too large. + return (offset_size * (num_rows + 1)) + validity_size(num_rows, nullable); +} + +template <> +__device__ size_t row_size_functor::operator()(size_t num_rows, bool nullable) +{ + return validity_size(num_rows, nullable); +} + +template <> +__device__ size_t row_size_functor::operator()(size_t num_rows, bool nullable) +{ + // only returns the size of offsets and validity. the size of the actual string chars + // is tracked separately. + auto const offset_size = sizeof(size_type); + // see note about offsets in the list_view template. + return (offset_size * (num_rows + 1)) + validity_size(num_rows, nullable); +} + +/** + * @brief Functor which computes the total output cudf data size for all of + * the data in this page. + * + * Sums across all nesting levels. + */ +struct get_cumulative_row_info { + PageInfo const* const pages; + + __device__ cumulative_row_info operator()(size_type index) + { + auto const& page = pages[index]; + if (page.flags & PAGEINFO_FLAGS_DICTIONARY) { + return cumulative_row_info{0, 0, page.src_col_schema}; + } + + // total nested size, not counting string data + auto iter = + cudf::detail::make_counting_transform_iterator(0, [page, index] __device__(size_type i) { + auto const& pni = page.nesting[i]; + return cudf::type_dispatcher( + data_type{pni.type}, row_size_functor{}, pni.size, pni.nullable); + }); + + size_t const row_count = static_cast(page.nesting[0].size); + return { + row_count, + thrust::reduce(thrust::seq, iter, iter + page.num_output_nesting_levels) + page.str_bytes, + page.src_col_schema}; + } +}; + +/** + * @brief Functor which computes the effective size of all input columns by page. + * + * For a given row, we want to find the cost of all pages for all columns involved + * in loading up to that row. The complication here is that not all pages are the + * same size between columns. Example: + * + * page row counts + * Column A: 0 <----> 100 <----> 200 + * Column B: 0 <---------------> 200 <--------> 400 + | + * if we decide to split at row 100, we don't really know the actual amount of bytes in column B + * at that point. So we have to proceed as if we are taking the bytes from all 200 rows of that + * page. Essentially, a conservative over-estimate of the real size. + */ +struct row_total_size { + cumulative_row_info const* c_info; + size_type const* key_offsets; + size_t num_keys; + + __device__ cumulative_row_info operator()(cumulative_row_info const& i) + { + // sum sizes for each input column at this row + size_t sum = 0; + for (int idx = 0; idx < num_keys; idx++) { + auto const start = key_offsets[idx]; + auto const end = key_offsets[idx + 1]; + auto iter = cudf::detail::make_counting_transform_iterator( + 0, [&] __device__(size_type i) { return c_info[i].row_count; }); + auto const page_index = + thrust::lower_bound(thrust::seq, iter + start, iter + end, i.row_count) - iter; + sum += c_info[page_index].size_bytes; + } + return {i.row_count, sum, i.key}; + } +}; + +/** + * @brief Given a vector of cumulative {row_count, byte_size} pairs and a chunk read + * limit, determine the set of splits. + * + * @param sizes Vector of cumulative {row_count, byte_size} pairs + * @param num_rows Total number of rows to read + * @param chunk_read_limit Limit on total number of bytes to be returned per read, for all columns + */ +std::vector find_splits(std::vector const& sizes, + size_t num_rows, + size_t chunk_read_limit) +{ + // now we have an array of {row_count, real output bytes}. just walk through it and generate + // splits. + // TODO: come up with a clever way to do this entirely in parallel. For now, as long as batch + // sizes are reasonably large, this shouldn't iterate too many times + std::vector splits; + { + size_t cur_pos = 0; + size_t cur_cumulative_size = 0; + size_t cur_row_count = 0; + auto start = thrust::make_transform_iterator(sizes.begin(), [&](cumulative_row_info const& i) { + return i.size_bytes - cur_cumulative_size; + }); + auto end = start + sizes.size(); + while (cur_row_count < num_rows) { + int64_t split_pos = + thrust::lower_bound(thrust::seq, start + cur_pos, end, chunk_read_limit) - start; + + // if we're past the end, or if the returned bucket is > than the chunk_read_limit, move back + // one. + if (static_cast(split_pos) >= sizes.size() || + (sizes[split_pos].size_bytes - cur_cumulative_size > chunk_read_limit)) { + split_pos--; + } + + // best-try. if we can't find something that'll fit, we have to go bigger. we're doing this in + // a loop because all of the cumulative sizes for all the pages are sorted into one big list. + // so if we had two columns, both of which had an entry {1000, 10000}, that entry would be in + // the list twice. so we have to iterate until we skip past all of them. The idea is that we + // either do this, or we have to call unique() on the input first. + while (split_pos < (static_cast(sizes.size()) - 1) && + (split_pos < 0 || sizes[split_pos].row_count == cur_row_count)) { + split_pos++; + } + + auto const start_row = cur_row_count; + cur_row_count = sizes[split_pos].row_count; + splits.push_back(chunk_read_info{start_row, cur_row_count - start_row}); + cur_pos = split_pos; + cur_cumulative_size = sizes[split_pos].size_bytes; + } + } + // print_cumulative_row_info(sizes, "adjusted", splits); + + return splits; +} + +/** + * @brief Converts cuDF units to Parquet units. + * + * @return A tuple of Parquet type width, Parquet clock rate and Parquet decimal type. + */ +[[nodiscard]] std::tuple conversion_info(type_id column_type_id, + type_id timestamp_type_id, + Type physical, + int8_t converted, + int32_t length) +{ + int32_t type_width = (physical == FIXED_LEN_BYTE_ARRAY) ? length : 0; + int32_t clock_rate = 0; + if (column_type_id == type_id::INT8 or column_type_id == type_id::UINT8) { + type_width = 1; // I32 -> I8 + } else if (column_type_id == type_id::INT16 or column_type_id == type_id::UINT16) { + type_width = 2; // I32 -> I16 + } else if (column_type_id == type_id::INT32) { + type_width = 4; // str -> hash32 + } else if (is_chrono(data_type{column_type_id})) { + clock_rate = to_clockrate(timestamp_type_id); + } + + int8_t converted_type = converted; + if (converted_type == DECIMAL && column_type_id != type_id::FLOAT64 && + not cudf::is_fixed_point(data_type{column_type_id})) { + converted_type = UNKNOWN; // Not converting to float64 or decimal + } + return std::make_tuple(type_width, clock_rate, converted_type); +} + +/** + * @brief Return the required number of bits to store a value. + */ +template +[[nodiscard]] T required_bits(uint32_t max_level) +{ + return static_cast(CompactProtocolReader::NumRequiredBits(max_level)); +} + +struct row_count_compare { + __device__ bool operator()(cumulative_row_info const& a, cumulative_row_info const& b) + { + return a.row_count < b.row_count; + } +}; + +} // anonymous namespace + +void reader::impl::create_global_chunk_info() +{ + auto const num_rows = _file_itm_data.global_num_rows; + auto const& row_groups_info = _file_itm_data.row_groups; + auto& chunks = _file_itm_data.chunks; + + // Descriptors for all the chunks that make up the selected columns + auto const num_input_columns = _input_columns.size(); + auto const num_chunks = row_groups_info.size() * num_input_columns; + + // Initialize column chunk information + auto remaining_rows = num_rows; + for (auto const& rg : row_groups_info) { + auto const& row_group = _metadata->get_row_group(rg.index, rg.source_index); + auto const row_group_start = rg.start_row; + auto const row_group_rows = std::min(remaining_rows, row_group.num_rows); + + // generate ColumnChunkDesc objects for everything to be decoded (all input columns) + for (size_t i = 0; i < num_input_columns; ++i) { + auto col = _input_columns[i]; + // look up metadata + auto& col_meta = _metadata->get_column_metadata(rg.index, rg.source_index, col.schema_idx); + auto& schema = _metadata->get_schema(col.schema_idx); + + auto [type_width, clock_rate, converted_type] = + conversion_info(to_type_id(schema, _strings_to_categorical, _timestamp_type.id()), + _timestamp_type.id(), + schema.type, + schema.converted_type, + schema.type_length); + + chunks.push_back(ColumnChunkDesc(col_meta.total_compressed_size, + nullptr, + col_meta.num_values, + schema.type, + type_width, + row_group_start, + row_group_rows, + schema.max_definition_level, + schema.max_repetition_level, + _metadata->get_output_nesting_depth(col.schema_idx), + required_bits(schema.max_definition_level), + required_bits(schema.max_repetition_level), + col_meta.codec, + converted_type, + schema.logical_type, + schema.decimal_precision, + clock_rate, + i, + col.schema_idx)); + } + + remaining_rows -= row_group_rows; + } +} + +void reader::impl::compute_input_passes() +{ + // at this point, row_groups has already been filtered down to just the row groups we need to + // handle optional skip_rows/num_rows parameters. + auto const& row_groups_info = _file_itm_data.row_groups; + + // if the user hasn't specified an input size limit, read everything in a single pass. + if (_input_pass_read_limit == 0) { + _file_itm_data.input_pass_row_group_offsets.push_back(0); + _file_itm_data.input_pass_row_group_offsets.push_back(row_groups_info.size()); + return; + } + + // generate passes. make sure to account for the case where a single row group doesn't fit within + // + std::size_t const read_limit = + _input_pass_read_limit > 0 ? _input_pass_read_limit : std::numeric_limits::max(); + std::size_t cur_pass_byte_size = 0; + std::size_t cur_rg_start = 0; + std::size_t cur_row_count = 0; + _file_itm_data.input_pass_row_group_offsets.push_back(0); + _file_itm_data.input_pass_row_count.push_back(0); + + for (size_t cur_rg_index = 0; cur_rg_index < row_groups_info.size(); cur_rg_index++) { + auto const& rgi = row_groups_info[cur_rg_index]; + auto const& row_group = _metadata->get_row_group(rgi.index, rgi.source_index); + + // can we add this row group + if (cur_pass_byte_size + row_group.total_byte_size >= read_limit) { + // A single row group (the current one) is larger than the read limit: + // We always need to include at least one row group, so end the pass at the end of the current + // row group + if (cur_rg_start == cur_rg_index) { + _file_itm_data.input_pass_row_group_offsets.push_back(cur_rg_index + 1); + _file_itm_data.input_pass_row_count.push_back(cur_row_count + row_group.num_rows); + cur_rg_start = cur_rg_index + 1; + cur_pass_byte_size = 0; + } + // End the pass at the end of the previous row group + else { + _file_itm_data.input_pass_row_group_offsets.push_back(cur_rg_index); + _file_itm_data.input_pass_row_count.push_back(cur_row_count); + cur_rg_start = cur_rg_index; + cur_pass_byte_size = row_group.total_byte_size; + } + } else { + cur_pass_byte_size += row_group.total_byte_size; + } + cur_row_count += row_group.num_rows; + } + // add the last pass if necessary + if (_file_itm_data.input_pass_row_group_offsets.back() != row_groups_info.size()) { + _file_itm_data.input_pass_row_group_offsets.push_back(row_groups_info.size()); + _file_itm_data.input_pass_row_count.push_back(cur_row_count); + } +} + +void reader::impl::setup_next_pass() +{ + // this will also cause the previous pass information to be deleted + _pass_itm_data = std::make_unique(); + + // setup row groups to be loaded for this pass + auto const row_group_start = _file_itm_data.input_pass_row_group_offsets[_current_input_pass]; + auto const row_group_end = _file_itm_data.input_pass_row_group_offsets[_current_input_pass + 1]; + auto const num_row_groups = row_group_end - row_group_start; + _pass_itm_data->row_groups.resize(num_row_groups); + std::copy(_file_itm_data.row_groups.begin() + row_group_start, + _file_itm_data.row_groups.begin() + row_group_end, + _pass_itm_data->row_groups.begin()); + + auto const num_passes = _file_itm_data.input_pass_row_group_offsets.size() - 1; + CUDF_EXPECTS(_current_input_pass < num_passes, "Encountered an invalid read pass index"); + + auto const chunks_per_rowgroup = _input_columns.size(); + auto const num_chunks = chunks_per_rowgroup * num_row_groups; + + auto chunk_start = _file_itm_data.chunks.begin() + (row_group_start * chunks_per_rowgroup); + auto chunk_end = _file_itm_data.chunks.begin() + (row_group_end * chunks_per_rowgroup); + + _pass_itm_data->chunks = + cudf::detail::hostdevice_vector(num_chunks, _stream); + std::copy(chunk_start, chunk_end, _pass_itm_data->chunks.begin()); + + // adjust skip_rows and num_rows by what's available in the row groups we are processing + if (num_passes == 1) { + _pass_itm_data->skip_rows = _file_itm_data.global_skip_rows; + _pass_itm_data->num_rows = _file_itm_data.global_num_rows; + } else { + auto const global_start_row = _file_itm_data.global_skip_rows; + auto const global_end_row = global_start_row + _file_itm_data.global_num_rows; + auto const start_row = std::max(_file_itm_data.input_pass_row_count[_current_input_pass], global_start_row); + auto const end_row = std::min(_file_itm_data.input_pass_row_count[_current_input_pass + 1], global_end_row); + + // skip_rows is always global in the sense that it is relative to the first row of + // everything we will be reading, regardless of what pass we are on. + // num_rows is how many rows we are reading this pass. + _pass_itm_data->skip_rows = global_start_row + _file_itm_data.input_pass_row_count[_current_input_pass]; + _pass_itm_data->num_rows = end_row - start_row; + } +} + +void reader::impl::compute_splits_for_pass() +{ + auto const skip_rows = _pass_itm_data->skip_rows; + auto const num_rows = _pass_itm_data->num_rows; + + // simple case : no chunk size, no splits + if(_output_chunk_read_limit <= 0){ + _pass_itm_data->output_chunk_read_info = std::vector{{skip_rows, num_rows}}; + return; + } + + auto& pages = _pass_itm_data->pages_info; + + auto const& page_keys = _pass_itm_data->page_keys; + auto const& page_index = _pass_itm_data->page_index; + + // generate cumulative row counts and sizes + rmm::device_uvector c_info(page_keys.size(), _stream); + // convert PageInfo to cumulative_row_info + auto page_input = thrust::make_transform_iterator(page_index.begin(), + get_cumulative_row_info{pages.device_ptr()}); + thrust::inclusive_scan_by_key(rmm::exec_policy(_stream), + page_keys.begin(), + page_keys.end(), + page_input, + c_info.begin(), + thrust::equal_to{}, + cumulative_row_sum{}); + // print_cumulative_page_info(pages, page_index, c_info, stream); + + // sort by row count + rmm::device_uvector c_info_sorted{c_info, _stream}; + thrust::sort(rmm::exec_policy(_stream), + c_info_sorted.begin(), + c_info_sorted.end(), + row_count_compare{}); + + // std::vector h_c_info_sorted(c_info_sorted.size()); + // CUDF_CUDA_TRY(cudaMemcpy(h_c_info_sorted.data(), + // c_info_sorted.data(), + // sizeof(cumulative_row_info) * c_info_sorted.size(), + // cudaMemcpyDefault)); + // print_cumulative_row_info(h_c_info_sorted, "raw"); + + // generate key offsets (offsets to the start of each partition of keys). worst case is 1 page per + // key + rmm::device_uvector key_offsets(page_keys.size() + 1, _stream); + auto const key_offsets_end = thrust::reduce_by_key(rmm::exec_policy(_stream), + page_keys.begin(), + page_keys.end(), + thrust::make_constant_iterator(1), + thrust::make_discard_iterator(), + key_offsets.begin()) + .second; + size_t const num_unique_keys = key_offsets_end - key_offsets.begin(); + thrust::exclusive_scan( + rmm::exec_policy(_stream), key_offsets.begin(), key_offsets.end(), key_offsets.begin()); + + // adjust the cumulative info such that for each row count, the size includes any pages that span + // that row count. this is so that if we have this case: + // page row counts + // Column A: 0 <----> 100 <----> 200 + // Column B: 0 <---------------> 200 <--------> 400 + // | + // if we decide to split at row 100, we don't really know the actual amount of bytes in column B + // at that point. So we have to proceed as if we are taking the bytes from all 200 rows of that + // page. + // + rmm::device_uvector aggregated_info(c_info.size(), _stream); + thrust::transform(rmm::exec_policy(_stream), + c_info_sorted.begin(), + c_info_sorted.end(), + aggregated_info.begin(), + row_total_size{c_info.data(), key_offsets.data(), num_unique_keys}); + + // bring back to the cpu + std::vector h_aggregated_info(aggregated_info.size()); + CUDF_CUDA_TRY(cudaMemcpyAsync(h_aggregated_info.data(), + aggregated_info.data(), + sizeof(cumulative_row_info) * c_info.size(), + cudaMemcpyDefault, + _stream.value())); + _stream.synchronize(); + + // generate the actual splits + _pass_itm_data->output_chunk_read_info = find_splits(h_aggregated_info, num_rows, _output_chunk_read_limit); +} + +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_chunking.hpp b/cpp/src/io/parquet/reader_impl_chunking.hpp new file mode 100644 index 00000000000..f3c595a9a2b --- /dev/null +++ b/cpp/src/io/parquet/reader_impl_chunking.hpp @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include "reader_impl_helpers.hpp" + +namespace cudf::io::parquet::detail { + +/** + * @brief Struct to store file-level data that remains constant for + * all passes/chunks in the file. + */ +struct file_intermediate_data { + // all row groups to read + std::vector row_groups{}; + + // all chunks from the selected row groups. We may end up reading these chunks progressively + // instead of all at once + std::vector chunks{}; + + // an array of offsets into _file_itm_data::global_chunks. Each pair of offsets represents + // the start/end of the chunks to be loaded for a given pass. + std::vector input_pass_row_group_offsets{}; + // row counts per input-pass + std::vector input_pass_row_count{}; + + // skip_rows/num_rows values for the entire file. these need to be adjusted per-pass because we + // may not be visiting every row group that contains these bounds + size_t global_skip_rows; + size_t global_num_rows; +}; + +/** + * @brief Struct to identify the range for each chunk of rows during a chunked reading pass. + */ +struct chunk_read_info { + size_t skip_rows; + size_t num_rows; +}; + +/** + * @brief Struct to store pass-level data that remains constant for a single pass. + */ +struct pass_intermediate_data { + std::vector> raw_page_data; + rmm::device_buffer decomp_page_data; + + // rowgroup, chunk and page information for the current pass. + std::vector row_groups{}; + cudf::detail::hostdevice_vector chunks{}; + cudf::detail::hostdevice_vector pages_info{}; + cudf::detail::hostdevice_vector page_nesting_info{}; + cudf::detail::hostdevice_vector page_nesting_decode_info{}; + + rmm::device_uvector page_keys{0, rmm::cuda_stream_default}; + rmm::device_uvector page_index{0, rmm::cuda_stream_default}; + rmm::device_uvector str_dict_index{0, rmm::cuda_stream_default}; + + std::vector output_chunk_read_info; + std::size_t current_output_chunk{0}; + + rmm::device_buffer level_decode_data{}; + int level_type_size{0}; + + // skip_rows and num_rows values for this particular pass. these may be adjusted values from the + // global values stored in file_intermediate_data. + size_t skip_rows; + size_t num_rows; +}; + +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp index 2ff18bfbe7e..7a4fcc72dce 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.hpp +++ b/cpp/src/io/parquet/reader_impl_helpers.hpp @@ -34,6 +34,23 @@ namespace cudf::io::parquet::detail { +/** + * @brief The row_group_info class + */ +struct row_group_info { + size_type index; // row group index within a file. aggregate_reader_metadata::get_row_group() is + // called with index and source_index + size_t start_row; + size_type source_index; // file index. + + row_group_info() = default; + + row_group_info(size_type index, size_t start_row, size_type source_index) + : index{index}, start_row{start_row}, source_index{source_index} + { + } +}; + /** * @brief Function that translates Parquet datatype to cuDF type enum */ diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 4bc6bb6f43b..ce45f709ee1 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -18,7 +18,6 @@ #include #include -#include #include #include @@ -44,7 +43,6 @@ #include namespace cudf::io::parquet::detail { - namespace { /** @@ -170,46 +168,6 @@ void generate_depth_remappings(std::map, std::ve } } -/** - * @brief Return the required number of bits to store a value. - */ -template -[[nodiscard]] T required_bits(uint32_t max_level) -{ - return static_cast(CompactProtocolReader::NumRequiredBits(max_level)); -} - -/** - * @brief Converts cuDF units to Parquet units. - * - * @return A tuple of Parquet type width, Parquet clock rate and Parquet decimal type. - */ -[[nodiscard]] std::tuple conversion_info(type_id column_type_id, - type_id timestamp_type_id, - Type physical, - int8_t converted, - int32_t length) -{ - int32_t type_width = (physical == FIXED_LEN_BYTE_ARRAY) ? length : 0; - int32_t clock_rate = 0; - if (column_type_id == type_id::INT8 or column_type_id == type_id::UINT8) { - type_width = 1; // I32 -> I8 - } else if (column_type_id == type_id::INT16 or column_type_id == type_id::UINT16) { - type_width = 2; // I32 -> I16 - } else if (column_type_id == type_id::INT32) { - type_width = 4; // str -> hash32 - } else if (is_chrono(data_type{column_type_id})) { - clock_rate = to_clockrate(timestamp_type_id); - } - - int8_t converted_type = converted; - if (converted_type == DECIMAL && column_type_id != type_id::FLOAT64 && - not cudf::is_fixed_point(data_type{column_type_id})) { - converted_type = UNKNOWN; // Not converting to float64 or decimal - } - return std::make_tuple(type_width, clock_rate, converted_type); -} - /** * @brief Reads compressed page data to device memory. * @@ -790,163 +748,6 @@ std::pair>> reader::impl::read_and_decompres return {total_decompressed_size > 0, std::move(read_chunk_tasks)}; } -void reader::impl::load_global_chunk_info() -{ - auto const num_rows = _file_itm_data.global_num_rows; - auto const& row_groups_info = _file_itm_data.row_groups; - auto& chunks = _file_itm_data.chunks; - - // Descriptors for all the chunks that make up the selected columns - auto const num_input_columns = _input_columns.size(); - auto const num_chunks = row_groups_info.size() * num_input_columns; - - // Initialize column chunk information - auto remaining_rows = num_rows; - for (auto const& rg : row_groups_info) { - auto const& row_group = _metadata->get_row_group(rg.index, rg.source_index); - auto const row_group_start = rg.start_row; - auto const row_group_rows = std::min(remaining_rows, row_group.num_rows); - - // generate ColumnChunkDesc objects for everything to be decoded (all input columns) - for (size_t i = 0; i < num_input_columns; ++i) { - auto col = _input_columns[i]; - // look up metadata - auto& col_meta = _metadata->get_column_metadata(rg.index, rg.source_index, col.schema_idx); - auto& schema = _metadata->get_schema(col.schema_idx); - - auto [type_width, clock_rate, converted_type] = - conversion_info(to_type_id(schema, _strings_to_categorical, _timestamp_type.id()), - _timestamp_type.id(), - schema.type, - schema.converted_type, - schema.type_length); - - chunks.push_back(ColumnChunkDesc(col_meta.total_compressed_size, - nullptr, - col_meta.num_values, - schema.type, - type_width, - row_group_start, - row_group_rows, - schema.max_definition_level, - schema.max_repetition_level, - _metadata->get_output_nesting_depth(col.schema_idx), - required_bits(schema.max_definition_level), - required_bits(schema.max_repetition_level), - col_meta.codec, - converted_type, - schema.logical_type, - schema.decimal_precision, - clock_rate, - i, - col.schema_idx)); - } - - remaining_rows -= row_group_rows; - } -} - -void reader::impl::compute_input_pass_row_group_info() -{ - // at this point, row_groups has already been filtered down to just the row groups we need to - // handle optional skip_rows/num_rows parameters. - auto const& row_groups_info = _file_itm_data.row_groups; - - // if the user hasn't specified an input size limit, read everything in a single pass. - if (_input_pass_read_limit == 0) { - _input_pass_row_group_offsets.push_back(0); - _input_pass_row_group_offsets.push_back(row_groups_info.size()); - return; - } - - // generate passes. make sure to account for the case where a single row group doesn't fit within - // - std::size_t const read_limit = - _input_pass_read_limit > 0 ? _input_pass_read_limit : std::numeric_limits::max(); - std::size_t cur_pass_byte_size = 0; - std::size_t cur_rg_start = 0; - std::size_t cur_row_count = 0; - _input_pass_row_group_offsets.push_back(0); - _input_pass_row_count.push_back(0); - - for (size_t cur_rg_index = 0; cur_rg_index < row_groups_info.size(); cur_rg_index++) { - auto const& rgi = row_groups_info[cur_rg_index]; - auto const& row_group = _metadata->get_row_group(rgi.index, rgi.source_index); - - // can we add this row group - if (cur_pass_byte_size + row_group.total_byte_size >= read_limit) { - // A single row group (the current one) is larger than the read limit: - // We always need to include at least one row group, so end the pass at the end of the current - // row group - if (cur_rg_start == cur_rg_index) { - _input_pass_row_group_offsets.push_back(cur_rg_index + 1); - _input_pass_row_count.push_back(cur_row_count + row_group.num_rows); - cur_rg_start = cur_rg_index + 1; - cur_pass_byte_size = 0; - } - // End the pass at the end of the previous row group - else { - _input_pass_row_group_offsets.push_back(cur_rg_index); - _input_pass_row_count.push_back(cur_row_count); - cur_rg_start = cur_rg_index; - cur_pass_byte_size = row_group.total_byte_size; - } - } else { - cur_pass_byte_size += row_group.total_byte_size; - } - cur_row_count += row_group.num_rows; - } - // add the last pass if necessary - if (_input_pass_row_group_offsets.back() != row_groups_info.size()) { - _input_pass_row_group_offsets.push_back(row_groups_info.size()); - _input_pass_row_count.push_back(cur_row_count); - } -} - -void reader::impl::setup_pass() -{ - // this will also cause the previous pass information to be deleted - _pass_itm_data = std::make_unique(); - - // setup row groups to be loaded for this pass - auto const row_group_start = _input_pass_row_group_offsets[_current_input_pass]; - auto const row_group_end = _input_pass_row_group_offsets[_current_input_pass + 1]; - auto const num_row_groups = row_group_end - row_group_start; - _pass_itm_data->row_groups.resize(num_row_groups); - std::copy(_file_itm_data.row_groups.begin() + row_group_start, - _file_itm_data.row_groups.begin() + row_group_end, - _pass_itm_data->row_groups.begin()); - - auto const num_passes = _input_pass_row_group_offsets.size() - 1; - CUDF_EXPECTS(_current_input_pass < num_passes, "Encountered an invalid read pass index"); - - auto const chunks_per_rowgroup = _input_columns.size(); - auto const num_chunks = chunks_per_rowgroup * num_row_groups; - - auto chunk_start = _file_itm_data.chunks.begin() + (row_group_start * chunks_per_rowgroup); - auto chunk_end = _file_itm_data.chunks.begin() + (row_group_end * chunks_per_rowgroup); - - _pass_itm_data->chunks = cudf::detail::hostdevice_vector(num_chunks, _stream); - std::copy(chunk_start, chunk_end, _pass_itm_data->chunks.begin()); - - // adjust skip_rows and num_rows by what's available in the row groups we are processing - if (num_passes == 1) { - _pass_itm_data->skip_rows = _file_itm_data.global_skip_rows; - _pass_itm_data->num_rows = _file_itm_data.global_num_rows; - } else { - auto const global_start_row = _file_itm_data.global_skip_rows; - auto const global_end_row = global_start_row + _file_itm_data.global_num_rows; - auto const start_row = std::max(_input_pass_row_count[_current_input_pass], global_start_row); - auto const end_row = std::min(_input_pass_row_count[_current_input_pass + 1], global_end_row); - - // skip_rows is always global in the sense that it is relative to the first row of - // everything we will be reading, regardless of what pass we are on. - // num_rows is how many rows we are reading this pass. - _pass_itm_data->skip_rows = global_start_row + _input_pass_row_count[_current_input_pass]; - _pass_itm_data->num_rows = end_row - start_row; - } -} - void reader::impl::load_and_decompress_data() { // This function should never be called if `num_rows == 0`. @@ -1034,359 +835,8 @@ void print_pages(cudf::detail::hostdevice_vector& pages, rmm::cuda_str p.str_bytes); } } - -void print_cumulative_page_info(cudf::detail::hostdevice_vector& pages, - rmm::device_uvector const& page_index, - rmm::device_uvector const& c_info, - rmm::cuda_stream_view stream) -{ - pages.device_to_host_sync(stream); - - printf("------------\nCumulative sizes by page\n"); - - std::vector schemas(pages.size()); - std::vector h_page_index(pages.size()); - CUDF_CUDA_TRY(cudaMemcpy( - h_page_index.data(), page_index.data(), sizeof(int) * pages.size(), cudaMemcpyDefault)); - std::vector h_cinfo(pages.size()); - CUDF_CUDA_TRY(cudaMemcpy( - h_cinfo.data(), c_info.data(), sizeof(cumulative_row_info) * pages.size(), cudaMemcpyDefault)); - auto schema_iter = cudf::detail::make_counting_transform_iterator( - 0, [&](size_type i) { return pages[h_page_index[i]].src_col_schema; }); - thrust::copy(thrust::seq, schema_iter, schema_iter + pages.size(), schemas.begin()); - auto last = thrust::unique(thrust::seq, schemas.begin(), schemas.end()); - schemas.resize(last - schemas.begin()); - printf("Num schemas: %lu\n", schemas.size()); - - for (size_t idx = 0; idx < schemas.size(); idx++) { - printf("Schema %d\n", schemas[idx]); - for (size_t pidx = 0; pidx < pages.size(); pidx++) { - auto const& page = pages[h_page_index[pidx]]; - if (page.flags & PAGEINFO_FLAGS_DICTIONARY || page.src_col_schema != schemas[idx]) { - continue; - } - printf("\tP: {%lu, %lu}\n", h_cinfo[pidx].row_count, h_cinfo[pidx].size_bytes); - } - } -} - -void print_cumulative_row_info(host_span sizes, - std::string const& label, - std::optional> splits = std::nullopt) -{ - if (splits.has_value()) { - printf("------------\nSplits\n"); - for (size_t idx = 0; idx < splits->size(); idx++) { - printf("{%lu, %lu}\n", splits.value()[idx].skip_rows, splits.value()[idx].num_rows); - } - } - - printf("------------\nCumulative sizes %s\n", label.c_str()); - for (size_t idx = 0; idx < sizes.size(); idx++) { - printf("{%lu, %lu, %d}", sizes[idx].row_count, sizes[idx].size_bytes, sizes[idx].key); - if (splits.has_value()) { - // if we have a split at this row count and this is the last instance of this row count - auto start = thrust::make_transform_iterator( - splits->begin(), [](chunk_read_info const& i) { return i.skip_rows; }); - auto end = start + splits->size(); - auto split = std::find(start, end, sizes[idx].row_count); - auto const split_index = [&]() -> int { - if (split != end && - ((idx == sizes.size() - 1) || (sizes[idx + 1].row_count > sizes[idx].row_count))) { - return static_cast(std::distance(start, split)); - } - return idx == 0 ? 0 : -1; - }(); - if (split_index >= 0) { - printf(" <-- split {%lu, %lu}", - splits.value()[split_index].skip_rows, - splits.value()[split_index].num_rows); - } - } - printf("\n"); - } -} #endif // PREPROCESS_DEBUG -/** - * @brief Functor which reduces two cumulative_row_info structs of the same key. - */ -struct cumulative_row_sum { - cumulative_row_info operator() - __device__(cumulative_row_info const& a, cumulative_row_info const& b) const - { - return cumulative_row_info{a.row_count + b.row_count, a.size_bytes + b.size_bytes, a.key}; - } -}; - -/** - * @brief Functor which computes the total data size for a given type of cudf column. - * - * In the case of strings, the return size does not include the chars themselves. That - * information is tracked separately (see PageInfo::str_bytes). - */ -struct row_size_functor { - __device__ size_t validity_size(size_t num_rows, bool nullable) - { - return nullable ? (cudf::util::div_rounding_up_safe(num_rows, size_t{32}) * 4) : 0; - } - - template - __device__ size_t operator()(size_t num_rows, bool nullable) - { - auto const element_size = sizeof(device_storage_type_t); - return (element_size * num_rows) + validity_size(num_rows, nullable); - } -}; - -template <> -__device__ size_t row_size_functor::operator()(size_t num_rows, bool nullable) -{ - auto const offset_size = sizeof(size_type); - // NOTE: Adding the + 1 offset here isn't strictly correct. There will only be 1 extra offset - // for the entire column, whereas this is adding an extra offset per page. So we will get a - // small over-estimate of the real size of the order : # of pages * 4 bytes. It seems better - // to overestimate size somewhat than to underestimate it and potentially generate chunks - // that are too large. - return (offset_size * (num_rows + 1)) + validity_size(num_rows, nullable); -} - -template <> -__device__ size_t row_size_functor::operator()(size_t num_rows, bool nullable) -{ - return validity_size(num_rows, nullable); -} - -template <> -__device__ size_t row_size_functor::operator()(size_t num_rows, bool nullable) -{ - // only returns the size of offsets and validity. the size of the actual string chars - // is tracked separately. - auto const offset_size = sizeof(size_type); - // see note about offsets in the list_view template. - return (offset_size * (num_rows + 1)) + validity_size(num_rows, nullable); -} - -/** - * @brief Functor which computes the total output cudf data size for all of - * the data in this page. - * - * Sums across all nesting levels. - */ -struct get_cumulative_row_info { - PageInfo const* const pages; - - __device__ cumulative_row_info operator()(size_type index) - { - auto const& page = pages[index]; - if (page.flags & PAGEINFO_FLAGS_DICTIONARY) { - return cumulative_row_info{0, 0, page.src_col_schema}; - } - - // total nested size, not counting string data - auto iter = - cudf::detail::make_counting_transform_iterator(0, [page, index] __device__(size_type i) { - auto const& pni = page.nesting[i]; - return cudf::type_dispatcher( - data_type{pni.type}, row_size_functor{}, pni.size, pni.nullable); - }); - - size_t const row_count = static_cast(page.nesting[0].size); - return { - row_count, - thrust::reduce(thrust::seq, iter, iter + page.num_output_nesting_levels) + page.str_bytes, - page.src_col_schema}; - } -}; - -/** - * @brief Functor which computes the effective size of all input columns by page. - * - * For a given row, we want to find the cost of all pages for all columns involved - * in loading up to that row. The complication here is that not all pages are the - * same size between columns. Example: - * - * page row counts - * Column A: 0 <----> 100 <----> 200 - * Column B: 0 <---------------> 200 <--------> 400 - | - * if we decide to split at row 100, we don't really know the actual amount of bytes in column B - * at that point. So we have to proceed as if we are taking the bytes from all 200 rows of that - * page. Essentially, a conservative over-estimate of the real size. - */ -struct row_total_size { - cumulative_row_info const* c_info; - size_type const* key_offsets; - size_t num_keys; - - __device__ cumulative_row_info operator()(cumulative_row_info const& i) - { - // sum sizes for each input column at this row - size_t sum = 0; - for (int idx = 0; idx < num_keys; idx++) { - auto const start = key_offsets[idx]; - auto const end = key_offsets[idx + 1]; - auto iter = cudf::detail::make_counting_transform_iterator( - 0, [&] __device__(size_type i) { return c_info[i].row_count; }); - auto const page_index = - thrust::lower_bound(thrust::seq, iter + start, iter + end, i.row_count) - iter; - sum += c_info[page_index].size_bytes; - } - return {i.row_count, sum, i.key}; - } -}; - -/** - * @brief Given a vector of cumulative {row_count, byte_size} pairs and a chunk read - * limit, determine the set of splits. - * - * @param sizes Vector of cumulative {row_count, byte_size} pairs - * @param num_rows Total number of rows to read - * @param chunk_read_limit Limit on total number of bytes to be returned per read, for all columns - */ -std::vector find_splits(std::vector const& sizes, - size_t num_rows, - size_t chunk_read_limit) -{ - // now we have an array of {row_count, real output bytes}. just walk through it and generate - // splits. - // TODO: come up with a clever way to do this entirely in parallel. For now, as long as batch - // sizes are reasonably large, this shouldn't iterate too many times - std::vector splits; - { - size_t cur_pos = 0; - size_t cur_cumulative_size = 0; - size_t cur_row_count = 0; - auto start = thrust::make_transform_iterator(sizes.begin(), [&](cumulative_row_info const& i) { - return i.size_bytes - cur_cumulative_size; - }); - auto end = start + sizes.size(); - while (cur_row_count < num_rows) { - int64_t split_pos = - thrust::lower_bound(thrust::seq, start + cur_pos, end, chunk_read_limit) - start; - - // if we're past the end, or if the returned bucket is > than the chunk_read_limit, move back - // one. - if (static_cast(split_pos) >= sizes.size() || - (sizes[split_pos].size_bytes - cur_cumulative_size > chunk_read_limit)) { - split_pos--; - } - - // best-try. if we can't find something that'll fit, we have to go bigger. we're doing this in - // a loop because all of the cumulative sizes for all the pages are sorted into one big list. - // so if we had two columns, both of which had an entry {1000, 10000}, that entry would be in - // the list twice. so we have to iterate until we skip past all of them. The idea is that we - // either do this, or we have to call unique() on the input first. - while (split_pos < (static_cast(sizes.size()) - 1) && - (split_pos < 0 || sizes[split_pos].row_count == cur_row_count)) { - split_pos++; - } - - auto const start_row = cur_row_count; - cur_row_count = sizes[split_pos].row_count; - splits.push_back(chunk_read_info{start_row, cur_row_count - start_row}); - cur_pos = split_pos; - cur_cumulative_size = sizes[split_pos].size_bytes; - } - } - // print_cumulative_row_info(sizes, "adjusted", splits); - - return splits; -} - -/** - * @brief Given a set of pages that have had their sizes computed by nesting level and - * a limit on total read size, generate a set of {skip_rows, num_rows} pairs representing - * a set of reads that will generate output columns of total size <= `chunk_read_limit` bytes. - * - * @param pages All pages in the file - * @param id Additional intermediate information required to process the pages - * @param num_rows Total number of rows to read - * @param chunk_read_limit Limit on total number of bytes to be returned per read, for all columns - * @param stream CUDA stream to use - */ -std::vector compute_splits(cudf::detail::hostdevice_vector& pages, - pass_intermediate_data const& id, - size_t num_rows, - size_t chunk_read_limit, - rmm::cuda_stream_view stream) -{ - auto const& page_keys = id.page_keys; - auto const& page_index = id.page_index; - - // generate cumulative row counts and sizes - rmm::device_uvector c_info(page_keys.size(), stream); - // convert PageInfo to cumulative_row_info - auto page_input = thrust::make_transform_iterator(page_index.begin(), - get_cumulative_row_info{pages.device_ptr()}); - thrust::inclusive_scan_by_key(rmm::exec_policy(stream), - page_keys.begin(), - page_keys.end(), - page_input, - c_info.begin(), - thrust::equal_to{}, - cumulative_row_sum{}); - // print_cumulative_page_info(pages, page_index, c_info, stream); - - // sort by row count - rmm::device_uvector c_info_sorted{c_info, stream}; - thrust::sort(rmm::exec_policy(stream), - c_info_sorted.begin(), - c_info_sorted.end(), - [] __device__(cumulative_row_info const& a, cumulative_row_info const& b) { - return a.row_count < b.row_count; - }); - - // std::vector h_c_info_sorted(c_info_sorted.size()); - // CUDF_CUDA_TRY(cudaMemcpy(h_c_info_sorted.data(), - // c_info_sorted.data(), - // sizeof(cumulative_row_info) * c_info_sorted.size(), - // cudaMemcpyDefault)); - // print_cumulative_row_info(h_c_info_sorted, "raw"); - - // generate key offsets (offsets to the start of each partition of keys). worst case is 1 page per - // key - rmm::device_uvector key_offsets(page_keys.size() + 1, stream); - auto const key_offsets_end = thrust::reduce_by_key(rmm::exec_policy(stream), - page_keys.begin(), - page_keys.end(), - thrust::make_constant_iterator(1), - thrust::make_discard_iterator(), - key_offsets.begin()) - .second; - size_t const num_unique_keys = key_offsets_end - key_offsets.begin(); - thrust::exclusive_scan( - rmm::exec_policy(stream), key_offsets.begin(), key_offsets.end(), key_offsets.begin()); - - // adjust the cumulative info such that for each row count, the size includes any pages that span - // that row count. this is so that if we have this case: - // page row counts - // Column A: 0 <----> 100 <----> 200 - // Column B: 0 <---------------> 200 <--------> 400 - // | - // if we decide to split at row 100, we don't really know the actual amount of bytes in column B - // at that point. So we have to proceed as if we are taking the bytes from all 200 rows of that - // page. - // - rmm::device_uvector aggregated_info(c_info.size(), stream); - thrust::transform(rmm::exec_policy(stream), - c_info_sorted.begin(), - c_info_sorted.end(), - aggregated_info.begin(), - row_total_size{c_info.data(), key_offsets.data(), num_unique_keys}); - - // bring back to the cpu - std::vector h_aggregated_info(aggregated_info.size()); - CUDF_CUDA_TRY(cudaMemcpyAsync(h_aggregated_info.data(), - aggregated_info.data(), - sizeof(cumulative_row_info) * c_info.size(), - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); - - return find_splits(h_aggregated_info, num_rows, chunk_read_limit); -} - struct get_page_chunk_idx { __device__ size_type operator()(PageInfo const& page) { return page.chunk_idx; } }; @@ -1822,12 +1272,8 @@ void reader::impl::preprocess_pages(bool uses_custom_row_bounds, size_t chunk_re _pass_itm_data->page_keys = std::move(page_keys); _pass_itm_data->page_index = std::move(page_index); - // compute splits if necessary. otherwise return a single split representing - // the whole file. - _pass_itm_data->output_chunk_read_info = - _output_chunk_read_limit > 0 - ? compute_splits(pages, *_pass_itm_data, num_rows, chunk_read_limit, _stream) - : std::vector{{skip_rows, num_rows}}; + // compute splits for the pass + compute_splits_for_pass(); } void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses_custom_row_bounds) From f1378e5b6543d846c4dcbfcdc3fa9ecc256fda60 Mon Sep 17 00:00:00 2001 From: db Date: Mon, 9 Oct 2023 11:07:57 -0500 Subject: [PATCH 4/6] Formatting. --- cpp/src/io/parquet/reader_impl_chunking.cu | 78 ++++++++++----------- cpp/src/io/parquet/reader_impl_chunking.hpp | 2 +- 2 files changed, 40 insertions(+), 40 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index 5d5b152e1aa..ad52a7dfcc1 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -75,10 +75,9 @@ void print_cumulative_page_info(cudf::detail::hostdevice_vector& pages } } -void print_cumulative_row_info( - host_span sizes, - std::string const& label, - std::optional> splits = std::nullopt) +void print_cumulative_row_info(host_span sizes, + std::string const& label, + std::optional> splits = std::nullopt) { if (splits.has_value()) { printf("------------\nSplits\n"); @@ -251,8 +250,8 @@ struct row_total_size { * @param chunk_read_limit Limit on total number of bytes to be returned per read, for all columns */ std::vector find_splits(std::vector const& sizes, - size_t num_rows, - size_t chunk_read_limit) + size_t num_rows, + size_t chunk_read_limit) { // now we have an array of {row_count, real output bytes}. just walk through it and generate // splits. @@ -341,7 +340,7 @@ template } struct row_count_compare { - __device__ bool operator()(cumulative_row_info const& a, cumulative_row_info const& b) + __device__ bool operator()(cumulative_row_info const& a, cumulative_row_info const& b) { return a.row_count < b.row_count; } @@ -381,24 +380,24 @@ void reader::impl::create_global_chunk_info() schema.type_length); chunks.push_back(ColumnChunkDesc(col_meta.total_compressed_size, - nullptr, - col_meta.num_values, - schema.type, - type_width, - row_group_start, - row_group_rows, - schema.max_definition_level, - schema.max_repetition_level, - _metadata->get_output_nesting_depth(col.schema_idx), - required_bits(schema.max_definition_level), - required_bits(schema.max_repetition_level), - col_meta.codec, - converted_type, - schema.logical_type, - schema.decimal_precision, - clock_rate, - i, - col.schema_idx)); + nullptr, + col_meta.num_values, + schema.type, + type_width, + row_group_start, + row_group_rows, + schema.max_definition_level, + schema.max_repetition_level, + _metadata->get_output_nesting_depth(col.schema_idx), + required_bits(schema.max_definition_level), + required_bits(schema.max_repetition_level), + col_meta.codec, + converted_type, + schema.logical_type, + schema.decimal_precision, + clock_rate, + i, + col.schema_idx)); } remaining_rows -= row_group_rows; @@ -485,8 +484,7 @@ void reader::impl::setup_next_pass() auto chunk_start = _file_itm_data.chunks.begin() + (row_group_start * chunks_per_rowgroup); auto chunk_end = _file_itm_data.chunks.begin() + (row_group_end * chunks_per_rowgroup); - _pass_itm_data->chunks = - cudf::detail::hostdevice_vector(num_chunks, _stream); + _pass_itm_data->chunks = cudf::detail::hostdevice_vector(num_chunks, _stream); std::copy(chunk_start, chunk_end, _pass_itm_data->chunks.begin()); // adjust skip_rows and num_rows by what's available in the row groups we are processing @@ -496,29 +494,32 @@ void reader::impl::setup_next_pass() } else { auto const global_start_row = _file_itm_data.global_skip_rows; auto const global_end_row = global_start_row + _file_itm_data.global_num_rows; - auto const start_row = std::max(_file_itm_data.input_pass_row_count[_current_input_pass], global_start_row); - auto const end_row = std::min(_file_itm_data.input_pass_row_count[_current_input_pass + 1], global_end_row); + auto const start_row = + std::max(_file_itm_data.input_pass_row_count[_current_input_pass], global_start_row); + auto const end_row = + std::min(_file_itm_data.input_pass_row_count[_current_input_pass + 1], global_end_row); // skip_rows is always global in the sense that it is relative to the first row of // everything we will be reading, regardless of what pass we are on. // num_rows is how many rows we are reading this pass. - _pass_itm_data->skip_rows = global_start_row + _file_itm_data.input_pass_row_count[_current_input_pass]; - _pass_itm_data->num_rows = end_row - start_row; + _pass_itm_data->skip_rows = + global_start_row + _file_itm_data.input_pass_row_count[_current_input_pass]; + _pass_itm_data->num_rows = end_row - start_row; } } void reader::impl::compute_splits_for_pass() { auto const skip_rows = _pass_itm_data->skip_rows; - auto const num_rows = _pass_itm_data->num_rows; + auto const num_rows = _pass_itm_data->num_rows; // simple case : no chunk size, no splits - if(_output_chunk_read_limit <= 0){ + if (_output_chunk_read_limit <= 0) { _pass_itm_data->output_chunk_read_info = std::vector{{skip_rows, num_rows}}; return; } - auto& pages = _pass_itm_data->pages_info; + auto& pages = _pass_itm_data->pages_info; auto const& page_keys = _pass_itm_data->page_keys; auto const& page_index = _pass_itm_data->page_index; @@ -539,10 +540,8 @@ void reader::impl::compute_splits_for_pass() // sort by row count rmm::device_uvector c_info_sorted{c_info, _stream}; - thrust::sort(rmm::exec_policy(_stream), - c_info_sorted.begin(), - c_info_sorted.end(), - row_count_compare{}); + thrust::sort( + rmm::exec_policy(_stream), c_info_sorted.begin(), c_info_sorted.end(), row_count_compare{}); // std::vector h_c_info_sorted(c_info_sorted.size()); // CUDF_CUDA_TRY(cudaMemcpy(h_c_info_sorted.data(), @@ -592,7 +591,8 @@ void reader::impl::compute_splits_for_pass() _stream.synchronize(); // generate the actual splits - _pass_itm_data->output_chunk_read_info = find_splits(h_aggregated_info, num_rows, _output_chunk_read_limit); + _pass_itm_data->output_chunk_read_info = + find_splits(h_aggregated_info, num_rows, _output_chunk_read_limit); } } // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_chunking.hpp b/cpp/src/io/parquet/reader_impl_chunking.hpp index f3c595a9a2b..29a91c4cb00 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.hpp +++ b/cpp/src/io/parquet/reader_impl_chunking.hpp @@ -84,4 +84,4 @@ struct pass_intermediate_data { size_t num_rows; }; -} // namespace cudf::io::parquet::detail +} // namespace cudf::io::parquet::detail From 79ae066888c597bab034df511044e1ed2f654be0 Mon Sep 17 00:00:00 2001 From: db Date: Mon, 9 Oct 2023 16:50:47 -0500 Subject: [PATCH 5/6] Remove unnecessary comment block. --- cpp/src/io/parquet/reader_impl.hpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index 22217b55411..cea4ba35606 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -247,14 +247,6 @@ class reader::impl { */ void decode_page_data(size_t skip_rows, size_t num_rows); - /* - * - * - Functions related to computing chunks and passes (reader_impl_chunking.cu) - * - * - */ - /** * @brief Creates file-wide parquet chunk information. * From 85b1e839eaf2719673cf5d1fda6bb092c6d56ae8 Mon Sep 17 00:00:00 2001 From: db Date: Tue, 10 Oct 2023 10:04:07 -0500 Subject: [PATCH 6/6] Change include file ordering --- cpp/src/io/parquet/reader_impl_chunking.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl_chunking.hpp b/cpp/src/io/parquet/reader_impl_chunking.hpp index 29a91c4cb00..dfc239d8451 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.hpp +++ b/cpp/src/io/parquet/reader_impl_chunking.hpp @@ -16,10 +16,10 @@ #pragma once -#include - #include "reader_impl_helpers.hpp" +#include + namespace cudf::io::parquet::detail { /**