From a8ae8cc88bd46ab0229361395d942833cd188435 Mon Sep 17 00:00:00 2001
From: Alessandro Bellina
Date: Fri, 10 Mar 2023 14:48:26 -0600
Subject: [PATCH 01/31] Chunked pack implementation

Co-authored-by: Dave Baranec
Signed-off-by: Alessandro Bellina
---
 cpp/benchmarks/copying/contiguous_split.cu   |   92 +-
 cpp/include/cudf/contiguous_split.hpp        |  135 ++
 cpp/include/cudf/detail/contiguous_split.hpp |    2 +-
 cpp/src/copying/contiguous_split.cu          | 1435 +++++++++++++-----
 cpp/src/copying/pack.cpp                     |    2 +
 cpp/tests/copying/split_tests.cpp            |  953 +++++++++---
 6 files changed, 2033 insertions(+), 586 deletions(-)

diff --git a/cpp/benchmarks/copying/contiguous_split.cu b/cpp/benchmarks/copying/contiguous_split.cu
index aff90039cb9..5b95370faf0 100644
--- a/cpp/benchmarks/copying/contiguous_split.cu
+++ b/cpp/benchmarks/copying/contiguous_split.cu
@@ -25,12 +25,32 @@

 #include

-template <typename T>
+void contiguous_split(cudf::table_view const& src_table, std::vector<cudf::size_type> const& splits)
+{
+  auto result = cudf::contiguous_split(src_table, splits);
+}
+
+void chunked_pack(cudf::table_view const& src_table, std::vector<cudf::size_type> const&)
+{
+  auto mr     = rmm::mr::get_current_device_resource();
+  auto stream = cudf::get_default_stream();
+  rmm::device_buffer user_buffer(100L * 1024 * 1024, stream, mr);
+  auto chunked_pack = cudf::make_chunked_pack(src_table, user_buffer.size(), mr);
+  auto user_buffer_span =
+    cudf::device_span<uint8_t>(static_cast<uint8_t*>(user_buffer.data()), user_buffer.size());
+  while (chunked_pack->has_next()) {
+    auto iter_size = chunked_pack->next(user_buffer_span);
+  }
+  stream.synchronize();
+}
+
+template <typename T, typename ContigSplitImpl>
 void BM_contiguous_split_common(benchmark::State& state,
                                 std::vector<T>& src_cols,
                                 int64_t num_rows,
                                 int64_t num_splits,
-                                int64_t bytes_total)
+                                int64_t bytes_total,
+                                ContigSplitImpl& impl)
 {
   // generate splits
   std::vector<cudf::size_type> splits;
@@ -57,7 +77,7 @@ void BM_contiguous_split_common(benchmark::State& state,

   for (auto _ : state) {
     cuda_event_timer raii(state, true);  // flush_l2_cache = true, stream = 0
-    auto result = cudf::contiguous_split(src_table, splits);
+    impl(src_table, splits);
   }

   // it's 2x bytes_total because we're both reading and writing.
@@ -65,8 +85,10 @@ void BM_contiguous_split_common(benchmark::State& state,
 }

 class ContiguousSplit : public cudf::benchmark {};
+class ChunkedPack : public cudf::benchmark {};

-void BM_contiguous_split(benchmark::State& state)
+template <typename ContiguousSplitImpl>
+void BM_contiguous_split(benchmark::State& state, ContiguousSplitImpl& impl)
 {
   int64_t const total_desired_bytes = state.range(0);
   cudf::size_type const num_cols    = state.range(1);
@@ -91,12 +113,14 @@ void BM_contiguous_split(benchmark::State& state)
     (include_validity ? (max(int64_t{1}, (num_rows / 32)) * sizeof(cudf::bitmask_type) * num_cols)
                       : 0);

-  BM_contiguous_split_common(state, src_cols, num_rows, num_splits, total_bytes);
+  BM_contiguous_split_common(state, src_cols, num_rows, num_splits, total_bytes, impl);
 }

 class ContiguousSplitStrings : public cudf::benchmark {};
+class ChunkedPackStrings : public cudf::benchmark {};

-void BM_contiguous_split_strings(benchmark::State& state)
+template <typename ContiguousSplitImpl>
+void BM_contiguous_split_strings(benchmark::State& state, ContiguousSplitImpl& impl)
 {
   int64_t const total_desired_bytes = state.range(0);
   cudf::size_type const num_cols    = state.range(1);
@@ -133,13 +157,13 @@ void BM_contiguous_split_strings(benchmark::State& state)
     (include_validity ?
(max(int64_t{1}, (num_rows / 32)) * sizeof(cudf::bitmask_type) * num_cols)
                       : 0);

-  BM_contiguous_split_common(state, src_cols, num_rows, num_splits, total_bytes);
+  BM_contiguous_split_common(state, src_cols, num_rows, num_splits, total_bytes, impl);
 }

 #define CSBM_BENCHMARK_DEFINE(name, size, num_columns, num_splits, validity) \
   BENCHMARK_DEFINE_F(ContiguousSplit, name)(::benchmark::State & state)      \
   {                                                                          \
-    BM_contiguous_split(state);                                              \
+    BM_contiguous_split(state, contiguous_split);                            \
   }                                                                          \
   BENCHMARK_REGISTER_F(ContiguousSplit, name)                                \
     ->Args({size, num_columns, num_splits, validity})                        \
@@ -168,7 +192,7 @@ CSBM_BENCHMARK_DEFINE(1Gb1ColValidityNoSplits, (int64_t)1 * 1024 * 1024 * 1024,
 #define CSBM_STRINGS_BENCHMARK_DEFINE(name, size, num_columns, num_splits, validity) \
   BENCHMARK_DEFINE_F(ContiguousSplitStrings, name)(::benchmark::State & state)       \
   {                                                                                  \
-    BM_contiguous_split_strings(state);                                              \
+    BM_contiguous_split_strings(state, contiguous_split);                            \
   }                                                                                  \
   BENCHMARK_REGISTER_F(ContiguousSplitStrings, name)                                 \
     ->Args({size, num_columns, num_splits, validity})                                \
@@ -189,3 +213,53 @@ CSBM_STRINGS_BENCHMARK_DEFINE(1Gb10ColsNoValidity, (int64_t)1 * 1024 * 1024 * 10
 CSBM_STRINGS_BENCHMARK_DEFINE(1Gb10ColsValidity, (int64_t)1 * 1024 * 1024 * 1024, 10, 256, 1);
 CSBM_STRINGS_BENCHMARK_DEFINE(1Gb1ColNoSplits, (int64_t)1 * 1024 * 1024 * 1024, 1, 0, 0);
 CSBM_STRINGS_BENCHMARK_DEFINE(1Gb1ColValidityNoSplits, (int64_t)1 * 1024 * 1024 * 1024, 1, 0, 1);
+
+#define CCSBM_BENCHMARK_DEFINE(name, size, num_columns, num_splits, validity) \
+  BENCHMARK_DEFINE_F(ChunkedPack, name)(::benchmark::State & state)           \
+  {                                                                           \
+    BM_contiguous_split(state, chunked_pack);                                 \
+  }                                                                           \
+  BENCHMARK_REGISTER_F(ChunkedPack, name)                                     \
+    ->Args({size, num_columns, num_splits, validity})                         \
+    ->Unit(benchmark::kMillisecond)                                           \
+    ->UseManualTime()                                                         \
+    ->Iterations(8)
+CCSBM_BENCHMARK_DEFINE(6Gb512ColsNoValidity, (int64_t)6 * 1024 * 1024 * 1024, 512, 0, 0);
+CCSBM_BENCHMARK_DEFINE(6Gb512ColsValidity, (int64_t)6 * 1024 * 1024 * 1024, 512, 0, 1);
+CCSBM_BENCHMARK_DEFINE(6Gb10ColsNoValidity, (int64_t)6 * 1024 * 1024 * 1024, 10, 0, 0);
+CCSBM_BENCHMARK_DEFINE(6Gb10ColsValidity, (int64_t)6 * 1024 * 1024 * 1024, 10, 0, 1);
+
+CCSBM_BENCHMARK_DEFINE(4Gb512ColsNoValidity, (int64_t)4 * 1024 * 1024 * 1024, 512, 0, 0);
+CCSBM_BENCHMARK_DEFINE(4Gb512ColsValidity, (int64_t)4 * 1024 * 1024 * 1024, 512, 0, 1);
+CCSBM_BENCHMARK_DEFINE(4Gb10ColsNoValidity, (int64_t)4 * 1024 * 1024 * 1024, 10, 0, 0);
+CCSBM_BENCHMARK_DEFINE(4Gb10ColsValidity, (int64_t)4 * 1024 * 1024 * 1024, 10, 0, 1);
+CCSBM_BENCHMARK_DEFINE(4Gb4ColsValidity, (int64_t)4 * 1024 * 1024 * 1024, 4, 0, 1);
+
+CCSBM_BENCHMARK_DEFINE(1Gb512ColsNoValidity, (int64_t)1 * 1024 * 1024 * 1024, 512, 0, 0);
+CCSBM_BENCHMARK_DEFINE(1Gb512ColsValidity, (int64_t)1 * 1024 * 1024 * 1024, 512, 0, 1);
+CCSBM_BENCHMARK_DEFINE(1Gb10ColsNoValidity, (int64_t)1 * 1024 * 1024 * 1024, 10, 0, 0);
+CCSBM_BENCHMARK_DEFINE(1Gb10ColsValidity, (int64_t)1 * 1024 * 1024 * 1024, 10, 0, 1);
+CCSBM_BENCHMARK_DEFINE(1Gb1ColValidity, (int64_t)1 * 1024 * 1024 * 1024, 1, 0, 1);
+
+#define CCSBM_STRINGS_BENCHMARK_DEFINE(name, size, num_columns, num_splits, validity) \
+  BENCHMARK_DEFINE_F(ChunkedPackStrings, name)(::benchmark::State & state)            \
+  {                                                                                   \
+    BM_contiguous_split_strings(state, chunked_pack);                                 \
+  }                                                                                   \
+  BENCHMARK_REGISTER_F(ChunkedPackStrings, name)                                      \
+    ->Args({size, num_columns, num_splits, validity})                                 \
+    ->Unit(benchmark::kMillisecond)                                                   \
+    ->UseManualTime()                                                                 \
+    ->Iterations(8)
+
+CCSBM_STRINGS_BENCHMARK_DEFINE(4Gb512ColsNoValidity, (int64_t)4 * 1024 * 1024 * 1024, 512, 0, 0);
+CCSBM_STRINGS_BENCHMARK_DEFINE(4Gb512ColsValidity, (int64_t)4 * 1024 * 1024 * 1024, 512, 0, 1);
+CCSBM_STRINGS_BENCHMARK_DEFINE(4Gb10ColsNoValidity, (int64_t)4 * 1024 * 1024 * 1024, 10, 0, 0);
+CCSBM_STRINGS_BENCHMARK_DEFINE(4Gb10ColsValidity, (int64_t)4 * 1024 * 1024 * 1024, 10, 0, 1);
+CCSBM_STRINGS_BENCHMARK_DEFINE(4Gb4ColsValidity, (int64_t)4 * 1024 * 1024 * 1024, 4, 0, 1);
+
+CCSBM_STRINGS_BENCHMARK_DEFINE(1Gb512ColsNoValidity, (int64_t)1 * 1024 * 1024 * 1024, 512, 0, 0);
+CCSBM_STRINGS_BENCHMARK_DEFINE(1Gb512ColsValidity, (int64_t)1 * 1024 * 1024 * 1024, 512, 0, 1);
+CCSBM_STRINGS_BENCHMARK_DEFINE(1Gb10ColsNoValidity, (int64_t)1 * 1024 * 1024 * 1024, 10, 0, 0);
+CCSBM_STRINGS_BENCHMARK_DEFINE(1Gb10ColsValidity, (int64_t)1 * 1024 * 1024 * 1024, 10, 0, 1);
+CCSBM_STRINGS_BENCHMARK_DEFINE(1Gb1ColValidity, (int64_t)1 * 1024 * 1024 * 1024, 1, 0, 1);
diff --git a/cpp/include/cudf/contiguous_split.hpp b/cpp/include/cudf/contiguous_split.hpp
index 62d668a98cb..eacfcfcb803 100644
--- a/cpp/include/cudf/contiguous_split.hpp
+++ b/cpp/include/cudf/contiguous_split.hpp
@@ -127,6 +127,141 @@ std::vector<packed_table> contiguous_split(
   std::vector<size_type> const& splits,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

+namespace detail {
+struct contiguous_split_state;
+};
+
+/**
+ * @brief Perform a chunked "pack" operation of the input `table_view` using a user-provided
+ * buffer of size `user_buffer_size`.
+ *
+ * The intent of this operation is to be used in a streamed fashion at times of GPU
+ * out-of-memory, where we want to minimize the number of small cudaMemcpy calls and
+ * tracking of all the metadata associated with cudf tables. Because of the memory constraints,
+ * all thrust and scratch memory allocations use the passed-in memory resource exclusively,
+ * not a per-device memory resource.
+ *
+ * The caller uses two methods to carry out the chunked pack: `has_next` and `next`.
+ * Here is an example:
+ *
+ * // Create a table_view
+ * cudf::table_view tv = ...;
+ *
+ * // Choose a memory resource. This memory resource is used for scratch/thrust temporary
+ * // data. In memory-constrained cases, this can be used to set aside scratch memory
+ * // for `chunked_pack` at the beginning of a program.
+ * auto mr = rmm::mr::get_current_device_resource();
+ *
+ * auto stream = cudf::get_default_stream();
+ *
+ * // Define a bounce buffer size: the larger the bounce buffer is, the more SMs can be
+ * // occupied by this algorithm.
+ * std::size_t user_buffer_size = 128*1024*1024;
+ *
+ * auto chunked_packer = make_chunked_pack(tv, user_buffer_size, mr);
+ *
+ * std::size_t host_offset = 0;
+ * auto host_buffer = ...; // obtain a host buffer you would like to copy to
+ *
+ * while (chunked_packer->has_next()) {
+ *   // get a user buffer of size `user_buffer_size`
+ *   cudf::device_span<uint8_t> user_buffer = ...;
+ *   std::size_t bytes_copied = chunked_packer->next(user_buffer);
+ *
+ *   // buffer will hold the contents of at most `user_buffer_size` bytes
+ *   // of the contiguously packed input `table_view`. You are now free to copy
+ *   // this memory somewhere else, for example, to host.
+ *   cudaMemcpyAsync(
+ *     host_buffer.data() + host_offset,
+ *     user_buffer.data(),
+ *     bytes_copied,
+ *     cudaMemcpyDefault,
+ *     stream);
+ *
+ *   host_offset += bytes_copied;
+ * }
+ */
+class chunked_pack {
+ public:
+  /**
+   * @brief Construct a `chunked_pack` class.
+   *
+   * @param input source `table_view` to pack
+   * @param user_buffer_size bounce buffer size (in bytes) that will be passed on `next`. Must be
+   * at least 1MB
+   * @param stream CUDA stream used for device memory operations and kernel launches
+   * @param mr RMM memory resource to be used for temporary and scratch allocations only
+   */
+  explicit chunked_pack(cudf::table_view const& input,
+                        std::size_t user_buffer_size,
+                        rmm::cuda_stream_view stream,
+                        rmm::mr::device_memory_resource* mr);
+
+  /**
+   * @brief Destructor that will be implemented as default, required because
+   * contiguous_split_state is incomplete at this stage.
+   */
+  ~chunked_pack();
+
+  /**
+   * @brief Obtain the total size of the contiguously packed `table_view`.
+   *
+   * @return total size (in bytes) of all the chunks
+   */
+  [[nodiscard]] std::size_t get_total_contiguous_size() const;
+
+  /**
+   * @brief Function to check if there are chunks left to be copied.
+   *
+   * @return true if there are chunks left to be copied, and false otherwise
+   */
+  [[nodiscard]] bool has_next() const;
+
+  /**
+   * @brief Packs the next chunk into `user_buffer`. This should be called as long as
+   * `has_next` returns true. If `next` is called when `has_next` is false, an exception
+   * is thrown.
+   *
+   * @throws cudf::logic_error If the size of `user_buffer` is different than `user_buffer_size`
+   * @throws cudf::logic_error If called after all chunks have been copied
+   *
+   * @param user_buffer device span representing a bounce buffer. The size of this span
+   * must equal the `user_buffer_size` parameter passed at construction
+   * @return The number of bytes that were written to `user_buffer` (at most
+   * `user_buffer_size`)
+   */
+  [[nodiscard]] std::size_t next(cudf::device_span<uint8_t> const& user_buffer);
+
+  /**
+   * @brief Build the opaque metadata for all added columns.
+   *
+   * @return A vector containing the serialized column metadata
+   */
+  [[nodiscard]] std::unique_ptr<std::vector<uint8_t>> build_metadata() const;
+
+ private:
+  // internal state of contiguous split
+  std::unique_ptr<detail::contiguous_split_state> state;
+};
+
+/**
+ * @brief Creates a `chunked_pack` instance to perform a "pack" of the `table_view`
+ * `input`, where a bounce buffer of `user_buffer_size` is filled with chunks of the
+ * overall operation. This operation can be used in cases where GPU memory is constrained.
+ *
+ * @throws cudf::logic_error When user_buffer_size is less than 1MB
+ *
+ * @param input source `table_view` to pack
+ * @param user_buffer_size bounce buffer size (in bytes) that will be passed on `next`. Must be
+ * at least 1MB
+ * @param mr RMM memory resource to be used for temporary and scratch allocations only
+ * @return a unique_ptr of chunked_pack
+ */
+std::unique_ptr<chunked_pack> make_chunked_pack(cudf::table_view const& input,
+                                                std::size_t user_buffer_size,
+                                                rmm::mr::device_memory_resource* mr);
+
 /**
  * @brief Deep-copy a `table_view` into a serialized contiguous memory format.
  *
diff --git a/cpp/include/cudf/detail/contiguous_split.hpp b/cpp/include/cudf/detail/contiguous_split.hpp
index 4c6d19739cf..8ba76a5e8af 100644
--- a/cpp/include/cudf/detail/contiguous_split.hpp
+++ b/cpp/include/cudf/detail/contiguous_split.hpp
@@ -67,7 +67,7 @@ class metadata_builder {
    * @brief Destructor that will be implemented as default, required because metadata_builder_impl
    * is incomplete at this stage.
    */
-  ~metadata_builder() = default;
+  ~metadata_builder();

   /**
    * @brief Add a column to this metadata builder.
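With the declarations above in hand, a consumer drives the whole pack through the bounce
buffer in a loop. The following is a minimal sketch, not part of this patch: `pack_to_host`
is a hypothetical helper, and it assumes only the calls declared in this header
(`make_chunked_pack`, `has_next`, `next`, `get_total_contiguous_size`, `build_metadata`)
plus standard CUDA and RMM facilities:

    #include <cudf/contiguous_split.hpp>
    #include <cudf/utilities/default_stream.hpp>

    #include <rmm/device_buffer.hpp>
    #include <rmm/mr/device/per_device_resource.hpp>

    #include <cuda_runtime.h>

    #include <vector>

    // Hypothetical helper: stream `tv` through a device bounce buffer into host memory.
    std::vector<uint8_t> pack_to_host(cudf::table_view const& tv)
    {
      auto stream = cudf::get_default_stream();
      auto mr     = rmm::mr::get_current_device_resource();

      std::size_t const bounce_size = 1024 * 1024;  // the documented 1MB minimum
      rmm::device_buffer bounce(bounce_size, stream, mr);
      auto packer = cudf::make_chunked_pack(tv, bounce_size, mr);

      std::vector<uint8_t> host_data(packer->get_total_contiguous_size());
      std::size_t offset = 0;
      while (packer->has_next()) {
        // fill the bounce buffer with the next chunk, then drain it to host
        auto const copied = packer->next(
          cudf::device_span<uint8_t>(static_cast<uint8_t*>(bounce.data()), bounce_size));
        cudaMemcpyAsync(
          host_data.data() + offset, bounce.data(), copied, cudaMemcpyDeviceToHost, stream.value());
        offset += copied;
      }
      stream.synchronize();
      // packer->build_metadata() returns the serialized metadata needed to
      // reassemble a table over this contiguous buffer later.
      return host_data;
    }

Note that the bounce buffer is reused across iterations, so the host copy must drain it
before the next call to `next`; enqueuing both on the same stream guarantees that ordering.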
diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu
index 4c3b4eddb8d..e94ce1d8156 100644
--- a/cpp/src/copying/contiguous_split.cu
+++ b/cpp/src/copying/contiguous_split.cu
@@ -21,7 +21,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
@@ -43,6 +42,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -270,7 +270,7 @@ __device__ void copy_buffer(uint8_t* __restrict__ dst,
  *
  * @param src_bufs Input source buffers
  * @param dst_bufs Destination buffers
- * @param buf_info Information on the range of values to be copied for each destination buffer.
+ * @param buf_info Information on the range of values to be copied for each destination buffer
  */
 template <int block_size>
 __global__ void copy_partitions(uint8_t const** src_bufs,
@@ -296,6 +296,43 @@ __global__ void copy_partitions(uint8_t const** src_bufs,
     buf_info[buf_index].valid_count > 0 ? &buf_info[buf_index].valid_count : nullptr);
 }

+/**
+ * @brief Kernel which copies data from multiple source buffers into a single
+ * user-provided destination (bounce) buffer.
+ *
+ * When doing a contiguous_split on X columns comprising N total internal buffers
+ * with M splits, we end up having to copy N*M source/destination buffer pairs.
+ * These copies are further subdivided into batches to distribute the amount of work
+ * to be done as evenly as possible across the multiprocessors on the device.
+ * This kernel is arranged such that each block copies 1 batch.
+ *
+ * @param src_bufs Input source buffers
+ * @param user_buffer Destination buffer; each batch is written at its precomputed
+ * `dst_offset` within this buffer
+ * @param buf_info Information on the range of values to be copied for each batch
+ */
+template <int block_size>
+__global__ void copy_partitions(uint8_t const** src_bufs,
+                                uint8_t* user_buffer,
+                                dst_buf_info* buf_info)
+{
+  auto const buf_index     = blockIdx.x;
+  auto const src_buf_index = buf_info[buf_index].src_buf_index;
+
+  // copy, shifting offsets and validity bits as needed
+  copy_buffer(
+    user_buffer + buf_info[buf_index].dst_offset,
+    src_bufs[src_buf_index],
+    threadIdx.x,
+    buf_info[buf_index].num_elements,
+    buf_info[buf_index].element_size,
+    buf_info[buf_index].src_element_index,
+    blockDim.x,
+    buf_info[buf_index].value_shift,
+    buf_info[buf_index].bit_shift,
+    buf_info[buf_index].num_rows,
+    buf_info[buf_index].valid_count > 0 ? &buf_info[buf_index].valid_count : nullptr);
+}
+
 // The block of functions below are all related:
 //
 // compute_offset_stack_size()
@@ -336,7 +373,7 @@ bool is_offset_type(type_id id) { return (id == type_id::STRING or id == type_id
  * @param end End of input columns
  * @param offset_depth Current offset nesting depth
  *
- * @returns Total offset stack size needed for this range of columns.
+ * @returns Total offset stack size needed for this range of columns
  */
 template <typename InputIter>
 std::size_t compute_offset_stack_size(InputIter begin, InputIter end, int offset_depth = 0)
@@ -400,7 +437,8 @@ template <typename InputIter>
 size_type count_src_bufs(InputIter begin, InputIter end)
 {
   auto buf_iter = thrust::make_transform_iterator(begin, [](column_view const& col) {
-    return 1 + (col.nullable() ? 1 : 0) + count_src_bufs(col.child_begin(), col.child_end());
+    auto children_counts = count_src_bufs(col.child_begin(), col.child_end());
+    return 1 + (col.nullable() ?
1 : 0) + children_counts; }); return std::accumulate(buf_iter, buf_iter + std::distance(begin, end), 0); } @@ -661,6 +699,66 @@ std::pair setup_source_buf_info(InputIter begin, return {current, offset_stack_pos}; } +/** + * @brief Given a column, processed split buffers, and a metadata builder, populate + * the metadata for this column in the builder, and return a tuple of: + * column size, data offset, bitmask offset and null count. + * + * @param src column_view to create metadata from + * @param current_info dst_buf_info pointer reference, pointing to this column's buffer info + * This is a pointer reference because it is updated by this function as the + * columns's validity and data buffers are visited + * @param mb A metadata_builder instance to update with the column's packed metadata + * @param use_src_null_count True for the chunked_pack case where current_info has invalid null + * count information. The null count should be taken + * from `src` because this case is restricted to a single partition + * (no splits) + * @returns a std::tuple containing: + * column size, data offset, bitmask offset, and null count + */ +template +std::tuple build_output_column_metadata( + column_view const& src, + BufInfo& current_info, + detail::metadata_builder& mb, + bool use_src_null_count) +{ + auto [bitmask_offset, null_count] = [&]() { + if (src.nullable()) { + // offsets in the existing serialized_column metadata are int64_t + // that's the reason for the casting in this code. + int64_t const bitmask_offset = + current_info->num_elements == 0 + ? -1 // this means that the bitmask buffer pointer should be nullptr + : static_cast(current_info->dst_offset); + + // use_src_null_count is used for the chunked contig split case, where we have + // no splits: the null_count is just the source column's null_count + size_type const null_count = use_src_null_count + ? src.null_count() + : (current_info->num_elements == 0 + ? 0 + : (current_info->num_rows - current_info->valid_count)); + + ++current_info; + return std::pair(bitmask_offset, null_count); + } + return std::pair(static_cast(-1), 0); + }(); + + // size/data pointer for the column + auto const col_size = static_cast(current_info->num_elements); + int64_t const data_offset = src.num_children() > 0 || col_size == 0 || src.head() == nullptr + ? -1 + : static_cast(current_info->dst_offset); + + mb.add_column_info_to_meta( + src.type(), col_size, null_count, data_offset, bitmask_offset, src.num_children()); + + ++current_info; + return std::make_tuple(col_size, data_offset, bitmask_offset, null_count); +} + /** * @brief Given a set of input columns and processed split buffers, produce * output columns. @@ -678,6 +776,7 @@ std::pair setup_source_buf_info(InputIter begin, * copied buffer * @param out_begin Output iterator of column views * @param base_ptr Pointer to the base address of copied data for the working partition + * @param mb packed column metadata builder * * @returns new dst_buf_info iterator after processing this range of input columns */ @@ -686,39 +785,72 @@ BufInfo build_output_columns(InputIter begin, InputIter end, BufInfo info_begin, Output out_begin, - uint8_t const* const base_ptr) + uint8_t const* const base_ptr, + detail::metadata_builder& mb) { auto current_info = info_begin; - std::transform(begin, end, out_begin, [¤t_info, base_ptr](column_view const& src) { - auto [bitmask_ptr, null_count] = [&]() { - if (src.nullable()) { - auto const ptr = - current_info->num_elements == 0 - ? 
nullptr - : reinterpret_cast(base_ptr + current_info->dst_offset); - auto const null_count = current_info->num_elements == 0 - ? 0 - : (current_info->num_rows - current_info->valid_count); - ++current_info; - return std::pair(ptr, null_count); - } - return std::pair(static_cast(nullptr), 0); - }(); + std::transform(begin, end, out_begin, [¤t_info, base_ptr, &mb](column_view const& src) { + size_type col_size, null_count; + int64_t bitmask_offset; + int64_t data_offset; + std::tie(col_size, data_offset, bitmask_offset, null_count) = + build_output_column_metadata(src, current_info, mb, false); + + auto bitmask_ptr = + base_ptr != nullptr && bitmask_offset != -1 + ? reinterpret_cast(base_ptr + static_cast(bitmask_offset)) + : nullptr; // size/data pointer for the column - auto const size = current_info->num_elements; - uint8_t const* data_ptr = - size == 0 || src.head() == nullptr ? nullptr : base_ptr + current_info->dst_offset; - ++current_info; + uint8_t const* data_ptr = base_ptr != nullptr && data_offset != -1 + ? base_ptr + static_cast(data_offset) + : nullptr; // children auto children = std::vector{}; children.reserve(src.num_children()); current_info = build_output_columns( - src.child_begin(), src.child_end(), current_info, std::back_inserter(children), base_ptr); + src.child_begin(), src.child_end(), current_info, std::back_inserter(children), base_ptr, mb); + + return column_view{ + src.type(), col_size, data_ptr, bitmask_ptr, null_count, 0, std::move(children)}; + }); + + return current_info; +} + +/** + * @brief Given a set of input columns, processed split buffers, and a metadata_builder, + * append column metadata using the builder. + * + * After performing the split we are left with 1 large buffer per incoming split + * partition. We need to traverse this buffer and distribute the individual + * subpieces that represent individual columns and children to produce the final + * output columns. + * + * This function is called recursively in the case of nested types. + * + * @param begin Beginning of input columns + * @param end End of input columns + * @param info_begin Iterator of dst_buf_info structs containing information about each + * copied buffer + * @param mb packed column metadata builder + * + * @returns new dst_buf_info iterator after processing this range of input columns + */ +template +BufInfo populate_metadata(InputIter begin, + InputIter end, + BufInfo info_begin, + detail::metadata_builder& mb) +{ + auto current_info = info_begin; + std::for_each(begin, end, [¤t_info, &mb](column_view const& src) { + build_output_column_metadata(src, current_info, mb, true); - return column_view{src.type(), size, data_ptr, bitmask_ptr, null_count, 0, std::move(children)}; + // children + current_info = populate_metadata(src.child_begin(), src.child_end(), current_info, mb); }); return current_info; @@ -739,8 +871,8 @@ struct buf_size_functor { * The key is simply the partition index. */ struct split_key_functor { - int num_columns; - int operator() __device__(int buf_index) { return buf_index / num_columns; } + int num_src_bufs; + int operator() __device__(int buf_index) { return buf_index / num_src_bufs; } }; /** @@ -813,282 +945,278 @@ struct size_of_helper { }; /** - * @brief Functor for returning the number of chunks an input buffer is being + * @brief Functor for returning the number of batches an input buffer is being * subdivided into during the repartitioning step. * * Note: columns types which themselves inherently have no data (strings, lists, * structs) return 0. 
*/ -struct num_chunks_func { - thrust::pair const* chunks; - __device__ std::size_t operator()(size_type i) const { return thrust::get<0>(chunks[i]); } +struct num_batches_func { + thrust::pair const* batches; + __device__ std::size_t operator()(size_type i) const { return thrust::get<0>(batches[i]); } }; -void copy_data(int num_bufs, - int num_src_bufs, - uint8_t const** d_src_bufs, - uint8_t** d_dst_bufs, - dst_buf_info* _d_dst_buf_info, - rmm::cuda_stream_view stream) -{ - // Since we parallelize at one block per copy, we are vulnerable to situations where we - // have small numbers of copies to do (a combination of small numbers of splits and/or columns), - // so we will take the actual set of outgoing source/destination buffers and further partition - // them into much smaller chunks in order to drive up the number of blocks and overall occupancy. - auto const desired_chunk_size = std::size_t{1 * 1024 * 1024}; - rmm::device_uvector> chunks(num_bufs, stream); - thrust::transform( - rmm::exec_policy(stream), - _d_dst_buf_info, - _d_dst_buf_info + num_bufs, - chunks.begin(), - [desired_chunk_size] __device__( - dst_buf_info const& buf) -> thrust::pair { - // Total bytes for this incoming partition +/** + * @brief Get the size in bytes of a batch described by `dst_buf_info`. + */ +struct batch_byte_size_function { + size_type num_batches; + dst_buf_info const* infos; + __device__ std::size_t operator()(size_type i) const + { + if (i == num_batches) { + return 0; + } else { + auto& buf = *(infos + i); std::size_t const bytes = static_cast(buf.num_elements) * static_cast(buf.element_size); + return util::round_up_unsafe(bytes, split_align); + } + } +}; - // This clause handles nested data types (e.g. list or string) that store no data in the row - // columns, only in their children. - if (bytes == 0) { return {1, 0}; } +/** + * @brief Get the input buffer index given the output buffer index. + */ +struct out_to_in_index_function { + offset_type const* batch_offsets; + int num_bufs; + __device__ int operator()(size_type i) const + { + return static_cast( + thrust::upper_bound(thrust::seq, batch_offsets, batch_offsets + num_bufs + 1, i) - + batch_offsets) - + 1; + } +}; - // The number of chunks we want to subdivide this buffer into - std::size_t const num_chunks = - max(std::size_t{1}, util::round_up_unsafe(bytes, desired_chunk_size) / desired_chunk_size); +}; // anonymous namespace - // NOTE: leaving chunk size as a separate parameter for future tuning - // possibilities, even though in the current implementation it will be a - // constant. - return {num_chunks, desired_chunk_size}; - }); +namespace detail { - rmm::device_uvector chunk_offsets(num_bufs + 1, stream); - auto buf_count_iter = cudf::detail::make_counting_transform_iterator( - 0, [num_bufs, num_chunks = num_chunks_func{chunks.begin()}] __device__(size_type i) { - return i == num_bufs ? 
0 : num_chunks(i); - }); - thrust::exclusive_scan(rmm::exec_policy(stream), - buf_count_iter, - buf_count_iter + num_bufs + 1, - chunk_offsets.begin(), - 0); +// packed block of memory 1: split indices and src_buf_info structs +struct packed_split_indices_and_src_buf_info { + explicit packed_split_indices_and_src_buf_info(cudf::table_view const& input, + std::vector const& splits, + std::size_t num_partitions, + cudf::size_type num_src_bufs, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + indices_size = cudf::util::round_up_safe((num_partitions + 1) * sizeof(size_type), split_align); + src_buf_info_size = cudf::util::round_up_safe(num_src_bufs * sizeof(src_buf_info), split_align); + + // host-side + h_indices_and_source_info = std::vector(indices_size + src_buf_info_size); + h_indices = reinterpret_cast(h_indices_and_source_info.data()); + h_src_buf_info = + reinterpret_cast(h_indices_and_source_info.data() + indices_size); + + // compute splits -> indices. + // these are row numbers per split + h_indices[0] = 0; + h_indices[num_partitions] = input.column(0).size(); + std::copy(splits.begin(), splits.end(), std::next(h_indices)); + + // setup source buf info + setup_source_buf_info(input.begin(), input.end(), h_src_buf_info, h_src_buf_info, stream); + + offset_stack_partition_size = compute_offset_stack_size(input.begin(), input.end()); + offset_stack_size = offset_stack_partition_size * num_partitions * sizeof(size_type); + // device-side + // gpu-only : stack space needed for nested list offset calculation + d_indices_and_source_info = + rmm::device_buffer(indices_size + src_buf_info_size + offset_stack_size, stream, mr); + d_indices = reinterpret_cast(d_indices_and_source_info.data()); + d_src_buf_info = reinterpret_cast( + reinterpret_cast(d_indices_and_source_info.data()) + indices_size); + d_offset_stack = + reinterpret_cast(reinterpret_cast(d_indices_and_source_info.data()) + + indices_size + src_buf_info_size); + + CUDF_CUDA_TRY(cudaMemcpyAsync( + d_indices, h_indices, indices_size + src_buf_info_size, cudaMemcpyDefault, stream.value())); + } - auto out_to_in_index = [chunk_offsets = chunk_offsets.begin(), num_bufs] __device__(size_type i) { - return static_cast( - thrust::upper_bound(thrust::seq, chunk_offsets, chunk_offsets + num_bufs + 1, i) - - chunk_offsets) - - 1; - }; - - // apply the chunking. 
- auto const num_chunks = - cudf::detail::make_counting_transform_iterator(0, num_chunks_func{chunks.begin()}); - size_type const new_buf_count = - thrust::reduce(rmm::exec_policy(stream), num_chunks, num_chunks + chunks.size()); - rmm::device_uvector d_dst_buf_info(new_buf_count, stream); - auto iter = thrust::make_counting_iterator(0); - thrust::for_each( - rmm::exec_policy(stream), - iter, - iter + new_buf_count, - [_d_dst_buf_info, - d_dst_buf_info = d_dst_buf_info.begin(), - chunks = chunks.begin(), - chunk_offsets = chunk_offsets.begin(), - num_bufs, - num_src_bufs, - out_to_in_index] __device__(size_type i) { - size_type const in_buf_index = out_to_in_index(i); - size_type const chunk_index = i - chunk_offsets[in_buf_index]; - auto const chunk_size = thrust::get<1>(chunks[in_buf_index]); - dst_buf_info const& in = _d_dst_buf_info[in_buf_index]; + size_type indices_size; + std::size_t src_buf_info_size; + std::size_t offset_stack_size; - // adjust info - dst_buf_info& out = d_dst_buf_info[i]; - out.element_size = in.element_size; - out.value_shift = in.value_shift; - out.bit_shift = in.bit_shift; - out.valid_count = - in.valid_count; // valid count will be set to 1 if this is a validity buffer - out.src_buf_index = in.src_buf_index; - out.dst_buf_index = in.dst_buf_index; + std::vector h_indices_and_source_info; + rmm::device_buffer d_indices_and_source_info; - size_type const elements_per_chunk = - out.element_size == 0 ? 0 : chunk_size / out.element_size; - out.num_elements = ((chunk_index + 1) * elements_per_chunk) > in.num_elements - ? in.num_elements - (chunk_index * elements_per_chunk) - : elements_per_chunk; + size_type* h_indices; + src_buf_info* h_src_buf_info; - size_type const rows_per_chunk = - // if this is a validity buffer, each element is a bitmask_type, which - // corresponds to 32 rows. - out.valid_count > 0 - ? elements_per_chunk * static_cast(detail::size_in_bits()) - : elements_per_chunk; - out.num_rows = ((chunk_index + 1) * rows_per_chunk) > in.num_rows - ? 
in.num_rows - (chunk_index * rows_per_chunk) - : rows_per_chunk; + int offset_stack_partition_size; + size_type* d_indices; + src_buf_info* d_src_buf_info; + size_type* d_offset_stack; +}; - out.src_element_index = in.src_element_index + (chunk_index * elements_per_chunk); - out.dst_offset = in.dst_offset + (chunk_index * chunk_size); +// packed block of memory 2: partition buffer sizes and dst_buf_info structs +struct packed_partition_buf_size_and_dst_buf_info { + packed_partition_buf_size_and_dst_buf_info(cudf::table_view const& input, + std::vector const& splits, + std::size_t num_partitions, + cudf::size_type num_src_bufs, + std::size_t num_bufs, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : stream(stream) + { + buf_sizes_size = cudf::util::round_up_safe(num_partitions * sizeof(std::size_t), split_align); + dst_buf_info_size = cudf::util::round_up_safe(num_bufs * sizeof(dst_buf_info), split_align); + // host-side + h_buf_sizes_and_dst_info = std::vector(buf_sizes_size + dst_buf_info_size); + h_buf_sizes = reinterpret_cast(h_buf_sizes_and_dst_info.data()); + h_dst_buf_info = + reinterpret_cast(h_buf_sizes_and_dst_info.data() + buf_sizes_size); + + // device-side + d_buf_sizes_and_dst_info = rmm::device_buffer(buf_sizes_size + dst_buf_info_size, stream, mr); + d_buf_sizes = reinterpret_cast(d_buf_sizes_and_dst_info.data()); + + //// destination buffer info + d_dst_buf_info = reinterpret_cast( + static_cast(d_buf_sizes_and_dst_info.data()) + buf_sizes_size); + } - // out.bytes and out.buf_size are unneeded here because they are only used to - // calculate real output buffer sizes. the data we are generating here is - // purely intermediate for the purposes of doing more uniform copying of data - // underneath the final structure of the output - }); + void copy_to_host() + { + // DtoH buf sizes and col info back to the host + CUDF_CUDA_TRY(cudaMemcpyAsync(h_buf_sizes, + d_buf_sizes, + buf_sizes_size + dst_buf_info_size, + cudaMemcpyDefault, + stream.value())); + } - // perform the copy - constexpr size_type block_size = 256; - copy_partitions<<>>( - d_src_bufs, d_dst_bufs, d_dst_buf_info.data()); - - // postprocess valid_counts - auto keys = cudf::detail::make_counting_transform_iterator( - 0, [out_to_in_index] __device__(size_type i) { return out_to_in_index(i); }); - auto values = thrust::make_transform_iterator( - d_dst_buf_info.begin(), [] __device__(dst_buf_info const& info) { return info.valid_count; }); - thrust::reduce_by_key(rmm::exec_policy(stream), - keys, - keys + new_buf_count, - values, - thrust::make_discard_iterator(), - dst_valid_count_output_iterator{_d_dst_buf_info}); -} + rmm::cuda_stream_view stream; -}; // anonymous namespace + // buffer sizes and destination info (used in batched copies) + std::size_t buf_sizes_size; + std::size_t dst_buf_info_size; -namespace detail { + std::vector h_buf_sizes_and_dst_info; + std::size_t* h_buf_sizes; + dst_buf_info* h_dst_buf_info; -std::vector contiguous_split(cudf::table_view const& input, - std::vector const& splits, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - if (input.num_columns() == 0) { return {}; } - if (splits.size() > 0) { - CUDF_EXPECTS(splits.back() <= input.column(0).size(), - "splits can't exceed size of input columns"); + rmm::device_buffer d_buf_sizes_and_dst_info; + std::size_t* d_buf_sizes; + dst_buf_info* d_dst_buf_info; +}; + +// Packed block of memory 3: +// Pointers to source and destination buffers (and stack space on the +// gpu for offset 
computation) +struct packed_src_and_dst_pointers { + packed_src_and_dst_pointers(cudf::table_view const& input, + std::size_t num_partitions, + cudf::size_type num_src_bufs, + int num_iterations, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : stream(stream) + { + src_bufs_size = cudf::util::round_up_safe(num_src_bufs * sizeof(uint8_t*), split_align); + + dst_bufs_size = cudf::util::round_up_safe(num_partitions * sizeof(uint8_t*), split_align); + + // host-side + h_src_and_dst_buffers = std::vector(src_bufs_size + dst_bufs_size); + h_src_bufs = reinterpret_cast(h_src_and_dst_buffers.data()); + h_dst_bufs = reinterpret_cast(h_src_and_dst_buffers.data() + src_bufs_size); + + // device-side + d_src_and_dst_buffers = rmm::device_buffer(src_bufs_size + dst_bufs_size, stream, mr); + d_src_bufs = reinterpret_cast(d_src_and_dst_buffers.data()); + d_dst_bufs = reinterpret_cast( + reinterpret_cast(d_src_and_dst_buffers.data()) + src_bufs_size); + + // setup src buffers + setup_src_buf_data(input.begin(), input.end(), h_src_bufs); } + + void copy_to_device() { - size_type begin = 0; - for (std::size_t i = 0; i < splits.size(); i++) { - size_type end = splits[i]; - CUDF_EXPECTS(begin >= 0, "Starting index cannot be negative."); - CUDF_EXPECTS(end >= begin, "End index cannot be smaller than the starting index."); - CUDF_EXPECTS(end <= input.column(0).size(), "Slice range out of bounds."); - begin = end; - } + CUDF_CUDA_TRY(cudaMemcpyAsync(d_src_and_dst_buffers.data(), + h_src_and_dst_buffers.data(), + src_bufs_size + dst_bufs_size, + cudaMemcpyDefault, + stream.value())); } - std::size_t const num_partitions = splits.size() + 1; - std::size_t const num_root_columns = input.num_columns(); + const rmm::cuda_stream_view stream; - // if inputs are empty, just return num_partitions empty tables - if (input.column(0).size() == 0) { - // sanitize the inputs (to handle corner cases like sliced tables) - std::vector> empty_columns; - empty_columns.reserve(input.num_columns()); - std::transform( - input.begin(), input.end(), std::back_inserter(empty_columns), [](column_view const& col) { - return cudf::empty_like(col); - }); - std::vector empty_column_views; - empty_column_views.reserve(input.num_columns()); - std::transform(empty_columns.begin(), - empty_columns.end(), - std::back_inserter(empty_column_views), - [](std::unique_ptr const& col) { return col->view(); }); - table_view empty_inputs(empty_column_views); + std::vector h_src_and_dst_buffers; + rmm::device_buffer d_src_and_dst_buffers; + std::size_t src_bufs_size; + std::size_t dst_bufs_size; + const uint8_t** h_src_bufs; + const uint8_t** d_src_bufs; + uint8_t** h_dst_bufs; + uint8_t** d_dst_bufs; +}; - // build the empty results - std::vector result; - result.reserve(num_partitions); - auto iter = thrust::make_counting_iterator(0); - std::transform(iter, - iter + num_partitions, - std::back_inserter(result), - [&empty_inputs](int partition_index) { - return packed_table{ - empty_inputs, - packed_columns{std::make_unique>(pack_metadata( - empty_inputs, static_cast(nullptr), 0)), - std::make_unique()}}; - }); +std::unique_ptr setup_src_and_dst_pointers( + cudf::table_view const& input, + std::size_t num_partitions, + cudf::size_type num_src_bufs, + int num_iterations, + std::vector& out_buffers, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto src_and_dst_pointers = std::make_unique( + input, num_partitions, num_src_bufs, num_iterations, stream, mr); - return result; - } + std::transform( + 
out_buffers.begin(), out_buffers.end(), src_and_dst_pointers->h_dst_bufs, [](auto& buf) { + return static_cast(buf.data()); + }); + + src_and_dst_pointers->copy_to_device(); - // compute # of source buffers (column data, validity, children), # of partitions - // and total # of buffers - size_type const num_src_bufs = count_src_bufs(input.begin(), input.end()); - std::size_t const num_bufs = num_src_bufs * num_partitions; - - // packed block of memory 1. split indices and src_buf_info structs - std::size_t const indices_size = - cudf::util::round_up_safe((num_partitions + 1) * sizeof(size_type), split_align); - std::size_t const src_buf_info_size = - cudf::util::round_up_safe(num_src_bufs * sizeof(src_buf_info), split_align); - // host-side - std::vector h_indices_and_source_info(indices_size + src_buf_info_size); - size_type* h_indices = reinterpret_cast(h_indices_and_source_info.data()); - src_buf_info* h_src_buf_info = - reinterpret_cast(h_indices_and_source_info.data() + indices_size); - // device-side - // gpu-only : stack space needed for nested list offset calculation - int const offset_stack_partition_size = compute_offset_stack_size(input.begin(), input.end()); - std::size_t const offset_stack_size = - offset_stack_partition_size * num_partitions * sizeof(size_type); - rmm::device_buffer d_indices_and_source_info(indices_size + src_buf_info_size + offset_stack_size, - stream, - rmm::mr::get_current_device_resource()); - auto* d_indices = reinterpret_cast(d_indices_and_source_info.data()); - src_buf_info* d_src_buf_info = reinterpret_cast( - reinterpret_cast(d_indices_and_source_info.data()) + indices_size); - size_type* d_offset_stack = - reinterpret_cast(reinterpret_cast(d_indices_and_source_info.data()) + - indices_size + src_buf_info_size); - - // compute splits -> indices. - h_indices[0] = 0; - h_indices[num_partitions] = input.column(0).size(); - std::copy(splits.begin(), splits.end(), std::next(h_indices)); - - // setup source buf info - setup_source_buf_info(input.begin(), input.end(), h_src_buf_info, h_src_buf_info, stream); - - // HtoD indices and source buf info to device - CUDF_CUDA_TRY(cudaMemcpyAsync( - d_indices, h_indices, indices_size + src_buf_info_size, cudaMemcpyDefault, stream.value())); - - // packed block of memory 2. 
partition buffer sizes and dst_buf_info structs - std::size_t const buf_sizes_size = - cudf::util::round_up_safe(num_partitions * sizeof(std::size_t), split_align); - std::size_t const dst_buf_info_size = - cudf::util::round_up_safe(num_bufs * sizeof(dst_buf_info), split_align); - // host-side - std::vector h_buf_sizes_and_dst_info(buf_sizes_size + dst_buf_info_size); - std::size_t* h_buf_sizes = reinterpret_cast(h_buf_sizes_and_dst_info.data()); - dst_buf_info* h_dst_buf_info = - reinterpret_cast(h_buf_sizes_and_dst_info.data() + buf_sizes_size); - // device-side - rmm::device_buffer d_buf_sizes_and_dst_info( - buf_sizes_size + dst_buf_info_size, stream, rmm::mr::get_current_device_resource()); - std::size_t* d_buf_sizes = reinterpret_cast(d_buf_sizes_and_dst_info.data()); - dst_buf_info* d_dst_buf_info = reinterpret_cast( - static_cast(d_buf_sizes_and_dst_info.data()) + buf_sizes_size); + return src_and_dst_pointers; +} + +std::unique_ptr compute_splits( + cudf::table_view const& input, + std::vector const& splits, + std::size_t num_partitions, + cudf::size_type num_src_bufs, + std::size_t num_bufs, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto partition_buf_size_and_dst_buf_info = + std::make_unique( + input, splits, num_partitions, num_src_bufs, num_bufs, stream, mr); + + auto d_dst_buf_info = partition_buf_size_and_dst_buf_info->d_dst_buf_info; + auto h_buf_sizes = partition_buf_size_and_dst_buf_info->h_buf_sizes; + auto d_buf_sizes = partition_buf_size_and_dst_buf_info->d_buf_sizes; + + auto split_indices_and_src_buf_info = + packed_split_indices_and_src_buf_info(input, splits, num_partitions, num_src_bufs, stream, mr); + + auto const d_src_buf_info = split_indices_and_src_buf_info.d_src_buf_info; + auto offset_stack_partition_size = split_indices_and_src_buf_info.offset_stack_partition_size; + auto d_offset_stack = split_indices_and_src_buf_info.d_offset_stack; + auto d_indices = split_indices_and_src_buf_info.d_indices; // compute sizes of each column in each partition, including alignment. 
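  // (illustrative) e.g. for a non-nullable INT32 column of 1,000 rows with splits = {500},
  // each of the two partitions gets num_elements = 500, so its buffer contributes
  // 500 * sizeof(int32_t) bytes, rounded up to split_align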
thrust::transform( - rmm::exec_policy(stream), + rmm::exec_policy(stream, mr), thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_bufs), d_dst_buf_info, - [num_src_bufs, - d_indices, - d_src_buf_info, + [d_src_buf_info, + offset_stack_partition_size, d_offset_stack, - offset_stack_partition_size] __device__(std::size_t t) { + d_indices, + num_src_bufs] __device__(std::size_t t) { int const split_index = t / num_src_bufs; int const src_buf_index = t % num_src_bufs; auto const& src_info = d_src_buf_info[src_buf_index]; @@ -1158,14 +1286,14 @@ std::vector contiguous_split(cudf::table_view const& input, }); // compute total size of each partition + // key is the split index { - // key is split index auto keys = cudf::detail::make_counting_transform_iterator( 0, split_key_functor{static_cast(num_src_bufs)}); auto values = cudf::detail::make_counting_transform_iterator(0, buf_size_functor{d_dst_buf_info}); - thrust::reduce_by_key(rmm::exec_policy(stream), + thrust::reduce_by_key(rmm::exec_policy(stream, mr), keys, keys + num_bufs, values, @@ -1173,14 +1301,14 @@ std::vector contiguous_split(cudf::table_view const& input, d_buf_sizes); } - // compute start offset for each output buffer + // compute start offset for each output buffer for each split { auto keys = cudf::detail::make_counting_transform_iterator( 0, split_key_functor{static_cast(num_src_bufs)}); auto values = cudf::detail::make_counting_transform_iterator(0, buf_size_functor{d_dst_buf_info}); - thrust::exclusive_scan_by_key(rmm::exec_policy(stream), + thrust::exclusive_scan_by_key(rmm::exec_policy(stream, mr), keys, keys + num_bufs, values, @@ -1188,86 +1316,672 @@ std::vector contiguous_split(cudf::table_view const& input, std::size_t{0}); } - // DtoH buf sizes and col info back to the host - CUDF_CUDA_TRY(cudaMemcpyAsync(h_buf_sizes, - d_buf_sizes, - buf_sizes_size + dst_buf_info_size, - cudaMemcpyDefault, - stream.value())); + partition_buf_size_and_dst_buf_info->copy_to_host(); + stream.synchronize(); - // allocate output partition buffers - std::vector out_buffers; - out_buffers.reserve(num_partitions); - std::transform(h_buf_sizes, - h_buf_sizes + num_partitions, - std::back_inserter(out_buffers), - [stream, mr](std::size_t bytes) { - return rmm::device_buffer{bytes, stream, mr}; - }); - - // packed block of memory 3. 
pointers to source and destination buffers (and stack space on the - // gpu for offset computation) - std::size_t const src_bufs_size = - cudf::util::round_up_safe(num_src_bufs * sizeof(uint8_t*), split_align); - std::size_t const dst_bufs_size = - cudf::util::round_up_safe(num_partitions * sizeof(uint8_t*), split_align); - // host-side - std::vector h_src_and_dst_buffers(src_bufs_size + dst_bufs_size); - uint8_t const** h_src_bufs = reinterpret_cast(h_src_and_dst_buffers.data()); - uint8_t** h_dst_bufs = reinterpret_cast(h_src_and_dst_buffers.data() + src_bufs_size); - // device-side - rmm::device_buffer d_src_and_dst_buffers(src_bufs_size + dst_bufs_size + offset_stack_size, - stream, - rmm::mr::get_current_device_resource()); - auto const** d_src_bufs = reinterpret_cast(d_src_and_dst_buffers.data()); - uint8_t** d_dst_bufs = reinterpret_cast( - reinterpret_cast(d_src_and_dst_buffers.data()) + src_bufs_size); - - // setup src buffers - setup_src_buf_data(input.begin(), input.end(), h_src_bufs); - - // setup dst buffers - std::transform(out_buffers.begin(), out_buffers.end(), h_dst_bufs, [](auto& buf) { - return static_cast(buf.data()); - }); + return partition_buf_size_and_dst_buf_info; +} +/** + * @brief Struct containing information about the actual batches we will send to the + * `copy_partitions` kernel and the number of iterations we need to carry out this copy. + * + * For the non-chunked contiguous_split case, this contains the batched dst_buf_infos and the + * number of iterations are going to be 1, since the non-chunked case is single pass. + * + * For the chunked_pack case, this also contains the batched dst_buf_infos for all + * iterations in addition to helping keep the state about what batches have been copied so far + * and what are the sizes (in bytes) of each iteration. + */ +struct chunk_iteration_state { + chunk_iteration_state(rmm::device_uvector _d_batched_dst_buf_info, + rmm::device_uvector _d_batch_offsets, + std::vector _h_num_buffs_per_iteration, + std::vector _h_size_of_buffs_per_iteration, + std::size_t total_size) + : num_iterations(_h_num_buffs_per_iteration.size()), + current_iteration(0), + starting_buff(0), + d_batched_dst_buf_info(std::move(_d_batched_dst_buf_info)), + d_batch_offsets(std::move(_d_batch_offsets)), + h_num_buffs_per_iteration(std::move(_h_num_buffs_per_iteration)), + h_size_of_buffs_per_iteration(std::move(_h_size_of_buffs_per_iteration)), + total_size(total_size) + { + } + + std::pair get_current_starting_index_and_buff_count() const + { + CUDF_EXPECTS(current_iteration < num_iterations, + "current_iteration cannot exceed num_iterations"); + auto count_for_current = h_num_buffs_per_iteration[current_iteration]; + return std::make_pair(starting_buff, count_for_current); + } + + std::size_t advance_iteration() + { + CUDF_EXPECTS(current_iteration < num_iterations, + "current_iteration cannot exceed num_iterations"); + std::size_t bytes_copied = h_size_of_buffs_per_iteration[current_iteration]; + starting_buff += h_num_buffs_per_iteration[current_iteration]; + ++current_iteration; + return bytes_copied; + } - // HtoD src and dest buffers - CUDF_CUDA_TRY(cudaMemcpyAsync( - d_src_bufs, h_src_bufs, src_bufs_size + dst_bufs_size, cudaMemcpyDefault, stream.value())); + bool has_more_copies() const { return current_iteration < num_iterations; } - // perform the copy. 
- copy_data(num_bufs, num_src_bufs, d_src_bufs, d_dst_bufs, d_dst_buf_info, stream); + rmm::device_uvector d_batched_dst_buf_info; + rmm::device_uvector d_batch_offsets; + std::size_t total_size; + int num_iterations; + int current_iteration; - // DtoH dst info (to retrieve null counts) - CUDF_CUDA_TRY(cudaMemcpyAsync( - h_dst_buf_info, d_dst_buf_info, dst_buf_info_size, cudaMemcpyDefault, stream.value())); + private: + std::size_t starting_buff; + std::vector h_num_buffs_per_iteration; + std::vector h_size_of_buffs_per_iteration; +}; - stream.synchronize(); +std::unique_ptr make_chunk_iteration_state( + rmm::device_uvector> const& batches, + int num_bufs, + dst_buf_info* d_orig_dst_buf_info, + std::size_t const* const h_buf_sizes, + std::size_t num_partitions, + std::size_t user_buffer_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + rmm::device_uvector d_batch_offsets(num_bufs + 1, stream, mr); + + auto buf_count_iter = cudf::detail::make_counting_transform_iterator( + 0, [num_bufs, num_batches = num_batches_func{batches.begin()}] __device__(size_type i) { + return i == num_bufs ? 0 : num_batches(i); + }); + + thrust::exclusive_scan(rmm::exec_policy(stream, mr), + buf_count_iter, + buf_count_iter + num_bufs + 1, + d_batch_offsets.begin(), + 0); + + auto const num_batches_iter = + cudf::detail::make_counting_transform_iterator(0, num_batches_func{batches.begin()}); + size_type const num_batches = thrust::reduce( + rmm::exec_policy(stream, mr), num_batches_iter, num_batches_iter + batches.size()); + + auto out_to_in_index = out_to_in_index_function{d_batch_offsets.begin(), num_bufs}; + + auto iter = thrust::make_counting_iterator(0); + + // load up the batches as d_dst_buf_info + rmm::device_uvector d_batched_dst_buf_info(num_batches, stream, mr); + + thrust::for_each( + rmm::exec_policy(stream, mr), + iter, + iter + num_batches, + [d_orig_dst_buf_info, + d_batched_dst_buf_info = d_batched_dst_buf_info.begin(), + batches = batches.begin(), + d_batch_offsets = d_batch_offsets.begin(), + out_to_in_index] __device__(size_type i) { + size_type const in_buf_index = out_to_in_index(i); + size_type const batch_index = i - d_batch_offsets[in_buf_index]; + auto const batch_size = thrust::get<1>(batches[in_buf_index]); + dst_buf_info const& in = d_orig_dst_buf_info[in_buf_index]; + + // adjust info + dst_buf_info& out = d_batched_dst_buf_info[i]; + out.element_size = in.element_size; + out.value_shift = in.value_shift; + out.bit_shift = in.bit_shift; + out.valid_count = + in.valid_count; // valid count will be set to 1 if this is a validity buffer + out.src_buf_index = in.src_buf_index; + out.dst_buf_index = in.dst_buf_index; + + size_type const elements_per_batch = + out.element_size == 0 ? 0 : batch_size / out.element_size; + out.num_elements = ((batch_index + 1) * elements_per_batch) > in.num_elements + ? in.num_elements - (batch_index * elements_per_batch) + : elements_per_batch; + + size_type const rows_per_batch = + // if this is a validity buffer, each element is a bitmask_type, which + // corresponds to 32 rows. + out.valid_count > 0 + ? elements_per_batch * static_cast(cudf::detail::size_in_bits()) + : elements_per_batch; + out.num_rows = ((batch_index + 1) * rows_per_batch) > in.num_rows + ? 
in.num_rows - (batch_index * rows_per_batch) + : rows_per_batch; + + out.src_element_index = in.src_element_index + (batch_index * elements_per_batch); + out.dst_offset = in.dst_offset + (batch_index * batch_size); + + // out.bytes and out.buf_size are unneeded here because they are only used to + // calculate real output buffer sizes. the data we are generating here is + // purely intermediate for the purposes of doing more uniform copying of data + // underneath the final structure of the output + }); + + if (user_buffer_size != 0) { + // copy the batch offsets back to host + std::vector h_offsets(num_batches + 1); + { + rmm::device_uvector offsets(h_offsets.size(), stream, mr); + auto batch_byte_size_iter = cudf::detail::make_counting_transform_iterator( + 0, batch_byte_size_function{num_batches, d_batched_dst_buf_info.begin()}); + + thrust::exclusive_scan(rmm::exec_policy(stream, mr), + batch_byte_size_iter, + batch_byte_size_iter + num_batches + 1, + offsets.begin()); + + CUDF_CUDA_TRY(cudaMemcpyAsync(h_offsets.data(), + offsets.data(), + sizeof(std::size_t) * offsets.size(), + cudaMemcpyDefault, + stream.value())); + + // the next part is working on the CPU, so we want to synchronize here + stream.synchronize(); + } + + std::vector num_batches_per_iteration; + std::vector size_of_batches_per_iteration; + std::vector accum_size_per_iteration; + std::size_t accum_size = 0; + { + auto current_offset_it = h_offsets.begin(); + // figure out how many iterations we need, while fitting batches to iterations + // with no more than user_buffer_size bytes worth of batches + while (current_offset_it != h_offsets.end()) { + // next_iteration_it points to the batch right above the boundary (the batch + // that didn't fit). + auto next_iteration_it = + std::lower_bound(current_offset_it, + h_offsets.end(), + // We add the cumulative size + 1 because we want to find what would fit + // within a bounce buffer of user_buffer_size (up to user_buffer_size). + // Since h_offsets is a prefix scan, we add the size we accumulated so + // far so we are looking for the next user_buffer_sized boundary. + user_buffer_size + accum_size + 1); + + // we subtract 1 from the number of batch here because next_iteration_it points + // to the batch that didn't fit, so it's one off. 
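+        // (illustrative) with h_offsets = {0, 4MB, 9MB, 13MB} and user_buffer_size = 10MB,
+        // the first search is for 10MB + 0 + 1: lower_bound lands on 13MB, so two batches
+        // (4MB and 5MB) fit in this iteration and the batch ending at 13MB starts the next one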
+ auto batches_in_iter = std::distance(current_offset_it, next_iteration_it) - 1; + + // to get the amount of bytes in this iteration we get the prefix scan size + // and subtract the cumulative size so far, leaving the bytes belonging to this + // iteration + auto iter_size_bytes = *(current_offset_it + batches_in_iter) - accum_size; + accum_size += iter_size_bytes; + + num_batches_per_iteration.push_back(batches_in_iter); + size_of_batches_per_iteration.push_back(iter_size_bytes); + accum_size_per_iteration.push_back(accum_size); + + if (next_iteration_it == h_offsets.end()) { + break; + } + + current_offset_it += batches_in_iter; + } + } + + // apply changed offset + { + rmm::device_uvector d_accum_size_per_iteration( + accum_size_per_iteration.size(), stream, mr); + + CUDF_CUDA_TRY(cudaMemcpyAsync(d_accum_size_per_iteration.data(), + accum_size_per_iteration.data(), + accum_size_per_iteration.size() * sizeof(std::size_t), + cudaMemcpyDefault, + stream.value())); + + // we want to update the offset of batches for every iteration, except the first one (because + // offsets in the first iteration are all 0 based) + auto num_batches_in_first_iteration = num_batches_per_iteration[0]; + auto iter = thrust::make_counting_iterator(num_batches_in_first_iteration); + auto num_iterations = accum_size_per_iteration.size(); + thrust::for_each( + rmm::exec_policy(stream, mr), + iter, + iter + num_batches - num_batches_in_first_iteration, + [num_iterations, + d_batched_dst_buf_info = d_batched_dst_buf_info.begin(), + d_accum_size_per_iteration = d_accum_size_per_iteration.begin()] __device__(size_type i) { + auto prior_iteration_size = + thrust::upper_bound(thrust::seq, + d_accum_size_per_iteration, + d_accum_size_per_iteration + num_iterations, + d_batched_dst_buf_info[i].dst_offset) - + 1; + d_batched_dst_buf_info[i].dst_offset -= *prior_iteration_size; + }); + } + return std::make_unique(std::move(d_batched_dst_buf_info), + std::move(d_batch_offsets), + std::move(num_batches_per_iteration), + std::move(size_of_batches_per_iteration), + accum_size); + + } else { + // we instantiate an "iteration state" for the regular single pass contiguous_split + // consisting of 1 iteration with all of the batches and totalling `total_size` bytes. + auto total_size = std::reduce(h_buf_sizes, h_buf_sizes + num_partitions); + + // 1 iteration with the whole size + return std::make_unique( + std::move(d_batched_dst_buf_info), + std::move(d_batch_offsets), + std::move(std::vector{static_cast(num_batches)}), + std::move(std::vector{total_size}), + total_size); + } +} - // build the output. - std::vector result; - result.reserve(num_partitions); - std::vector cols; - cols.reserve(num_root_columns); - auto cur_dst_buf_info = h_dst_buf_info; - for (std::size_t idx = 0; idx < num_partitions; idx++) { - // traverse the buffers and build the columns. 
-    cur_dst_buf_info = build_output_columns(
-      input.begin(), input.end(), cur_dst_buf_info, std::back_inserter(cols), h_dst_bufs[idx]);
-
-    // pack the columns
-    cudf::table_view t{cols};
-    result.push_back(packed_table{
-      t,
-      packed_columns{
-        std::make_unique<std::vector<uint8_t>>(cudf::pack_metadata(
-          t, reinterpret_cast<uint8_t const*>(out_buffers[idx].data()), out_buffers[idx].size())),
-        std::make_unique<rmm::device_buffer>(std::move(out_buffers[idx]))}});
-
-    cols.clear();
+void copy_data(int num_batches_to_copy,
+               int starting_batch,
+               uint8_t const** d_src_bufs,
+               uint8_t** d_dst_bufs,
+               rmm::device_uvector<dst_buf_info>& d_dst_buf_info,
+               uint8_t* user_buffer,
+               rmm::cuda_stream_view stream)
+{
+  constexpr size_type block_size = 256;
+  if (user_buffer != nullptr) {
+    copy_partitions<block_size><<<num_batches_to_copy, block_size, 0, stream.value()>>>(
+      d_src_bufs, user_buffer, d_dst_buf_info.data() + starting_batch);
+  } else {
+    copy_partitions<block_size><<<num_batches_to_copy, block_size, 0, stream.value()>>>(
+      d_src_bufs, d_dst_bufs, d_dst_buf_info.data() + starting_batch);
   }
-  return result;
+}
+
+/**
+ * @brief Function that checks an input table_view and splits for specific edge cases.
+ *
+ * It will return true if the input is "empty" (no rows or columns), which means
+ * special handling has to happen in the calling code.
+ *
+ * @param input table_view of source table to be split
+ * @param splits the splits specified by the user, or an empty vector if no splits
+ * @returns true if the input is empty, false otherwise
+ */
+bool check_inputs(cudf::table_view const& input, std::vector<size_type> const& splits)
+{
+  if (input.num_columns() == 0) { return true; }
+  if (splits.size() > 0) {
+    CUDF_EXPECTS(splits.back() <= input.column(0).size(),
+                 "splits can't exceed size of input columns");
+  }
+  {
+    size_type begin = 0;
+    for (std::size_t i = 0; i < splits.size(); i++) {
+      size_type end = splits[i];
+      CUDF_EXPECTS(begin >= 0, "Starting index cannot be negative.");
+      CUDF_EXPECTS(end >= begin, "End index cannot be smaller than the starting index.");
+      CUDF_EXPECTS(end <= input.column(0).size(), "Slice range out of bounds.");
+      begin = end;
+    }
+  }
+  return input.column(0).size() == 0;
+}
+
+/**
+ * @brief A helper struct containing the state of contiguous_split, used whether the
+ * caller performs a single-pass contiguous_split or a chunked_pack.
+ *
+ * It exposes an iterator-like pattern where contiguous_split_state::has_next()
+ * returns true when there is work to be done, and false otherwise.
+ *
+ * contiguous_split_state::contiguous_split() performs a single-pass contiguous_split
+ * and is only valid if contiguous_split_state is instantiated with 0 for the user_buffer_size.
+ *
+ * contiguous_split_state::contiguous_split_chunk(device_span) is only valid when
+ * user_buffer_size > 0. It should be called as long as has_next() returns true. The
+ * device_span passed to contiguous_split_chunk must be allocated in stream `stream` by
+ * the user.
+ *
+ * None of the methods are thread safe.
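+ *
+ * A minimal usage sketch of the chunked path (illustrative only; `user_buffer_span` is
+ * assumed to be a caller-allocated device_span of exactly user_buffer_size bytes):
+ *
+ *   contiguous_split_state state(input, user_buffer_size, stream, mr);
+ *   while (state.has_next()) {
+ *     auto bytes_copied = state.contiguous_split_chunk(user_buffer_span);
+ *     // consume the first bytes_copied bytes of user_buffer_span before the next call
+ *   }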
+ */ +struct contiguous_split_state { + static const std::size_t desired_batch_size = 1 * 1024 * 1024; + + contiguous_split_state(cudf::table_view const& input, + std::size_t user_buffer_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : contiguous_split_state(input, {}, user_buffer_size, stream, mr) + { + } + + contiguous_split_state(cudf::table_view const& input, + std::vector const& splits, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : contiguous_split_state(input, splits, 0, stream, mr) + { + } + + contiguous_split_state(cudf::table_view const& input, + std::vector const& splits, + std::size_t user_buffer_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : input(input), user_buffer_size(user_buffer_size), stream(stream), mr(mr) + { + is_empty = check_inputs(input, splits); + num_partitions = splits.size() + 1; + num_src_bufs = count_src_bufs(input.begin(), input.end()); + num_bufs = num_src_bufs * num_partitions; + + // if the table we are about to contig split is empty, we have special + // handling where metadata is produced and a 0-byte contiguous buffer + // is the result. + if (is_empty) { return; } + + partition_buf_size_and_dst_buf_info = + std::move(compute_splits(input, splits, num_partitions, num_src_bufs, num_bufs, stream, mr)); + + compute_batches(); + + // allocate output partition buffers, in the non-chunked case + if (user_buffer_size == 0) { + out_buffers.reserve(num_partitions); + auto h_buf_sizes = partition_buf_size_and_dst_buf_info->h_buf_sizes; + std::transform(h_buf_sizes, + h_buf_sizes + num_partitions, + std::back_inserter(out_buffers), + [stream = stream, mr = mr](std::size_t bytes) { + return rmm::device_buffer{bytes, stream, mr}; + }); + } + + src_and_dst_pointers = std::move(setup_src_and_dst_pointers(input, + num_partitions, + num_src_bufs, + chunk_iter_state->num_iterations, + out_buffers, + stream, + mr)); + } + + bool has_next() const { return !is_empty && chunk_iter_state->has_more_copies(); } + + std::size_t get_total_contiguous_size() const + { + return is_empty ? 0 : chunk_iter_state->total_size; + } + + void compute_batches() + { + // Since we parallelize at one block per copy, we are vulnerable to situations where we + // have small numbers of copies to do (a combination of small numbers of splits and/or columns), + // so we will take the actual set of outgoing source/destination buffers and further partition + // them into much smaller batches in order to drive up the number of blocks and overall + // occupancy. + rmm::device_uvector> batches(num_bufs, stream, mr); + auto d_dst_buf_info = partition_buf_size_and_dst_buf_info->d_dst_buf_info; + auto h_buf_sizes = partition_buf_size_and_dst_buf_info->h_buf_sizes; + auto desired_batch_size = contiguous_split_state::desired_batch_size; + thrust::transform( + rmm::exec_policy(stream, mr), + d_dst_buf_info, + d_dst_buf_info + num_bufs, + batches.begin(), + [desired_batch_size] __device__( + dst_buf_info const& buf) -> thrust::pair { + // Total bytes for this incoming partition + std::size_t const bytes = + static_cast(buf.num_elements) * static_cast(buf.element_size); + + // This clause handles nested data types (e.g. list or string) that store no data in the row + // columns, only in their children. 
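+          // (returning a single zero-sized batch keeps every source buffer represented
+          // in the batched structure)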
+ if (bytes == 0) { return {1, 0}; } + + // The number of batches we want to subdivide this buffer into + std::size_t const num_batches = std::max( + std::size_t{1}, util::round_up_unsafe(bytes, desired_batch_size) / desired_batch_size); + + // NOTE: leaving batch size as a separate parameter for future tuning + // possibilities, even though in the current implementation it will be a + // constant. + return {num_batches, desired_batch_size}; + }); + + chunk_iter_state = make_chunk_iteration_state( + batches, num_bufs, d_dst_buf_info, h_buf_sizes, num_partitions, user_buffer_size, stream, mr); + } + + std::vector contiguous_split() + { + CUDF_EXPECTS(user_buffer_size == 0, "Cannot contiguous split with a user buffer"); + if (is_empty || input.num_columns() == 0) { return make_packed_tables(); } + + std::size_t num_batches_total; + std::tie(std::ignore, num_batches_total) = + chunk_iter_state->get_current_starting_index_and_buff_count(); + + // perform the copy. + copy_data(num_batches_total, + 0 /* starting at buffer for single-shot 0*/, + src_and_dst_pointers->d_src_bufs, + src_and_dst_pointers->d_dst_bufs, + chunk_iter_state->d_batched_dst_buf_info, + nullptr, + stream); + + // these "orig" dst_buf_info pointers describe the prior-to-batching destination + // buffers per partition + auto d_orig_dst_buf_info = partition_buf_size_and_dst_buf_info->d_dst_buf_info; + auto h_orig_dst_buf_info = partition_buf_size_and_dst_buf_info->h_dst_buf_info; + + // postprocess valid_counts: apply the valid counts computed by copy_data for each + // batch back to the original dst_buf_infos + auto keys = cudf::detail::make_counting_transform_iterator( + 0, out_to_in_index_function{chunk_iter_state->d_batch_offsets.begin(), (int)num_bufs}); + + auto values = thrust::make_transform_iterator( + chunk_iter_state->d_batched_dst_buf_info.begin(), + [] __device__(dst_buf_info const& info) { return info.valid_count; }); + + thrust::reduce_by_key(rmm::exec_policy(stream, mr), + keys, + keys + num_batches_total, + values, + thrust::make_discard_iterator(), + dst_valid_count_output_iterator{d_orig_dst_buf_info}); + + CUDF_CUDA_TRY(cudaMemcpyAsync(h_orig_dst_buf_info, + d_orig_dst_buf_info, + partition_buf_size_and_dst_buf_info->dst_buf_info_size, + cudaMemcpyDefault, + stream.value())); + + stream.synchronize(); + + // not necessary for the non-chunked case, but it makes it so further calls to has_next + // return false, just in case + chunk_iter_state->advance_iteration(); + + return make_packed_tables(); + } + + cudf::size_type contiguous_split_chunk(cudf::device_span const& user_buffer) + { + CUDF_FUNC_RANGE() + CUDF_EXPECTS( + user_buffer.size() == user_buffer_size, + "Cannot use a device span smaller than the output buffer size configured at instantiation!"); + CUDF_EXPECTS(has_next(), "Cannot call contiguous_split_chunk with has_next() == false!"); + + std::size_t starting_batch, num_batches_to_copy; + std::tie(starting_batch, num_batches_to_copy) = + chunk_iter_state->get_current_starting_index_and_buff_count(); + + // perform the copy. 
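+    // note that every iteration writes at the beginning of user_buffer: the per-batch
+    // dst_offsets were rebased to be iteration-relative when the chunk iteration state
+    // was built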
+ copy_data(num_batches_to_copy, + starting_batch, + src_and_dst_pointers->d_src_bufs, + src_and_dst_pointers->d_dst_bufs, + chunk_iter_state->d_batched_dst_buf_info, + user_buffer.data(), + stream); + + // We do not need to post-process null counts since the null count info is + // taken from the source table in the contiguous_split_chunk case (no splits) + return chunk_iter_state->advance_iteration(); + } + + std::vector make_empty_packed_table() + { + // sanitize the inputs (to handle corner cases like sliced tables) + std::vector> empty_columns; + empty_columns.reserve(input.num_columns()); + std::transform( + input.begin(), input.end(), std::back_inserter(empty_columns), [](column_view const& col) { + return cudf::empty_like(col); + }); + std::vector empty_column_views; + empty_column_views.reserve(input.num_columns()); + std::transform(empty_columns.begin(), + empty_columns.end(), + std::back_inserter(empty_column_views), + [](std::unique_ptr const& col) { return col->view(); }); + table_view empty_inputs(empty_column_views); + + // build the empty results + std::vector result; + result.reserve(num_partitions); + auto iter = thrust::make_counting_iterator(0); + std::transform(iter, + iter + num_partitions, + std::back_inserter(result), + [&empty_inputs](int partition_index) { + return packed_table{ + empty_inputs, + packed_columns{std::make_unique>(pack_metadata( + empty_inputs, static_cast(nullptr), 0)), + std::make_unique()}}; + }); + + return result; + } + + std::unique_ptr> build_packed_column_metadata() + { + CUDF_EXPECTS(num_partitions == 1, "build_packed_column_metadata supported only without splits"); + + if (input.num_columns() == 0) { return std::unique_ptr>(); } + + if (is_empty) { + // this is a bit ugly, but it was done to re-use make_empty_packed_table between the + // regular contiguous_split and chunked_pack cases. + auto empty_packed_tables = std::move(make_empty_packed_table()[0]); + return std::move(empty_packed_tables.data.metadata); + } + + auto& h_dst_buf_info = partition_buf_size_and_dst_buf_info->h_dst_buf_info; + auto cur_dst_buf_info = h_dst_buf_info; + metadata_builder mb{input.num_columns()}; + + populate_metadata(input.begin(), input.end(), cur_dst_buf_info, mb); + + return std::make_unique>(std::move(mb.build())); + } + + std::vector make_packed_tables() + { + if (input.num_columns() == 0) { return std::vector(); } + if (is_empty) { return make_empty_packed_table(); } + std::vector result; + result.reserve(num_partitions); + std::vector cols; + cols.reserve(input.num_columns()); + + auto& h_dst_buf_info = partition_buf_size_and_dst_buf_info->h_dst_buf_info; + auto& h_dst_bufs = src_and_dst_pointers->h_dst_bufs; + + auto cur_dst_buf_info = h_dst_buf_info; + for (std::size_t idx = 0; idx < num_partitions; idx++) { + // traverse the buffers and build the columns. 
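+      // a fresh metadata_builder per partition keeps each partition's metadata blob
+      // self-contained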
+      metadata_builder mb(input.num_columns());
+      cur_dst_buf_info = cudf::build_output_columns(input.begin(),
+                                                    input.end(),
+                                                    cur_dst_buf_info,
+                                                    std::back_inserter(cols),
+                                                    h_dst_bufs[idx],
+                                                    mb);
+
+      // pack the columns
+      cudf::table_view t{cols};
+      result.push_back(packed_table{
+        t,
+        packed_columns{std::make_unique<std::vector<uint8_t>>(mb.build()),
+                       std::make_unique<rmm::device_buffer>(std::move(out_buffers[idx]))}});
+
+      cols.clear();
+    }
+
+    return result;
+  }
+
+  cudf::table_view const input;
+  rmm::cuda_stream_view stream;
+  rmm::mr::device_memory_resource* mr;
+
+  std::size_t num_partitions;
+
+  // number of source buffers including children * number of splits
+  std::size_t num_bufs;
+
+  // number of source buffers including children
+  size_type num_src_bufs;
+
+  std::unique_ptr<packed_partition_buf_size_and_dst_buf_info> partition_buf_size_and_dst_buf_info;
+
+  std::unique_ptr<packed_src_and_dst_pointers> src_and_dst_pointers;
+
+  // whether the table was empty to begin with (0 rows or 0 columns) and should be metadata-only
+  bool is_empty;
+
+  //
+  // State around the chunked pattern
+  //
+
+  // chunked_pack will have 1 or more "chunks" to iterate on, defined in chunk_iter_state.
+  // contiguous_split will have a single "chunk" in chunk_iter_state, so no iteration.
+  std::unique_ptr<chunk_iteration_state> chunk_iter_state;
+
+  // Two modes are allowed:
+  //  - user provided buffer: as the name implies, the user has provided a buffer that must be at
+  //    least 1MB.
+  //    contiguous_split will behave in a "chunked" mode in this scenario, as it will contiguously
+  //    copy up until the user's buffer size limit, exposing a next() call for the user to invoke.
+  //    Note that in this mode, contig split is not partitioning the original table, it is instead
+  //    only placing cuDF buffers contiguously in the user's bounce buffer.
+  //
+  //  - single shot contiguous_split (default): when the user doesn't provide their own buffer,
+  //    contiguous_split will allocate a buffer per partition and will place contiguous results in
+  //    each buffer.
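+  //
+  //    As a rough illustration (hypothetical sizes): packing a 1GB table through a 128MB
+  //    user buffer takes 8 next() calls, while a single shot contiguous_split of the same
+  //    table with 3 splits allocates 4 output buffers, one per partition.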
+ // + std::vector out_buffers; + + std::size_t user_buffer_size; +}; + +std::vector contiguous_split(cudf::table_view const& input, + std::vector const& splits, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto state = contiguous_split_state(input, splits, stream, mr); + return state.contiguous_split(); } }; // namespace detail @@ -1280,4 +1994,41 @@ std::vector contiguous_split(cudf::table_view const& input, return detail::contiguous_split(input, splits, cudf::get_default_stream(), mr); } +chunked_pack::chunked_pack(cudf::table_view const& input, + std::size_t user_buffer_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + state = std::make_unique(input, user_buffer_size, stream, mr); +} + +// required for the unique_ptr to work with a non-complete type (contiguous_split_state) +chunked_pack::~chunked_pack() = default; + +std::size_t chunked_pack::get_total_contiguous_size() const +{ + return state->get_total_contiguous_size(); +} + +bool chunked_pack::has_next() const { return state->has_next(); } + +std::size_t chunked_pack::next(cudf::device_span const& user_buffer) +{ + return state->contiguous_split_chunk(user_buffer); +} + +std::unique_ptr> chunked_pack::build_metadata() const +{ + return state->build_packed_column_metadata(); +} + +std::unique_ptr make_chunked_pack(cudf::table_view const& input, + std::size_t user_buffer_size, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(user_buffer_size >= detail::contiguous_split_state::desired_batch_size, + "The output buffer size must be at least 1MB in size"); + return std::make_unique(input, user_buffer_size, cudf::get_default_stream(), mr); +} + }; // namespace cudf diff --git a/cpp/src/copying/pack.cpp b/cpp/src/copying/pack.cpp index bac9aac1886..02d96b62639 100644 --- a/cpp/src/copying/pack.cpp +++ b/cpp/src/copying/pack.cpp @@ -235,6 +235,8 @@ metadata_builder::metadata_builder(size_type const num_root_columns) impl->add_column_info_to_meta(data_type{type_id::EMPTY}, num_root_columns, 0, -1, -1, 0); } +metadata_builder::~metadata_builder() = default; + void metadata_builder::add_column_info_to_meta(data_type const col_type, size_type const col_size, size_type const col_null_count, diff --git a/cpp/tests/copying/split_tests.cpp b/cpp/tests/copying/split_tests.cpp index 5d47a123c58..51c42bd7cf0 100644 --- a/cpp/tests/copying/split_tests.cpp +++ b/cpp/tests/copying/split_tests.cpp @@ -31,6 +31,7 @@ #include #include +#include #include #include @@ -450,10 +451,8 @@ void split_end_to_size(SplitFunc Split, CompareFunc Compare) } template -void split_empty_table(SplitFunc Split) +void split_empty_table(SplitFunc Split, std::vector const& splits = {2, 5, 6}) { - std::vector splits{2, 5, 9}; - cudf::table src_table{}; auto result = Split(src_table, splits); @@ -530,7 +529,9 @@ void split_negative_value(SplitFunc Split) } template -void split_empty_output_column_value(SplitFunc Split, CompareFunc Compare) +void split_empty_output_column_value(SplitFunc Split, + CompareFunc Compare, + std::vector const& splits = {0, 2, 2}) { cudf::size_type start = 0; cudf::size_type col_size = 10; @@ -540,8 +541,6 @@ void split_empty_output_column_value(SplitFunc Split, CompareFunc Compare) cudf::size_type num_cols = 5; cudf::table src_table = create_fixed_table(num_cols, start, col_size, valids); - std::vector splits{0, 2, 2}; - EXPECT_NO_THROW(Split(src_table, splits)); auto result = Split(src_table, splits); @@ -622,7 +621,9 @@ TEST_F(SplitTableCornerCases, EmptyOutputColumn) } template 
-void split_string_with_invalids(SplitFunc Split, CompareFunc Compare) +void split_string_with_invalids(SplitFunc Split, + CompareFunc Compare, + std::vector splits = {2, 5, 9}) { auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); @@ -638,8 +639,6 @@ void split_string_with_invalids(SplitFunc Split, CompareFunc Compare) scols.push_back(sw[1].release()); cudf::table src_table(std::move(scols)); - std::vector splits{2, 5, 9}; - std::vector expected = create_expected_string_tables_for_splits(strings, splits, true); @@ -653,7 +652,9 @@ void split_string_with_invalids(SplitFunc Split, CompareFunc Compare) } template -void split_empty_output_strings_column_value(SplitFunc Split, CompareFunc Compare) +void split_empty_output_strings_column_value(SplitFunc Split, + CompareFunc Compare, + std::vector const& splits = {0, 2, 2}) { auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); @@ -671,8 +672,6 @@ void split_empty_output_strings_column_value(SplitFunc Split, CompareFunc Compar cudf::size_type num_cols = 2; - std::vector splits{0, 2, 2}; - EXPECT_NO_THROW(Split(src_table, splits)); auto result = Split(src_table, splits); @@ -759,7 +758,7 @@ struct SplitNestedTypesTest : public cudf::test::BaseFixture {}; // common functions for testing split/contiguous_split template -void split_lists(SplitFunc Split, CompareFunc Compare) +void split_lists(SplitFunc Split, CompareFunc Compare, bool split = true) { using LCW = cudf::test::lists_column_wrapper; @@ -775,22 +774,28 @@ void split_lists(SplitFunc Split, CompareFunc Compare) {-10}, {-100, -200}}; - std::vector splits{0, 1, 4, 5, 6, 9}; - - std::vector> expected; - expected.push_back(LCW{}); - expected.push_back(LCW{{1, 2, 3}}); - expected.push_back(LCW{{4, 5}, {6}, {7, 8}}); - expected.push_back(LCW{{9, 10, 11}}); - expected.push_back(LCW{LCW{}}); - expected.push_back(LCW{LCW{}, {-1, -2, -3, -4, -5}, {-10}}); - expected.push_back(LCW{{-100, -200}}); - - auto result = Split(list, splits); - EXPECT_EQ(expected.size(), result.size()); - - for (unsigned long index = 0; index < result.size(); index++) { - Compare(expected[index], result[index]); + if (split) { + std::vector splits{0, 1, 4, 5, 6, 9}; + + std::vector> expected; + expected.push_back(LCW{}); + expected.push_back(LCW{{1, 2, 3}}); + expected.push_back(LCW{{4, 5}, {6}, {7, 8}}); + expected.push_back(LCW{{9, 10, 11}}); + expected.push_back(LCW{LCW{}}); + expected.push_back(LCW{LCW{}, {-1, -2, -3, -4, -5}, {-10}}); + expected.push_back(LCW{{-100, -200}}); + + auto result = Split(list, splits); + EXPECT_EQ(expected.size(), result.size()); + + for (unsigned long index = 0; index < result.size(); index++) { + Compare(expected[index], result[index]); + } + } else { + auto result = Split(list, {}); + EXPECT_EQ(1, result.size()); + Compare(list, result[0]); } } @@ -803,25 +808,31 @@ void split_lists(SplitFunc Split, CompareFunc Compare) {LCW{}}, {{-10}, {-100, -200}}}; - std::vector splits{1, 3, 4}; - - std::vector> expected; - expected.push_back(LCW{{{1, 2, 3}, {4, 5}}}); - expected.push_back(LCW{{LCW{}, LCW{}, {7, 8}, LCW{}}, {LCW{6}}}); - expected.push_back(LCW{{{7, 8}, {9, 10, 11}, LCW{}}}); - expected.push_back(LCW{{LCW{}, {-1, -2, -3, -4, -5}}, {LCW{}}, {{-10}, {-100, -200}}}); - - auto result = Split(list, splits); - EXPECT_EQ(expected.size(), result.size()); - - for (unsigned long index = 0; index < result.size(); index++) { - Compare(expected[index], result[index]); + if (split) { + std::vector splits{1, 
3, 4}; + + std::vector> expected; + expected.push_back(LCW{{{1, 2, 3}, {4, 5}}}); + expected.push_back(LCW{{LCW{}, LCW{}, {7, 8}, LCW{}}, {LCW{6}}}); + expected.push_back(LCW{{{7, 8}, {9, 10, 11}, LCW{}}}); + expected.push_back(LCW{{LCW{}, {-1, -2, -3, -4, -5}}, {LCW{}}, {{-10}, {-100, -200}}}); + + auto result = Split(list, splits); + EXPECT_EQ(expected.size(), result.size()); + + for (unsigned long index = 0; index < result.size(); index++) { + Compare(expected[index], result[index]); + } + } else { + auto result = Split(list, {}); + EXPECT_EQ(1, result.size()); + Compare(list, result[0]); } } } template -void split_lists_with_nulls(SplitFunc Split, CompareFunc Compare) +void split_lists_with_nulls(SplitFunc Split, CompareFunc Compare, bool split = true) { using LCW = cudf::test::lists_column_wrapper; @@ -840,22 +851,28 @@ void split_lists_with_nulls(SplitFunc Split, CompareFunc Compare) {-10}, {{-100, -200}, valids}}; - std::vector splits{0, 1, 4, 5, 6, 9}; - - std::vector> expected; - expected.push_back(LCW{}); - expected.push_back(LCW{{1, 2, 3}}); - expected.push_back(LCW{{4, 5}, {6}, {{7, 8}, valids}}); - expected.push_back(LCW{{9, 10, 11}}); - expected.push_back(LCW{LCW{}}); - expected.push_back(LCW{LCW{}, {{-1, -2, -3, -4, -5}, valids}, {-10}}); - expected.push_back(LCW{{{-100, -200}, valids}}); - - auto result = Split(list, splits); - EXPECT_EQ(expected.size(), result.size()); - - for (unsigned long index = 0; index < result.size(); index++) { - Compare(expected[index], result[index]); + if (split) { + std::vector splits{0, 1, 4, 5, 6, 9}; + + std::vector> expected; + expected.push_back(LCW{}); + expected.push_back(LCW{{1, 2, 3}}); + expected.push_back(LCW{{4, 5}, {6}, {{7, 8}, valids}}); + expected.push_back(LCW{{9, 10, 11}}); + expected.push_back(LCW{LCW{}}); + expected.push_back(LCW{LCW{}, {{-1, -2, -3, -4, -5}, valids}, {-10}}); + expected.push_back(LCW{{{-100, -200}, valids}}); + + auto result = Split(list, splits); + EXPECT_EQ(expected.size(), result.size()); + + for (unsigned long index = 0; index < result.size(); index++) { + Compare(expected[index], result[index]); + } + } else { + auto result = Split(list, {}); + EXPECT_EQ(1, result.size()); + Compare(list, result[0]); } } @@ -868,26 +885,32 @@ void split_lists_with_nulls(SplitFunc Split, CompareFunc Compare) {LCW{}}, {{-10}, {-100, -200}}}; - std::vector splits{1, 3, 4}; - - std::vector> expected; - expected.push_back(LCW{{{{1, 2, 3}, valids}, {4, 5}}}); - expected.push_back(LCW{{{LCW{}, LCW{}, {7, 8}, LCW{}}, valids}, {{{6}}}}); - expected.push_back(LCW{{{{7, 8}, {{9, 10, 11}, valids}, LCW{}}, valids}}); - expected.push_back( - LCW{{{LCW{}, {-1, -2, -3, -4, -5}}, valids}, {LCW{}}, {{-10}, {-100, -200}}}); - - auto result = Split(list, splits); - EXPECT_EQ(expected.size(), result.size()); - - for (unsigned long index = 0; index < result.size(); index++) { - Compare(expected[index], result[index]); + if (split) { + std::vector splits{1, 3, 4}; + + std::vector> expected; + expected.push_back(LCW{{{{1, 2, 3}, valids}, {4, 5}}}); + expected.push_back(LCW{{{LCW{}, LCW{}, {7, 8}, LCW{}}, valids}, {{{6}}}}); + expected.push_back(LCW{{{{7, 8}, {{9, 10, 11}, valids}, LCW{}}, valids}}); + expected.push_back( + LCW{{{LCW{}, {-1, -2, -3, -4, -5}}, valids}, {LCW{}}, {{-10}, {-100, -200}}}); + + auto result = Split(list, splits); + EXPECT_EQ(expected.size(), result.size()); + + for (unsigned long index = 0; index < result.size(); index++) { + Compare(expected[index], result[index]); + } + } else { + auto result = Split(list, {}); + 
EXPECT_EQ(1, result.size()); + Compare(list, result[0]); } } } template -void split_structs(bool include_validity, SplitFunc Split, CompareFunc Compare) +void split_structs(bool include_validity, SplitFunc Split, CompareFunc Compare, bool split = true) { // 1. String "names" column. std::vector names{ @@ -921,7 +944,8 @@ void split_structs(bool include_validity, SplitFunc Split, CompareFunc Compare) : cudf::test::structs_column_wrapper({names_column, ages_column, is_human_col}); // split - std::vector splits{0, 1, 3, 8}; + std::vector splits; + if (split) { splits = std::vector({0, 1, 3, 8}); } auto result = Split(struct_column, splits); // expected outputs @@ -952,20 +976,26 @@ void split_structs(bool include_validity, SplitFunc Split, CompareFunc Compare) } template -void split_structs_no_children(SplitFunc Split, CompareFunc Compare) +void split_structs_no_children(SplitFunc Split, CompareFunc Compare, bool split = true) { // no nulls { auto struct_column = cudf::make_structs_column(4, {}, 0, rmm::device_buffer{}); - auto expected = cudf::make_structs_column(2, {}, 0, rmm::device_buffer{}); - - // split - std::vector splits{2}; - auto result = Split(*struct_column, splits); - - EXPECT_EQ(result.size(), 2ul); - Compare(*expected, result[0]); - Compare(*expected, result[1]); + if (split) { + auto expected = cudf::make_structs_column(2, {}, 0, rmm::device_buffer{}); + + // split + std::vector splits{2}; + auto result = Split(*struct_column, splits); + + EXPECT_EQ(result.size(), 2ul); + Compare(*expected, result[0]); + Compare(*expected, result[1]); + } else { + auto result = Split(*struct_column, {}); + EXPECT_EQ(1, result.size()); + Compare(*struct_column, result[0]); + } } // all nulls @@ -975,33 +1005,45 @@ void split_structs_no_children(SplitFunc Split, CompareFunc Compare) cudf::test::detail::make_null_mask(struct_validity.begin(), struct_validity.end()); auto struct_column = cudf::make_structs_column(4, {}, null_count, std::move(null_mask)); - std::vector expected_validity{false, false}; - std::tie(null_mask, null_count) = - cudf::test::detail::make_null_mask(expected_validity.begin(), expected_validity.end()); - auto expected = cudf::make_structs_column(2, {}, null_count, std::move(null_mask)); - - // split - std::vector splits{2}; - auto result = Split(*struct_column, splits); - - EXPECT_EQ(result.size(), 2ul); - Compare(*expected, result[0]); - Compare(*expected, result[1]); + if (split) { + std::vector expected_validity{false, false}; + std::tie(null_mask, null_count) = + cudf::test::detail::make_null_mask(expected_validity.begin(), expected_validity.end()); + auto expected = cudf::make_structs_column(2, {}, null_count, std::move(null_mask)); + + // split + std::vector splits{2}; + auto result = Split(*struct_column, splits); + + EXPECT_EQ(result.size(), 2ul); + Compare(*expected, result[0]); + Compare(*expected, result[1]); + } else { + auto result = Split(*struct_column, {}); + EXPECT_EQ(1, result.size()); + Compare(*struct_column, result[0]); + } } // no nulls, empty output column { auto struct_column = cudf::make_structs_column(4, {}, 0, rmm::device_buffer{}); - auto expected0 = cudf::make_structs_column(4, {}, 0, rmm::device_buffer{}); - auto expected1 = cudf::make_structs_column(0, {}, 0, rmm::device_buffer{}); - - // split - std::vector splits{4}; - auto result = Split(*struct_column, splits); - - EXPECT_EQ(result.size(), 2ul); - Compare(*expected0, result[0]); - Compare(*expected1, result[1]); + if (split) { + auto expected0 = cudf::make_structs_column(4, {}, 0, 
rmm::device_buffer{}); + auto expected1 = cudf::make_structs_column(0, {}, 0, rmm::device_buffer{}); + + // split + std::vector splits{4}; + auto result = Split(*struct_column, splits); + + EXPECT_EQ(result.size(), 2ul); + Compare(*expected0, result[0]); + Compare(*expected1, result[1]); + } else { + auto result = Split(*struct_column, {}); + EXPECT_EQ(1, result.size()); + Compare(*struct_column, result[0]); + } } // all nulls, empty output column @@ -1011,25 +1053,31 @@ void split_structs_no_children(SplitFunc Split, CompareFunc Compare) cudf::test::detail::make_null_mask(struct_validity.begin(), struct_validity.end()); auto struct_column = cudf::make_structs_column(4, {}, null_count, std::move(null_mask)); - std::vector expected_validity0{false, false, false, false}; - std::tie(null_mask, null_count) = - cudf::test::detail::make_null_mask(expected_validity0.begin(), expected_validity0.end()); - auto expected0 = cudf::make_structs_column(4, {}, null_count, std::move(null_mask)); - - auto expected1 = cudf::make_structs_column(0, {}, 0, rmm::device_buffer{}); - - // split - std::vector splits{4}; - auto result = Split(*struct_column, splits); - - EXPECT_EQ(result.size(), 2ul); - Compare(*expected0, result[0]); - Compare(*expected1, result[1]); + if (split) { + std::vector expected_validity0{false, false, false, false}; + std::tie(null_mask, null_count) = + cudf::test::detail::make_null_mask(expected_validity0.begin(), expected_validity0.end()); + auto expected0 = cudf::make_structs_column(4, {}, null_count, std::move(null_mask)); + + auto expected1 = cudf::make_structs_column(0, {}, 0, rmm::device_buffer{}); + + // split + std::vector splits{4}; + auto result = Split(*struct_column, splits); + + EXPECT_EQ(result.size(), 2ul); + Compare(*expected0, result[0]); + Compare(*expected1, result[1]); + } else { + auto result = Split(*struct_column, {}); + EXPECT_EQ(1, result.size()); + Compare(*struct_column, result[0]); + } } } template -void split_nested_struct_of_list(SplitFunc Split, CompareFunc Compare) +void split_nested_struct_of_list(SplitFunc Split, CompareFunc Compare, bool split = true) { // Struct> using LCW = cudf::test::lists_column_wrapper; @@ -1064,33 +1112,134 @@ void split_nested_struct_of_list(SplitFunc Split, CompareFunc Compare) auto struct_column = cudf::test::structs_column_wrapper({names_column, ages_column, list}, struct_validity.begin()); - // split - std::vector splits{1, 3, 8}; - auto result = Split(struct_column, splits); + if (split) { + std::vector splits{1, 3, 8}; + auto result = Split(struct_column, splits); + // expected results + auto expected_names = create_expected_string_columns_for_splits(names, splits, names_validity); + auto expected_ages = create_expected_columns_for_splits(splits, ages, ages_validity); + std::vector> expected_lists; + expected_lists.push_back(LCW({{{1, 2, 3}, {4}}})); + expected_lists.push_back(LCW({{{-1, -2}, LCW{}}, LCW{}})); + std::vector ex_v{1, 1, 0, 1, 0}; + expected_lists.push_back(LCW({{{10}, {20, 30, 40}, {100, -100}}, + {LCW{}, LCW{}, {8, 9}}, + LCW{}, + {{8}, {10, 9, 8, 7, 6, 5}}, + {{5, 6}, LCW{}, {8}}}, + ex_v.begin())); + expected_lists.push_back(LCW({{LCW{-3, 4, -5}}})); + + auto expected_struct_validity = create_expected_validity(splits, struct_validity); + EXPECT_EQ(expected_names.size(), result.size()); + + for (std::size_t index = 0; index < result.size(); index++) { + auto expected = cudf::test::structs_column_wrapper( + {expected_names[index], expected_ages[index], expected_lists[index]}, + 
expected_struct_validity[index]); + Compare(expected, result[index]); + } + } else { + auto result = Split(struct_column, {}); + Compare(struct_column, result[0]); + } +} - // expected results - auto expected_names = create_expected_string_columns_for_splits(names, splits, names_validity); - auto expected_ages = create_expected_columns_for_splits(splits, ages, ages_validity); - std::vector> expected_lists; - expected_lists.push_back(LCW({{{1, 2, 3}, {4}}})); - expected_lists.push_back(LCW({{{-1, -2}, LCW{}}, LCW{}})); - std::vector ex_v{1, 1, 0, 1, 0}; - expected_lists.push_back(LCW({{{10}, {20, 30, 40}, {100, -100}}, - {LCW{}, LCW{}, {8, 9}}, - LCW{}, - {{8}, {10, 9, 8, 7, 6, 5}}, - {{5, 6}, LCW{}, {8}}}, - ex_v.begin())); - expected_lists.push_back(LCW({{LCW{-3, 4, -5}}})); +template +void split_nested_list_of_structs(SplitFunc Split, CompareFunc Compare, bool split = true) +{ + // List> + using LCW = cudf::test::lists_column_wrapper; - auto expected_struct_validity = create_expected_validity(splits, struct_validity); - EXPECT_EQ(expected_names.size(), result.size()); + // 1. String "names" column. + std::vector names{"Vimes", + "Carrot", + "Angua", + "Cheery", + "Detritus", + "Slant", + "Fred", + "Todd", + "Kevin", + "Jason", + "Clark", + "Bob", + "Mithun", + "Sameer", + "Tim", + "Mark", + "Herman", + "Will"}; + std::vector names_validity{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + cudf::test::strings_column_wrapper names_column(names.begin(), names.end()); - for (std::size_t index = 0; index < result.size(); index++) { - auto expected = cudf::test::structs_column_wrapper( - {expected_names[index], expected_ages[index], expected_lists[index]}, - expected_struct_validity[index]); - Compare(expected, result[index]); + // 2. Numeric "ages" column. + std::vector ages{5, 10, 15, 20, 25, 30, 100, 101, 102, 26, 64, 12, 17, 16, 120, 44, 23, 50}; + std::vector ages_validity = {1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0}; + auto ages_column = + cudf::test::fixed_width_column_wrapper(ages.begin(), ages.end(), ages_validity.begin()); + + // 3. 
List column + std::vector list_validity{1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1}; + cudf::test::lists_column_wrapper list( + {{"ab", "cd", "ef"}, + LCW{"gh"}, + {"ijk", "lmn"}, + LCW{}, + LCW{"o"}, + {"pqr", "stu", "vwx"}, + {"yz", "aaaa"}, + LCW{"bbbb"}, + {"cccc", "ddd", "eee", "fff", "ggg", "hh"}, + {"b", "cdr", "efh", "um"}, + LCW{"gh", "iu"}, + {"lmn"}, + LCW{"org"}, + LCW{}, + {"stu", "vwx"}, + {"yz", "aaaa", "kem"}, + LCW{"bbbb"}, + {"cccc", "eee", "faff", "jiea", "fff", "ggg", "hh"}}, + list_validity.begin()); + + // Assembly struct column + auto const struct_validity = + std::vector{1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1}; + auto struct_column = + cudf::test::structs_column_wrapper({names_column, ages_column, list}, struct_validity.begin()); + + // wrap in a list + std::vector outer_offsets{0, 3, 4, 8, 13, 16, 17, 18}; + cudf::test::fixed_width_column_wrapper outer_offsets_col(outer_offsets.begin(), + outer_offsets.end()); + std::vector outer_validity{1, 1, 1, 0, 1, 1, 0}; + auto [outer_null_mask, outer_null_count] = + cudf::test::detail::make_null_mask(outer_validity.begin(), outer_validity.end()); + auto outer_list = make_lists_column(static_cast(outer_validity.size()), + outer_offsets_col.release(), + struct_column.release(), + outer_null_count, + std::move(outer_null_mask)); + if (split) { + std::vector splits{1, 3, 7}; + cudf::table_view tbl({static_cast(*outer_list)}); + + // we are testing the results of contiguous_split against regular cudf::split, which may seem + // weird. however, cudf::split() is a simple operation that just sets offsets at the topmost + // output column, whereas contiguous_split is a deep copy of the data to contiguous output + // buffers. so as long as we believe the comparison code (expect_columns_equivalent) can compare + // these outputs correctly, this should be safe. 
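+    // (cudf::split returns lightweight column_view slices over the original data, while
+    // the Split function under test returns packed_table results backed by newly copied
+    // contiguous buffers; the Compare callback checks logical equality between the two)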
+ auto result = Split(*outer_list, splits); + auto expected = cudf::split(static_cast(*outer_list), splits); + CUDF_EXPECTS(result.size() == expected.size(), "Split result size mismatch"); + + for (std::size_t index = 0; index < result.size(); index++) { + Compare(expected[index], result[index]); + } + } else { + auto result = Split(*outer_list, {}); + EXPECT_EQ(1, result.size()); + Compare(*outer_list, result[0]); } } @@ -1165,6 +1314,47 @@ TEST_F(SplitNestedTypesTest, StructsOfList) template struct ContiguousSplitTest : public cudf::test::BaseFixture {}; +std::vector do_chunked_pack(cudf::table_view const& input) +{ + auto mr = rmm::mr::get_current_device_resource(); + + rmm::device_buffer bounce_buff(1 * 1024 * 1024, cudf::get_default_stream(), mr); + auto bounce_buff_span = + cudf::device_span(static_cast(bounce_buff.data()), bounce_buff.size()); + + auto chunked_pack = cudf::make_chunked_pack(input, bounce_buff_span.size(), mr); + + // right size the final buffer + rmm::device_buffer final_buff( + chunked_pack->get_total_contiguous_size(), cudf::get_default_stream(), mr); + + std::size_t final_buff_offset = 0; + while (chunked_pack->has_next()) { + auto bytes_copied = chunked_pack->next(bounce_buff_span); + cudaMemcpyAsync((uint8_t*)final_buff.data() + final_buff_offset, + bounce_buff.data(), + bytes_copied, + cudaMemcpyDefault, + cudf::get_default_stream()); + final_buff_offset += bytes_copied; + } + + auto packed_column_metas = chunked_pack->build_metadata(); + // for chunked contig split, this is going to be a size 1 vector if we have + // results, or a size 0 if the original table was empty (no columns) + std::vector result; + if (packed_column_metas) { + result = std::vector(1); + auto pc = cudf::packed_columns(std::move(packed_column_metas), + std::make_unique(std::move(final_buff))); + + auto unpacked = cudf::unpack(pc); + cudf::packed_table pt{std::move(unpacked), std::move(pc)}; + result[0] = std::move(pt); + } + return result; +} + // the various utility functions in slice_tests.cuh don't like the chrono types using FixedWidthTypesWithoutChrono = cudf::test::Concat; @@ -1208,6 +1398,41 @@ TYPED_TEST(ContiguousSplitTest, LongColumn) false); } +TYPED_TEST(ContiguousSplitTest, LongColumnChunked) +{ + split_custom_column( + [](cudf::table_view const& t, std::vector const&) { + return do_chunked_pack(t); + }, + [](cudf::table_view const& expected, cudf::packed_table const& result) { + std::for_each(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(expected.num_columns()), + [&expected, &result](cudf::size_type i) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected.column(i), + result.table.column(i)); + }); + }, + 100002, + {}, + true); + + split_custom_column( + [](cudf::table_view const& t, std::vector const&) { + return do_chunked_pack(t); + }, + [](cudf::table_view const& expected, cudf::packed_table const& result) { + std::for_each(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(expected.num_columns()), + [&expected, &result](cudf::size_type i) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected.column(i), + result.table.column(i)); + }); + }, + 100002, + {}, + false); +} + TYPED_TEST(ContiguousSplitTest, LongColumnBigSplits) { split_custom_column( @@ -1309,6 +1534,46 @@ TEST_F(ContiguousSplitUntypedTest, ProgressiveSizes) } } +TEST_F(ContiguousSplitUntypedTest, ProgressiveSizesChunked) +{ + constexpr int col_size = 4096; + + // stress test copying a wide amount of bytes. 
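+  // each pass below packs the same 4096-row table through do_chunked_pack's fixed 1MB
+  // bounce buffer; the loop mirrors the non-chunked ProgressiveSizes test above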
+ for (int idx = 2048; idx < col_size; idx += 128) { + split_custom_column( + [](cudf::table_view const& t, std::vector const&) { + return do_chunked_pack(t); + }, + [](cudf::table_view const& expected, cudf::packed_table const& result) { + std::for_each(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(expected.num_columns()), + [&expected, &result](cudf::size_type i) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected.column(i), + result.table.column(i)); + }); + }, + col_size, + {}, + true); + + split_custom_column( + [](cudf::table_view const& t, std::vector const&) { + return do_chunked_pack(t); + }, + [](cudf::table_view const& expected, cudf::packed_table const& result) { + std::for_each(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(expected.num_columns()), + [&expected, &result](cudf::size_type i) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected.column(i), + result.table.column(i)); + }); + }, + col_size, + {}, + false); + } +} + TEST_F(ContiguousSplitUntypedTest, ValidityRepartition) { // it is tricky to actually get the internal repartitioning/load-balancing code to add new splits @@ -1336,6 +1601,25 @@ TEST_F(ContiguousSplitUntypedTest, ValidityRepartition) } } +TEST_F(ContiguousSplitUntypedTest, ValidityRepartitionChunked) +{ + srand(0); + auto rvalids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { + return static_cast(rand()) / static_cast(RAND_MAX) < 0.5f ? 0 : 1; + }); + cudf::size_type const num_rows = 2000000; + auto col = cudf::sequence(num_rows, cudf::numeric_scalar{0}); + auto [null_mask, null_count] = cudf::test::detail::make_null_mask(rvalids, rvalids + num_rows); + col->set_null_mask(std::move(null_mask), null_count); + + cudf::table_view t({*col}); + auto result = do_chunked_pack(t); + auto& expected = t; + EXPECT_EQ(1, result.size()); + + CUDF_TEST_EXPECT_TABLES_EQUAL(result[0].table, expected); +} + TEST_F(ContiguousSplitUntypedTest, ValidityEdgeCase) { // tests an edge case where the splits cause the final validity data to be copied @@ -1361,6 +1645,16 @@ TEST_F(ContiguousSplitUntypedTest, DISABLED_VeryLargeColumnTest) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*col, result[0].table.column(0)); } +// This test requires about 25GB of device memory when used with the arena allocator +TEST_F(ContiguousSplitUntypedTest, DISABLED_VeryLargeColumnTestChunked) +{ + // tests an edge case where buf.elements * buf.element_size overflows an INT32. 
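+  // 400 * 1024 * 1024 INT64 rows is 8 * 419,430,400 = 3,355,443,200 bytes of data,
+  // which exceeds INT32_MAX (2,147,483,647)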
+ auto col = cudf::make_fixed_width_column( + cudf::data_type{cudf::type_id::INT64}, 400 * 1024 * 1024, cudf::mask_state::UNALLOCATED); + auto result = do_chunked_pack(cudf::table_view{{*col}}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*col, result[0].table.column(0)); +} + // contiguous split with strings struct ContiguousSplitStringTableTest : public SplitTest {}; @@ -1375,6 +1669,18 @@ TEST_F(ContiguousSplitStringTableTest, StringWithInvalids) }); } +TEST_F(ContiguousSplitStringTableTest, StringWithInvalidsChunked) +{ + split_string_with_invalids( + [](cudf::table_view const& t, std::vector const&) { + return do_chunked_pack(t); + }, + [](cudf::table_view const& expected, cudf::packed_table const& result) { + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.table); + }, + {}); +} + TEST_F(ContiguousSplitStringTableTest, EmptyInputColumn) { // build a bunch of empty stuff @@ -1397,6 +1703,13 @@ TEST_F(ContiguousSplitStringTableTest, EmptyInputColumn) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(src_table, result[0].table); } + { + auto result = do_chunked_pack(src_table); + CUDF_EXPECTS(result.size() == 1, "Incorrect returned contiguous_split result size!"); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(src_table, result[0].table); + } + { std::vector splits{0, 0, 0, 0}; auto result = cudf::contiguous_split(src_table, splits); @@ -1417,6 +1730,16 @@ TEST_F(ContiguousSplitStringTableTest, EmptyOutputColumn) [](cudf::packed_table const& t, int num_cols) { EXPECT_EQ(t.table.num_columns(), num_cols); }); } +TEST_F(ContiguousSplitStringTableTest, EmptyOutputColumnChunked) +{ + split_empty_output_strings_column_value( + [](cudf::table_view const& t, std::vector const&) { + return do_chunked_pack(t); + }, + [](cudf::packed_table const& t, int num_cols) { EXPECT_EQ(t.table.num_columns(), num_cols); }, + {}); +} + TEST_F(ContiguousSplitStringTableTest, NullStringColumn) { split_null_input_strings_column_value( @@ -1464,6 +1787,13 @@ TEST_F(ContiguousSplitTableCornerCases, EmptyTable) }); } +TEST_F(ContiguousSplitTableCornerCases, EmptyTableChunked) +{ + split_empty_table([](cudf::table_view const& t, + std::vector const&) { return do_chunked_pack(t); }, + {}); +} + TEST_F(ContiguousSplitTableCornerCases, EmptyIndices) { split_empty_indices([](cudf::table_view const& t, std::vector const& splits) { @@ -1501,6 +1831,16 @@ TEST_F(ContiguousSplitTableCornerCases, EmptyOutputColumn) [](cudf::packed_table const& t, int num_cols) { EXPECT_EQ(t.table.num_columns(), num_cols); }); } +TEST_F(ContiguousSplitTableCornerCases, EmptyOutputColumnChunked) +{ + split_empty_output_column_value( + [](cudf::table_view const& t, std::vector const&) { + return do_chunked_pack(t); + }, + [](cudf::packed_table const& t, int num_cols) { EXPECT_EQ(t.table.num_columns(), num_cols); }, + {}); +} + TEST_F(ContiguousSplitTableCornerCases, MixedColumnTypes) { cudf::size_type start = 0; @@ -1542,6 +1882,70 @@ TEST_F(ContiguousSplitTableCornerCases, MixedColumnTypes) } } +TEST_F(ContiguousSplitTableCornerCases, MixedColumnTypesChunked) +{ + cudf::size_type start = 0; + auto valids = cudf::detail::make_counting_transform_iterator(start, [](auto i) { return true; }); + + std::size_t num_rows = 1000000; + + std::vector strings1(num_rows); + std::vector strings2(num_rows); + strings1[0] = ""; + strings2[0] = ""; + for (std::size_t i = 1; i < num_rows; ++i) { + auto str = std::to_string(i); + strings1[i] = str; + strings2[i] = str; + } + + std::vector> cols; + + auto iter0 = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i); }); + 
auto c0 = cudf::test::fixed_width_column_wrapper(iter0, iter0 + num_rows, valids); + cols.push_back(c0.release()); + + auto iter1 = cudf::detail::make_counting_transform_iterator(10, [](auto i) { return (i); }); + auto c1 = cudf::test::fixed_width_column_wrapper(iter1, iter1 + num_rows, valids); + cols.push_back(c1.release()); + + auto c2 = cudf::test::strings_column_wrapper(strings1.begin(), strings1.end(), valids); + cols.push_back(c2.release()); + + auto c3 = cudf::test::strings_column_wrapper(strings2.begin(), strings2.end(), valids); + cols.push_back(c3.release()); + + auto iter4 = cudf::detail::make_counting_transform_iterator(20, [](auto i) { return (i); }); + auto c4 = cudf::test::fixed_width_column_wrapper(iter4, iter4 + num_rows, valids); + cols.push_back(c4.release()); + + auto tbl = cudf::table(std::move(cols)); + auto results = do_chunked_pack(tbl.view()); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(tbl, results[0].table); +} + +TEST_F(ContiguousSplitTableCornerCases, MixedColumnTypesSingleRowChunked) +{ + cudf::size_type start = 0; + auto valids = cudf::detail::make_counting_transform_iterator(start, [](auto i) { return true; }); + + std::size_t num_rows = 1; + + std::vector> cols; + + auto iter0 = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i); }); + auto c0 = cudf::test::fixed_width_column_wrapper(iter0, iter0 + num_rows, valids); + cols.push_back(c0.release()); + + auto iter1 = cudf::detail::make_counting_transform_iterator(1, [](auto i) { return (i); }); + auto c1 = cudf::test::fixed_width_column_wrapper(iter1, iter1 + num_rows); + cols.push_back(c1.release()); + + auto tbl = cudf::table(std::move(cols)); + auto results = do_chunked_pack(tbl.view()); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(tbl, results[0].table); +} + TEST_F(ContiguousSplitTableCornerCases, PreSplitTable) { auto valids = @@ -1592,6 +1996,13 @@ TEST_F(ContiguousSplitTableCornerCases, PreSplitTable) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected[index], result[index].table); } } + + { + auto result = do_chunked_pack(pre_split[1]); + EXPECT_EQ(1, result.size()); + auto expected = pre_split[1]; + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected, result[0].table); + } } TEST_F(ContiguousSplitTableCornerCases, PreSplitTableLarge) @@ -1766,6 +2177,8 @@ TEST_F(ContiguousSplitTableCornerCases, NestedEmpty) std::vector splits2({1}); EXPECT_NO_THROW(contiguous_split(src_table, splits2)); + + EXPECT_NO_THROW(do_chunked_pack(src_table)); } // this produces an empty strings column with children that have no data, @@ -1784,6 +2197,8 @@ TEST_F(ContiguousSplitTableCornerCases, NestedEmpty) std::vector splits2({1}); EXPECT_NO_THROW(contiguous_split(src_table, splits2)); + + EXPECT_NO_THROW(do_chunked_pack(src_table)); } // this produces an empty lists column with children that have no data, @@ -1802,6 +2217,8 @@ TEST_F(ContiguousSplitTableCornerCases, NestedEmpty) std::vector splits2({1}); EXPECT_NO_THROW(contiguous_split(src_table, splits2)); + + EXPECT_NO_THROW(do_chunked_pack(src_table)); } // this produces an empty lists column with children that have no data, @@ -1820,6 +2237,8 @@ TEST_F(ContiguousSplitTableCornerCases, NestedEmpty) std::vector splits2({1}); EXPECT_NO_THROW(contiguous_split(src_table, splits2)); + + EXPECT_NO_THROW(do_chunked_pack(src_table)); } // this produces an empty struct column with children that have no data, @@ -1840,6 +2259,8 @@ TEST_F(ContiguousSplitTableCornerCases, NestedEmpty) std::vector splits2({1}); EXPECT_NO_THROW(contiguous_split(src_table, splits2)); + + 
EXPECT_NO_THROW(do_chunked_pack(src_table));
   }
 }
 
@@ -1868,6 +2289,12 @@ TEST_F(ContiguousSplitTableCornerCases, SplitEmpty)
     CUDF_TEST_EXPECT_TABLES_EQUIVALENT(sliced[0], result[0].table);
   }
 
+  {
+    auto result = do_chunked_pack(sliced[0]);
+    EXPECT_EQ(1, result.size());
+    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(sliced[0], result[0].table);
+  }
+
   {
     auto result = cudf::contiguous_split(sliced[0], {0});
     CUDF_TEST_EXPECT_TABLES_EQUIVALENT(sliced[0], result[0].table);
@@ -1878,6 +2305,52 @@ TEST_F(ContiguousSplitTableCornerCases, SplitEmpty)
   }
 }
 
+TEST_F(ContiguousSplitTableCornerCases, OutBufferTooSmall)
+{
+  // internally, contiguous split chunks GPU work in 1MB contiguous copies,
+  // so the output buffer must be 1MB or larger
+  EXPECT_THROW(cudf::make_chunked_pack({}, 1 * 1024, mr()), cudf::logic_error);
+}
+
+TEST_F(ContiguousSplitTableCornerCases, ChunkSpanTooSmall)
+{
+  auto chunked_pack = cudf::make_chunked_pack({}, 1 * 1024 * 1024, mr());
+  rmm::device_buffer buff(1 * 1024, cudf::get_default_stream(), mr());
+  cudf::device_span<uint8_t> too_small(static_cast<uint8_t*>(buff.data()), buff.size());
+  std::size_t copied = 0;
+  // throws because we created chunked_pack with a 1MB buffer size, but we are giving
+  // it a 1KB span here
+  EXPECT_THROW(copied = chunked_pack->next(too_small), cudf::logic_error);
+  EXPECT_EQ(copied, 0);
+}
+
+TEST_F(ContiguousSplitTableCornerCases, EmptyTableHasNextFalse)
+{
+  auto chunked_pack = cudf::make_chunked_pack({}, 1 * 1024 * 1024, mr());
+  rmm::device_buffer buff(1 * 1024 * 1024, cudf::get_default_stream(), mr());
+  cudf::device_span<uint8_t> bounce_buff(static_cast<uint8_t*>(buff.data()), buff.size());
+  EXPECT_EQ(chunked_pack->has_next(), false);  // empty input table
+  std::size_t copied = 0;
+  EXPECT_THROW(copied = chunked_pack->next(bounce_buff), cudf::logic_error);
+  EXPECT_EQ(copied, 0);
+}
+
+TEST_F(ContiguousSplitTableCornerCases, ExhaustedHasNextFalse)
+{
+  cudf::test::strings_column_wrapper a{"abc", "def", "ghi", "jkl", "mno", "", "st", "uvwx"};
+  cudf::table_view t({a});
+  rmm::device_buffer buff(1 * 1024 * 1024, cudf::get_default_stream(), mr());
+  cudf::device_span<uint8_t> bounce_buff(static_cast<uint8_t*>(buff.data()), buff.size());
+  auto chunked_pack = cudf::make_chunked_pack(t, buff.size(), mr());
+  EXPECT_EQ(chunked_pack->has_next(), true);
+  std::size_t copied = chunked_pack->next(bounce_buff);
+  EXPECT_EQ(copied, chunked_pack->get_total_contiguous_size());
+  EXPECT_EQ(chunked_pack->has_next(), false);
+  copied = 0;
+  EXPECT_THROW(copied = chunked_pack->next(bounce_buff), cudf::logic_error);
+  EXPECT_EQ(copied, 0);
+}
+
 struct ContiguousSplitNestedTypesTest : public cudf::test::BaseFixture {};
 
 TEST_F(ContiguousSplitNestedTypesTest, Lists)
 {
   split_lists(
     [](cudf::column_view const& c, std::vector<cudf::size_type> const& splits) {
       cudf::table_view t({c});
       return cudf::contiguous_split(t, splits);
     },
     [](cudf::column_view const& expected, cudf::packed_table const& result) {
       CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result.table.column(0));
     });
 }
 
+TEST_F(ContiguousSplitNestedTypesTest, ListsChunked)
+{
+  split_lists(
+    [](cudf::column_view const& c, std::vector<cudf::size_type> const&) {
+      cudf::table_view t({c});
+      return do_chunked_pack(t);
+    },
+    [](cudf::column_view const& expected, cudf::packed_table const& result) {
+      CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result.table.column(0));
+    },
+    /*split*/ false);
+}
+
 TEST_F(ContiguousSplitNestedTypesTest, ListsWithNulls)
 {
   split_lists_with_nulls(
     [](cudf::column_view const& c, std::vector<cudf::size_type> const& splits) {
       cudf::table_view t({c});
       return cudf::contiguous_split(t, splits);
     },
     [](cudf::column_view const& expected, cudf::packed_table const& result) {
       CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result.table.column(0));
     });
 }
 
+TEST_F(ContiguousSplitNestedTypesTest, ListsWithNullsChunked)
+{
+  split_lists_with_nulls(
+    [](cudf::column_view const& c, std::vector<cudf::size_type> const&) {
+      cudf::table_view t({c});
+      return do_chunked_pack(t);
+    },
+    [](cudf::column_view const& expected, cudf::packed_table const&
result) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result.table.column(0)); + }, + /*split*/ false); +} + TEST_F(ContiguousSplitNestedTypesTest, Structs) { split_structs( @@ -1917,6 +2416,20 @@ TEST_F(ContiguousSplitNestedTypesTest, Structs) }); } +TEST_F(ContiguousSplitNestedTypesTest, StructsChunked) +{ + split_structs( + false, + [](cudf::column_view const& c, std::vector const&) { + cudf::table_view t({c}); + return do_chunked_pack(t); + }, + [](cudf::column_view const& expected, cudf::packed_table const& result) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result.table.column(0)); + }, + /*split*/ false); +} + TEST_F(ContiguousSplitNestedTypesTest, StructsWithNulls) { split_structs( @@ -1930,6 +2443,20 @@ TEST_F(ContiguousSplitNestedTypesTest, StructsWithNulls) }); } +TEST_F(ContiguousSplitNestedTypesTest, StructsWithNullsChunked) +{ + split_structs( + true, + [](cudf::column_view const& c, std::vector const&) { + cudf::table_view t({c}); + return do_chunked_pack(t); + }, + [](cudf::column_view const& expected, cudf::packed_table const& result) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result.table.column(0)); + }, + {}); +} + TEST_F(ContiguousSplitNestedTypesTest, StructsNoChildren) { split_structs_no_children( @@ -1942,6 +2469,19 @@ TEST_F(ContiguousSplitNestedTypesTest, StructsNoChildren) }); } +TEST_F(ContiguousSplitNestedTypesTest, StructsNoChildrenChunked) +{ + split_structs_no_children( + [](cudf::column_view const& c, std::vector const&) { + cudf::table_view t({c}); + return do_chunked_pack(t); + }, + [](cudf::column_view const& expected, cudf::packed_table const& result) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result.table.column(0)); + }, + /*split*/ false); +} + TEST_F(ContiguousSplitNestedTypesTest, StructsOfList) { split_nested_struct_of_list( @@ -1954,95 +2494,40 @@ TEST_F(ContiguousSplitNestedTypesTest, StructsOfList) }); } -TEST_F(ContiguousSplitNestedTypesTest, ListOfStruct) +TEST_F(ContiguousSplitNestedTypesTest, StructsOfListChunked) { - // List> - using LCW = cudf::test::lists_column_wrapper; - - // 1. String "names" column. - std::vector names{"Vimes", - "Carrot", - "Angua", - "Cheery", - "Detritus", - "Slant", - "Fred", - "Todd", - "Kevin", - "Jason", - "Clark", - "Bob", - "Mithun", - "Sameer", - "Tim", - "Mark", - "Herman", - "Will"}; - std::vector names_validity{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - cudf::test::strings_column_wrapper names_column(names.begin(), names.end()); - - // 2. Numeric "ages" column. - std::vector ages{5, 10, 15, 20, 25, 30, 100, 101, 102, 26, 64, 12, 17, 16, 120, 44, 23, 50}; - std::vector ages_validity = {1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0}; - auto ages_column = - cudf::test::fixed_width_column_wrapper(ages.begin(), ages.end(), ages_validity.begin()); - - // 3. 
List column - std::vector list_validity{1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1}; - cudf::test::lists_column_wrapper list( - {{"ab", "cd", "ef"}, - LCW{"gh"}, - {"ijk", "lmn"}, - LCW{}, - LCW{"o"}, - {"pqr", "stu", "vwx"}, - {"yz", "aaaa"}, - LCW{"bbbb"}, - {"cccc", "ddd", "eee", "fff", "ggg", "hh"}, - {"b", "cdr", "efh", "um"}, - LCW{"gh", "iu"}, - {"lmn"}, - LCW{"org"}, - LCW{}, - {"stu", "vwx"}, - {"yz", "aaaa", "kem"}, - LCW{"bbbb"}, - {"cccc", "eee", "faff", "jiea", "fff", "ggg", "hh"}}, - list_validity.begin()); - - // Assembly struct column - auto const struct_validity = - std::vector{1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1}; - auto struct_column = - cudf::test::structs_column_wrapper({names_column, ages_column, list}, struct_validity.begin()); - - // wrap in a list - std::vector outer_offsets{0, 3, 4, 8, 13, 16, 17, 18}; - cudf::test::fixed_width_column_wrapper outer_offsets_col(outer_offsets.begin(), - outer_offsets.end()); - std::vector outer_validity{1, 1, 1, 0, 1, 1, 0}; - auto [outer_null_mask, null_count] = - cudf::test::detail::make_null_mask(outer_validity.begin(), outer_validity.end()); - auto outer_list = make_lists_column(static_cast(outer_validity.size()), - outer_offsets_col.release(), - struct_column.release(), - null_count, - std::move(outer_null_mask)); + split_nested_struct_of_list( + [](cudf::column_view const& c, std::vector const&) { + cudf::table_view t({c}); + return do_chunked_pack(t); + }, + [](cudf::column_view const& expected, cudf::packed_table const& result) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result.table.column(0)); + }, + /*split*/ false); +} - // split - std::vector splits{1, 3, 7}; - cudf::table_view tbl({static_cast(*outer_list)}); - - // we are testing the results of contiguous_split against regular cudf::split, which may seem - // weird. however, cudf::split() is a simple operation that just sets offsets at the topmost - // output column, whereas contiguous_split is a deep copy of the data to contiguous output - // buffers. so as long as we believe the comparison code (expect_columns_equivalent) can compare - // these outputs correctly, this should be safe. 
- auto result = cudf::contiguous_split(tbl, splits); - auto expected = cudf::split(static_cast(*outer_list), splits); - CUDF_EXPECTS(result.size() == expected.size(), "Split result size mismatch"); +TEST_F(ContiguousSplitNestedTypesTest, ListOfStruct) +{ + split_nested_list_of_structs( + [](cudf::column_view const& c, std::vector const& splits) { + cudf::table_view t({c}); + return cudf::contiguous_split(t, splits); + }, + [](cudf::column_view const& expected, cudf::packed_table const& result) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result.table.column(0)); + }); +} - for (std::size_t index = 0; index < result.size(); index++) { - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected[index], result[index].table.column(0)); - } +TEST_F(ContiguousSplitNestedTypesTest, ListOfStructChunked) +{ + split_nested_list_of_structs( + [](cudf::column_view const& c, std::vector const&) { + cudf::table_view t({c}); + return do_chunked_pack(t); + }, + [](cudf::column_view const& expected, cudf::packed_table const& result) { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, result.table.column(0)); + }, + /*split*/ false); } From 3ea2844ffa9bbfd3ce756d001eb90af6b91e0243 Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Mon, 1 May 2023 16:48:05 -0500 Subject: [PATCH 02/31] Fix code style --- cpp/src/copying/contiguous_split.cu | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index e94ce1d8156..4489428b32b 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -1523,9 +1523,7 @@ std::unique_ptr make_chunk_iteration_state( size_of_batches_per_iteration.push_back(iter_size_bytes); accum_size_per_iteration.push_back(accum_size); - if (next_iteration_it == h_offsets.end()) { - break; - } + if (next_iteration_it == h_offsets.end()) { break; } current_offset_it += batches_in_iter; } From fe3bd771cff4c60fcbe23e5832a6985478259699 Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Sat, 6 May 2023 06:19:07 -0500 Subject: [PATCH 03/31] Const fixes. Structured binding. Use buffer instead of bounce buffer --- cpp/include/cudf/contiguous_split.hpp | 21 ++++++++++++++------- cpp/src/copying/contiguous_split.cu | 12 ++++++------ 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/cpp/include/cudf/contiguous_split.hpp b/cpp/include/cudf/contiguous_split.hpp index eacfcfcb803..61edf6a3564 100644 --- a/cpp/include/cudf/contiguous_split.hpp +++ b/cpp/include/cudf/contiguous_split.hpp @@ -154,8 +154,15 @@ struct contiguous_split_state; * * auto stream = cudf::get_default_stream(); * - * // Define a bounce buffer size: the larger the bounce buffer is, the more SMs can be - * // occupied by this algorithm. + * // Define a buffer size for each chunk: the larger the buffer is, the more SMs can be + * // occupied by this algorithm. + * // + * // Internally, the GPU unit-of-work is a 1MB batch. When we instantiate `cudf::chunked_pack`, + * // all the 1MB batches for the source table_view are computed up front. Additionally, + * // chunked_pack calculates the number of iterations that are required to go through all those + * // batches given a `user_buffer_size` buffer. The number of 1MB batches in each iteration (chunk) + * // equals the number of CUDA blocks that will be used for the actual work. 
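+ * // As a rough illustration (arithmetic only, not a guarantee): with the 128MB
+ * // buffer chosen below, each iteration can cover up to 128 one-megabyte
+ * // batches, i.e. up to 128 CUDA blocks performing the copy per `next` call.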
+ * // * std::size_t user_buffer_size = 128*1024*1024; * * auto chunked_packer = make_chunked_pack(tv, user_buffer_size, stream, mr); @@ -187,7 +194,7 @@ class chunked_pack { * @brief Construct a `chunked_pack` class. * * @param input source `table_view` to pack - * @param user_buffer_size bounce buffer size (in bytes) that will be passed on `next`. Must be + * @param user_buffer_size buffer size (in bytes) that will be passed on `next`. Must be * at least 1MB * @param stream CUDA stream used for device memory operations and kernel launches * @param mr RMM memory resource to be used for temporary and scratch allocations only @@ -225,8 +232,8 @@ class chunked_pack { * @throws cudf::logic_error If the size of `user_buffer` is different than `user_buffer_size` * @throws cudf::logic_error If called after all chunks have been copied * - * @param user_buffer device span representing a bounce buffer. The size of this span - * must equal the `user_buffer_size` parameter passed at construction + * @param user_buffer device span target for the chunk. The size of this span must equal + * the `user_buffer_size` parameter passed at construction * @return The number of bytes that were written to `user_buffer` (at most * `user_buffer_size`) * @@ -247,13 +254,13 @@ class chunked_pack { /** * @brief Created a `chunked_pack` instance to perform a "pack" of the `table_view` - * "input", where a bounce buffer of `user_buffer_size` is filled with chunks of the + * "input", where a buffer of `user_buffer_size` is filled with chunks of the * overall operation. This operation can be used in cases where GPU memory is constrained. * * @throws cudf::logic_error When user_buffer_size is less than 1MB * * @param input source `table_view` to pack - * @param user_buffer_size bounce buffer size (in bytes) that will be passed on `next`. Must be + * @param user_buffer_size buffer size (in bytes) that will be passed on `next`. Must be * at least 1MB * @param mr RMM memory resource to be used for temporary and scratch allocations only * @return a unique_ptr of chunked_pack diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 4489428b32b..4674321436f 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -437,7 +437,7 @@ template size_type count_src_bufs(InputIter begin, InputIter end) { auto buf_iter = thrust::make_transform_iterator(begin, [](column_view const& col) { - auto children_counts = count_src_bufs(col.child_begin(), col.child_end()); + auto const children_counts = count_src_bufs(col.child_begin(), col.child_end()); return 1 + (col.nullable() ? 1 : 0) + children_counts; }); return std::accumulate(buf_iter, buf_iter + std::distance(begin, end), 0); @@ -793,7 +793,7 @@ BufInfo build_output_columns(InputIter begin, size_type col_size, null_count; int64_t bitmask_offset; int64_t data_offset; - std::tie(col_size, data_offset, bitmask_offset, null_count) = + auto [col_size, data_offset, bitmask_offset, null_count] = build_output_column_metadata(src, current_info, mb, false); auto bitmask_ptr = @@ -1504,7 +1504,7 @@ std::unique_ptr make_chunk_iteration_state( std::lower_bound(current_offset_it, h_offsets.end(), // We add the cumulative size + 1 because we want to find what would fit - // within a bounce buffer of user_buffer_size (up to user_buffer_size). + // within a buffer of user_buffer_size (up to user_buffer_size). 
// Since h_offsets is a prefix scan, we add the size we accumulated so // far so we are looking for the next user_buffer_sized boundary. user_buffer_size + accum_size + 1); @@ -1570,7 +1570,7 @@ std::unique_ptr make_chunk_iteration_state( } else { // we instantiate an "iteration state" for the regular single pass contiguous_split // consisting of 1 iteration with all of the batches and totalling `total_size` bytes. - auto total_size = std::reduce(h_buf_sizes, h_buf_sizes + num_partitions); + auto const total_size = std::reduce(h_buf_sizes, h_buf_sizes + num_partitions); // 1 iteration with the whole size return std::make_unique( @@ -1819,7 +1819,7 @@ struct contiguous_split_state { CUDF_EXPECTS(has_next(), "Cannot call contiguous_split_chunk with has_next() == false!"); std::size_t starting_batch, num_batches_to_copy; - std::tie(starting_batch, num_batches_to_copy) = + auto [starting_batch, num_batches_to_copy] = chunk_iter_state->get_current_starting_index_and_buff_count(); // perform the copy. @@ -1962,7 +1962,7 @@ struct contiguous_split_state { // contiguous_split will behave in a "chunked" mode in this scenario, as it will contiguously // copy up until the user's buffer size limit, exposing a next() call for the user to invoke. // Note that in this mode, contig split is not partitioning the original table, it is instead - // only placing cuDF buffers contiguously in the user's bounce buffer. + // only placing cuDF buffers contiguously in the user's buffer. // // - single shot contiguous_split (default): when the user doesn't provide their own buffer, // contiguous_split will allocate a buffer per partition and will place contiguous results in From 4a4da2f8d57ff68f17d48e6831f09c112aaf6516 Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Sat, 6 May 2023 06:20:16 -0500 Subject: [PATCH 04/31] Use rmm::device_uvector instead of rmm::device_buffer in benchmark --- cpp/benchmarks/copying/contiguous_split.cu | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/cpp/benchmarks/copying/contiguous_split.cu b/cpp/benchmarks/copying/contiguous_split.cu index 5b95370faf0..daa80cdd425 100644 --- a/cpp/benchmarks/copying/contiguous_split.cu +++ b/cpp/benchmarks/copying/contiguous_split.cu @@ -32,14 +32,12 @@ void contiguous_split(cudf::table_view const& src_table, std::vector const&) { - auto mr = rmm::mr::get_current_device_resource(); - auto stream = cudf::get_default_stream(); - rmm::device_buffer user_buffer(100L * 1024 * 1024, stream, mr); + auto const mr = rmm::mr::get_current_device_resource(); + auto const stream = cudf::get_default_stream(); + auto user_buffer = rmm::device_uvector(100L * 1024 * 1024, stream, mr); auto chunked_pack = cudf::make_chunked_pack(src_table, user_buffer.size(), mr); - auto user_buffer_span = - cudf::device_span(static_cast(user_buffer.data()), user_buffer.size()); while (chunked_pack->has_next()) { - auto iter_size = chunked_pack->next(user_buffer_span); + auto iter_size = chunked_pack->next(user_buffer); } stream.synchronize(); } From bd87dd5d3fa1039090c53af4d7cb7b6c9cd130e5 Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Sun, 7 May 2023 13:06:07 -0500 Subject: [PATCH 05/31] Address review comments --- cpp/include/cudf/contiguous_split.hpp | 6 +- cpp/src/copying/contiguous_split.cu | 176 +++++++++++++++----------- 2 files changed, 102 insertions(+), 80 deletions(-) diff --git a/cpp/include/cudf/contiguous_split.hpp b/cpp/include/cudf/contiguous_split.hpp index 61edf6a3564..fe6e7bc3e96 100644 --- 
a/cpp/include/cudf/contiguous_split.hpp +++ b/cpp/include/cudf/contiguous_split.hpp @@ -155,13 +155,13 @@ struct contiguous_split_state; * auto stream = cudf::get_default_stream(); * * // Define a buffer size for each chunk: the larger the buffer is, the more SMs can be - * // occupied by this algorithm. + * // occupied by this algorithm. * // * // Internally, the GPU unit-of-work is a 1MB batch. When we instantiate `cudf::chunked_pack`, - * // all the 1MB batches for the source table_view are computed up front. Additionally, + * // all the 1MB batches for the source table_view are computed up front. Additionally, * // chunked_pack calculates the number of iterations that are required to go through all those * // batches given a `user_buffer_size` buffer. The number of 1MB batches in each iteration (chunk) - * // equals the number of CUDA blocks that will be used for the actual work. + * // equals the number of CUDA blocks that will be used for the main kernel launch. * // * std::size_t user_buffer_size = 128*1024*1024; * diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 4674321436f..f75d1bf5297 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -51,10 +51,15 @@ namespace cudf { namespace { -// align all column size allocations to this boundary so that all output column buffers +// Align all column size allocations to this boundary so that all output column buffers // start at that alignment. static constexpr std::size_t split_align = 64; +// The size that contiguous split uses internally as the GPU unit of work. +// The number of `desired_batch_size` batches equals the number of CUDA blocks +// that will be used for the main kernel launch (`copy_partitions`). +static constexpr std::size_t desired_batch_size = 1 * 1024 * 1024; + /** * @brief Struct which contains information on a source buffer. * @@ -268,51 +273,13 @@ __device__ void copy_buffer(uint8_t* __restrict__ dst, * to be done as evenly as possible across the multiprocessors on the device. * This kernel is arranged such that each block copies 1 source/destination pair. * + * @param index_to_buffer A function that given a `buf_index` returns the destination buffer * @param src_bufs Input source buffers - * @param dst_bufs Destination buffers - * @param buf_info Information on the range of values to be copied for each destination buffer - */ -template -__global__ void copy_partitions(uint8_t const** src_bufs, - uint8_t** dst_bufs, - dst_buf_info* buf_info) -{ - auto const buf_index = blockIdx.x; - auto const src_buf_index = buf_info[buf_index].src_buf_index; - auto const dst_buf_index = buf_info[buf_index].dst_buf_index; - - // copy, shifting offsets and validity bits as needed - copy_buffer( - dst_bufs[dst_buf_index] + buf_info[buf_index].dst_offset, - src_bufs[src_buf_index], - threadIdx.x, - buf_info[buf_index].num_elements, - buf_info[buf_index].element_size, - buf_info[buf_index].src_element_index, - blockDim.x, - buf_info[buf_index].value_shift, - buf_info[buf_index].bit_shift, - buf_info[buf_index].num_rows, - buf_info[buf_index].valid_count > 0 ? &buf_info[buf_index].valid_count : nullptr); -} - -/** - * @brief Kernel which copies data from multiple source buffers to multiple - * destination buffers. - * - * When doing a contiguous_split on X columns comprising N total internal buffers - * with M splits, we end up having to copy N*M source/destination buffer pairs. 
- * These copies are further subdivided into batches to distribute the amount of work - * to be done as evenly as possible across the multiprocessors on the device. - * This kernel is arranged such that each block copies 1 source/destination pair. - * - * @param src_bufs Input source buffers - * @param dst_bufs Destination buffers * @param buf_info Information on the range of values to be copied for each destination buffer */ -template -__global__ void copy_partitions(uint8_t const** src_bufs, - uint8_t* user_buffer, +template +__global__ void copy_partitions(IndexToDstBuf index_to_buffer, + uint8_t const** src_bufs, dst_buf_info* buf_info) { auto const buf_index = blockIdx.x; @@ -320,7 +287,7 @@ __global__ void copy_partitions(uint8_t const** src_bufs, // copy, shifting offsets and validity bits as needed copy_buffer( - user_buffer + buf_info[buf_index].dst_offset, + index_to_buffer(buf_index) + buf_info[buf_index].dst_offset, src_bufs[src_buf_index], threadIdx.x, buf_info[buf_index].num_elements, @@ -776,7 +743,6 @@ std::tuple build_output_column_metadata( * copied buffer * @param out_begin Output iterator of column views * @param base_ptr Pointer to the base address of copied data for the working partition - * @param mb packed column metadata builder * * @returns new dst_buf_info iterator after processing this range of input columns */ @@ -790,9 +756,6 @@ BufInfo build_output_columns(InputIter begin, { auto current_info = info_begin; std::transform(begin, end, out_begin, [¤t_info, base_ptr, &mb](column_view const& src) { - size_type col_size, null_count; - int64_t bitmask_offset; - int64_t data_offset; auto [col_size, data_offset, bitmask_offset, null_count] = build_output_column_metadata(src, current_info, mb, false); @@ -990,10 +953,6 @@ struct out_to_in_index_function { } }; -}; // anonymous namespace - -namespace detail { - // packed block of memory 1: split indices and src_buf_info structs struct packed_split_indices_and_src_buf_info { explicit packed_split_indices_and_src_buf_info(cudf::table_view const& input, @@ -1114,7 +1073,6 @@ struct packed_src_and_dst_pointers { packed_src_and_dst_pointers(cudf::table_view const& input, std::size_t num_partitions, cudf::size_type num_src_bufs, - int num_iterations, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) : stream(stream) @@ -1159,28 +1117,61 @@ struct packed_src_and_dst_pointers { uint8_t** d_dst_bufs; }; +}; // anonymous namespace + +namespace detail { + +/** + * @brief Create an instance of `packed_src_and_dst_pointers` populating destination + * partitition buffers (if any) from `out_buffers`. In the chunked_pack case + * `out_buffers` is empty, and the destination pointer is provided separately + * to the `copy_partitions` kernel. 
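+ * In that mode the kernel resolves each batch's destination through the
+ * `index_to_buffer` callable introduced earlier in this patch.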
+ *
+ * @param input source table view
+ * @param num_partitions the number of partitions to create (1 meaning no splits)
+ * @param num_src_bufs number of buffers for the source columns including children
+ * @param out_buffers the destination buffers per partition in the non-chunked case
+ * @param stream Optional CUDA stream on which to execute kernels
+ * @param mr RMM memory resource
+ *
+ * @returns new unique pointer to packed_src_and_dst_pointers
+ */
 std::unique_ptr<packed_src_and_dst_pointers> setup_src_and_dst_pointers(
   cudf::table_view const& input,
   std::size_t num_partitions,
   cudf::size_type num_src_bufs,
-  int num_iterations,
   std::vector<rmm::device_buffer>& out_buffers,
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr)
 {
-  auto src_and_dst_pointers = std::make_unique<packed_src_and_dst_pointers>(
-    input, num_partitions, num_src_bufs, num_iterations, stream, mr);
+  auto src_and_dst_pointers =
+    std::make_unique<packed_src_and_dst_pointers>(input, num_partitions, num_src_bufs, stream, mr);
 
   std::transform(
     out_buffers.begin(), out_buffers.end(), src_and_dst_pointers->h_dst_bufs, [](auto& buf) {
       return static_cast<uint8_t*>(buf.data());
     });
 
+  // copy the struct to device memory to access from the kernel
   src_and_dst_pointers->copy_to_device();
 
   return src_and_dst_pointers;
 }
 
+/**
+ * @brief Create an instance of `packed_partition_buf_size_and_dst_buf_info` containing
+ * the partition-level dst_buf_info structs for each partition and column buffer.
+ *
+ * @param input source table view
+ * @param splits the numeric value (in rows) for each split, empty for 1 partition
+ * @param num_partitions the number of partitions to create (1 meaning no splits)
+ * @param num_src_bufs number of buffers for the source columns including children
+ * @param num_bufs num_src_bufs times the number of partitions
+ * @param stream Optional CUDA stream on which to execute kernels
+ * @param mr RMM memory resource
+ *
+ * @returns new unique pointer to `packed_partition_buf_size_and_dst_buf_info`
+ */
 std::unique_ptr<packed_partition_buf_size_and_dst_buf_info> compute_splits(
   cudf::table_view const& input,
   std::vector<size_type> const& splits,
@@ -1341,7 +1332,7 @@ struct chunk_iteration_state {
         std::size_t total_size)
     : num_iterations(_h_num_buffs_per_iteration.size()),
       current_iteration(0),
-      starting_buff(0),
+      starting_batch(0),
       d_batched_dst_buf_info(std::move(_d_batched_dst_buf_info)),
       d_batch_offsets(std::move(_d_batch_offsets)),
       h_num_buffs_per_iteration(std::move(_h_num_buffs_per_iteration)),
@@ -1350,12 +1341,18 @@ struct chunk_iteration_state {
   {
   }
 
+  /**
+   * @brief Return the starting 1MB batch index and the number of batches to copy
+   * for the current iteration, as of the time of the call.
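+   * Both values advance as `advance_iteration` is called.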
+ * + * @return the current iteration's starting_batch and batch count as a pair + */ std::pair get_current_starting_index_and_buff_count() const { CUDF_EXPECTS(current_iteration < num_iterations, "current_iteration cannot exceed num_iterations"); auto count_for_current = h_num_buffs_per_iteration[current_iteration]; - return std::make_pair(starting_buff, count_for_current); + return std::make_pair(starting_batch, count_for_current); } std::size_t advance_iteration() @@ -1363,7 +1360,7 @@ struct chunk_iteration_state { CUDF_EXPECTS(current_iteration < num_iterations, "current_iteration cannot exceed num_iterations"); std::size_t bytes_copied = h_size_of_buffs_per_iteration[current_iteration]; - starting_buff += h_num_buffs_per_iteration[current_iteration]; + starting_batch += h_num_buffs_per_iteration[current_iteration]; ++current_iteration; return bytes_copied; } @@ -1377,7 +1374,7 @@ struct chunk_iteration_state { int current_iteration; private: - std::size_t starting_buff; + std::size_t starting_batch; std::vector h_num_buffs_per_iteration; std::vector h_size_of_buffs_per_iteration; }; @@ -1466,6 +1463,14 @@ std::unique_ptr make_chunk_iteration_state( // underneath the final structure of the output }); + /** + * In the chunked case, this is the code that fixes up the offsets of each batch + * and prepares each iteration. Given the batches computed before, it figures + * out the number of batches that will fit in an iteration of `user_buffer_size`. + * + * Specifically, offsets for batches are reset to the 0th byte when a new iteration + * of `user_buffer_size` bytes is needed. + */ if (user_buffer_size != 0) { // copy the batch offsets back to host std::vector h_offsets(num_batches + 1); @@ -1582,6 +1587,25 @@ std::unique_ptr make_chunk_iteration_state( } } +// template +// struct index_to_buffer_func { +// uint8_t **d_dst_bufs; +// rmm::device_uvector&d_dst_buf_info; +// uint8_t *user_buffer; + +// typename std::enable_if +// __device__ operator()(unsigned int) const +// { +// return user_buffer; +// } + +// typename std::enable_if +// __device__ operator()(unsigned int) const +// { +// auto const dst_buf_index = dst_buf_info[buf_index].dst_buf_index; +// return d_dst_bufs[dst_buf_index]; +// } +//}; void copy_data(int num_batches_to_copy, int starting_batch, uint8_t const** d_src_bufs, @@ -1592,11 +1616,18 @@ void copy_data(int num_batches_to_copy, { constexpr size_type block_size = 256; if (user_buffer != nullptr) { + auto index_to_buffer = [user_buffer] __device__(unsigned int) { return user_buffer; }; copy_partitions<<>>( - d_src_bufs, user_buffer, d_dst_buf_info.data() + starting_batch); + index_to_buffer, d_src_bufs, d_dst_buf_info.data() + starting_batch); } else { + auto index_to_buffer = [d_dst_bufs, + dst_buf_info = d_dst_buf_info.data(), + user_buffer] __device__(unsigned int buf_index) { + auto const dst_buf_index = dst_buf_info[buf_index].dst_buf_index; + return d_dst_bufs[dst_buf_index]; + }; copy_partitions<<>>( - d_src_bufs, d_dst_bufs, d_dst_buf_info.data() + starting_batch); + index_to_buffer, d_src_bufs, d_dst_buf_info.data() + starting_batch); } } @@ -1648,8 +1679,6 @@ bool check_inputs(cudf::table_view const& input, std::vector const& s * None of the methods are thread safe. 
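 *
 * As an illustrative sketch (the public wrapper is `cudf::chunked_pack`, and
 * `user_span` here is a placeholder for the caller's device span): the chunked
 * path reduces to
 *   while (state.has_next()) { bytes += state.contiguous_split_chunk(user_span); }
 * whereas the single-pass path copies every batch in one shot and returns the
 * packed tables directly.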
*/ struct contiguous_split_state { - static const std::size_t desired_batch_size = 1 * 1024 * 1024; - contiguous_split_state(cudf::table_view const& input, std::size_t user_buffer_size, rmm::cuda_stream_view stream, @@ -1700,13 +1729,8 @@ struct contiguous_split_state { }); } - src_and_dst_pointers = std::move(setup_src_and_dst_pointers(input, - num_partitions, - num_src_bufs, - chunk_iter_state->num_iterations, - out_buffers, - stream, - mr)); + src_and_dst_pointers = std::move( + setup_src_and_dst_pointers(input, num_partitions, num_src_bufs, out_buffers, stream, mr)); } bool has_next() const { return !is_empty && chunk_iter_state->has_more_copies(); } @@ -1724,15 +1748,14 @@ struct contiguous_split_state { // them into much smaller batches in order to drive up the number of blocks and overall // occupancy. rmm::device_uvector> batches(num_bufs, stream, mr); - auto d_dst_buf_info = partition_buf_size_and_dst_buf_info->d_dst_buf_info; - auto h_buf_sizes = partition_buf_size_and_dst_buf_info->h_buf_sizes; - auto desired_batch_size = contiguous_split_state::desired_batch_size; + auto d_dst_buf_info = partition_buf_size_and_dst_buf_info->d_dst_buf_info; + auto h_buf_sizes = partition_buf_size_and_dst_buf_info->h_buf_sizes; thrust::transform( rmm::exec_policy(stream, mr), d_dst_buf_info, d_dst_buf_info + num_bufs, batches.begin(), - [desired_batch_size] __device__( + [desired_batch_size = desired_batch_size] __device__( dst_buf_info const& buf) -> thrust::pair { // Total bytes for this incoming partition std::size_t const bytes = @@ -1818,7 +1841,6 @@ struct contiguous_split_state { "Cannot use a device span smaller than the output buffer size configured at instantiation!"); CUDF_EXPECTS(has_next(), "Cannot call contiguous_split_chunk with has_next() == false!"); - std::size_t starting_batch, num_batches_to_copy; auto [starting_batch, num_batches_to_copy] = chunk_iter_state->get_current_starting_index_and_buff_count(); @@ -2024,7 +2046,7 @@ std::unique_ptr make_chunked_pack(cudf::table_view const& input, std::size_t user_buffer_size, rmm::mr::device_memory_resource* mr) { - CUDF_EXPECTS(user_buffer_size >= detail::contiguous_split_state::desired_batch_size, + CUDF_EXPECTS(user_buffer_size >= desired_batch_size, "The output buffer size must be at least 1MB in size"); return std::make_unique(input, user_buffer_size, cudf::get_default_stream(), mr); } From 3b83b03ac8483d3935849340603bb9226aa8ce5c Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Sun, 7 May 2023 16:03:12 -0500 Subject: [PATCH 06/31] Add more comments in the contiguous_split_state constructor --- cpp/src/copying/contiguous_split.cu | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index f75d1bf5297..d18117fba39 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -1712,9 +1712,13 @@ struct contiguous_split_state { // is the result. if (is_empty) { return; } + // First pass over the source tables to generate a `dst_buf_info` per split and column buffer + // (`num_bufs`). After this, contiguous_split uses `dst_buf_info` to further subdivide the work + // into 1MB batches in `compute_batches` partition_buf_size_and_dst_buf_info = std::move(compute_splits(input, splits, num_partitions, num_src_bufs, num_bufs, stream, mr)); + // Second pass: uses `dst_buf_info` to break down the work into 1MB batches. 
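+    // (1MB here is `desired_batch_size`, the GPU unit of work defined at the top
+    // of this file.)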
compute_batches(); // allocate output partition buffers, in the non-chunked case From 425a3132f465d0257440983e054ddfb048db9216 Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Mon, 8 May 2023 22:53:07 -0500 Subject: [PATCH 07/31] Use std::get<> and remove commented out code --- cpp/src/copying/contiguous_split.cu | 24 ++---------------------- 1 file changed, 2 insertions(+), 22 deletions(-) diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index d18117fba39..8e531a8de99 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -1587,25 +1587,6 @@ std::unique_ptr make_chunk_iteration_state( } } -// template -// struct index_to_buffer_func { -// uint8_t **d_dst_bufs; -// rmm::device_uvector&d_dst_buf_info; -// uint8_t *user_buffer; - -// typename std::enable_if -// __device__ operator()(unsigned int) const -// { -// return user_buffer; -// } - -// typename std::enable_if -// __device__ operator()(unsigned int) const -// { -// auto const dst_buf_index = dst_buf_info[buf_index].dst_buf_index; -// return d_dst_bufs[dst_buf_index]; -// } -//}; void copy_data(int num_batches_to_copy, int starting_batch, uint8_t const** d_src_bufs, @@ -1788,9 +1769,8 @@ struct contiguous_split_state { CUDF_EXPECTS(user_buffer_size == 0, "Cannot contiguous split with a user buffer"); if (is_empty || input.num_columns() == 0) { return make_packed_tables(); } - std::size_t num_batches_total; - std::tie(std::ignore, num_batches_total) = - chunk_iter_state->get_current_starting_index_and_buff_count(); + auto const num_batches_total = std::get<1>( + chunk_iter_state->get_current_starting_index_and_buff_count()); // perform the copy. copy_data(num_batches_total, From cc9acb413f3fc836282438dc47bc3e4644cf2e5a Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Tue, 9 May 2023 08:51:30 -0500 Subject: [PATCH 08/31] fix styles --- cpp/src/copying/contiguous_split.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 8e531a8de99..64da96e4ce4 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -1769,8 +1769,8 @@ struct contiguous_split_state { CUDF_EXPECTS(user_buffer_size == 0, "Cannot contiguous split with a user buffer"); if (is_empty || input.num_columns() == 0) { return make_packed_tables(); } - auto const num_batches_total = std::get<1>( - chunk_iter_state->get_current_starting_index_and_buff_count()); + auto const num_batches_total = + std::get<1>(chunk_iter_state->get_current_starting_index_and_buff_count()); // perform the copy. 
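     // In the non-chunked case there is exactly one iteration containing all of
     // the batches, so this single call copies the entire output.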
copy_data(num_batches_total, From 9b3a591d4938e9b8a0eb0826b2b177202c3ae05b Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Wed, 10 May 2023 14:57:45 -0500 Subject: [PATCH 09/31] chunked_pack::create and address other reviewer feedback --- cpp/benchmarks/copying/contiguous_split.cu | 2 +- cpp/include/cudf/contiguous_split.hpp | 37 +++++++++++----------- cpp/src/copying/contiguous_split.cu | 21 ++++++------ cpp/tests/copying/split_tests.cpp | 10 +++--- 4 files changed, 33 insertions(+), 37 deletions(-) diff --git a/cpp/benchmarks/copying/contiguous_split.cu b/cpp/benchmarks/copying/contiguous_split.cu index daa80cdd425..89fb4d57088 100644 --- a/cpp/benchmarks/copying/contiguous_split.cu +++ b/cpp/benchmarks/copying/contiguous_split.cu @@ -35,7 +35,7 @@ void chunked_pack(cudf::table_view const& src_table, std::vector(100L * 1024 * 1024, stream, mr); - auto chunked_pack = cudf::make_chunked_pack(src_table, user_buffer.size(), mr); + auto chunked_pack = cudf::chunked_pack::create(src_table, user_buffer.size(), mr); while (chunked_pack->has_next()) { auto iter_size = chunked_pack->next(user_buffer); } diff --git a/cpp/include/cudf/contiguous_split.hpp b/cpp/include/cudf/contiguous_split.hpp index fe6e7bc3e96..7296b6905a3 100644 --- a/cpp/include/cudf/contiguous_split.hpp +++ b/cpp/include/cudf/contiguous_split.hpp @@ -165,7 +165,7 @@ struct contiguous_split_state; * // * std::size_t user_buffer_size = 128*1024*1024; * - * auto chunked_packer = make_chunked_pack(tv, user_buffer_size, stream, mr); + * auto chunked_packer = cudf::chunked_pack::create(tv, user_buffer_size, stream, mr); * * std::size_t host_offset = 0; * auto host_buffer = ...; // obtain a host buffer you would like to copy to @@ -236,7 +236,6 @@ class chunked_pack { * the `user_buffer_size` parameter passed at construction * @return The number of bytes that were written to `user_buffer` (at most * `user_buffer_size`) - * */ [[nodiscard]] std::size_t next(cudf::device_span const& user_buffer); @@ -247,28 +246,28 @@ class chunked_pack { */ [[nodiscard]] std::unique_ptr> build_metadata() const; + /** + * @brief Creates a `chunked_pack` instance to perform a "pack" of the `table_view` + * "input", where a buffer of `user_buffer_size` is filled with chunks of the + * overall operation. This operation can be used in cases where GPU memory is constrained. + * + * @throws cudf::logic_error When user_buffer_size is less than 1MB + * + * @param input source `table_view` to pack + * @param user_buffer_size buffer size (in bytes) that will be passed on `next`. Must be + * at least 1MB + * @param mr RMM memory resource to be used for temporary and scratch allocations only + * @return a unique_ptr of chunked_pack + */ + [[nodiscard]] static std::unique_ptr create(cudf::table_view const& input, + std::size_t user_buffer_size, + rmm::mr::device_memory_resource* mr); + private: // internal state of contiguous split std::unique_ptr state; }; -/** - * @brief Created a `chunked_pack` instance to perform a "pack" of the `table_view` - * "input", where a buffer of `user_buffer_size` is filled with chunks of the - * overall operation. This operation can be used in cases where GPU memory is constrained. - * - * @throws cudf::logic_error When user_buffer_size is less than 1MB - * - * @param input source `table_view` to pack - * @param user_buffer_size buffer size (in bytes) that will be passed on `next`. 
Must be - * at least 1MB - * @param mr RMM memory resource to be used for temporary and scratch allocations only - * @return a unique_ptr of chunked_pack - */ -std::unique_ptr make_chunked_pack(cudf::table_view const& input, - std::size_t user_buffer_size, - rmm::mr::device_memory_resource* mr); - /** * @brief Deep-copy a `table_view` into a serialized contiguous memory format. * diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 64da96e4ce4..eb08cbb867f 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -759,7 +759,7 @@ BufInfo build_output_columns(InputIter begin, auto [col_size, data_offset, bitmask_offset, null_count] = build_output_column_metadata(src, current_info, mb, false); - auto bitmask_ptr = + auto const bitmask_ptr = base_ptr != nullptr && bitmask_offset != -1 ? reinterpret_cast(base_ptr + static_cast(bitmask_offset)) : nullptr; @@ -927,14 +927,11 @@ struct batch_byte_size_function { dst_buf_info const* infos; __device__ std::size_t operator()(size_type i) const { - if (i == num_batches) { - return 0; - } else { - auto& buf = *(infos + i); - std::size_t const bytes = - static_cast(buf.num_elements) * static_cast(buf.element_size); - return util::round_up_unsafe(bytes, split_align); - } + if (i == num_batches) { return 0; } + auto& buf = *(infos + i); + std::size_t const bytes = + static_cast(buf.num_elements) * static_cast(buf.element_size); + return util::round_up_unsafe(bytes, split_align); } }; @@ -2026,9 +2023,9 @@ std::unique_ptr> chunked_pack::build_metadata() const return state->build_packed_column_metadata(); } -std::unique_ptr make_chunked_pack(cudf::table_view const& input, - std::size_t user_buffer_size, - rmm::mr::device_memory_resource* mr) +std::unique_ptr chunked_pack::create(cudf::table_view const& input, + std::size_t user_buffer_size, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(user_buffer_size >= desired_batch_size, "The output buffer size must be at least 1MB in size"); diff --git a/cpp/tests/copying/split_tests.cpp b/cpp/tests/copying/split_tests.cpp index 51c42bd7cf0..73da4510dfc 100644 --- a/cpp/tests/copying/split_tests.cpp +++ b/cpp/tests/copying/split_tests.cpp @@ -1322,7 +1322,7 @@ std::vector do_chunked_pack(cudf::table_view const& input) auto bounce_buff_span = cudf::device_span(static_cast(bounce_buff.data()), bounce_buff.size()); - auto chunked_pack = cudf::make_chunked_pack(input, bounce_buff_span.size(), mr); + auto chunked_pack = cudf::chunked_pack::create(input, bounce_buff_span.size(), mr); // right size the final buffer rmm::device_buffer final_buff( @@ -2309,12 +2309,12 @@ TEST_F(ContiguousSplitTableCornerCases, OutBufferToSmall) { // internally, contiguous split chunks GPU work in 1MB contiguous copies // so the output buffer must be 1MB or larger. 
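   // (the 1 * 1024 below is 1KB, deliberately under the 1MB minimum)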
- EXPECT_THROW(cudf::make_chunked_pack({}, 1 * 1024, mr()), cudf::logic_error); + EXPECT_THROW(cudf::chunked_pack::create({}, 1 * 1024, mr()), cudf::logic_error); } TEST_F(ContiguousSplitTableCornerCases, ChunkSpanTooSmall) { - auto chunked_pack = cudf::make_chunked_pack({}, 1 * 1024 * 1024, mr()); + auto chunked_pack = cudf::chunked_pack::create({}, 1 * 1024 * 1024, mr()); rmm::device_buffer buff(1 * 1024, cudf::get_default_stream(), mr()); cudf::device_span too_small(static_cast(buff.data()), buff.size()); std::size_t copied = 0; @@ -2326,7 +2326,7 @@ TEST_F(ContiguousSplitTableCornerCases, ChunkSpanTooSmall) TEST_F(ContiguousSplitTableCornerCases, EmptyTableHasNextFalse) { - auto chunked_pack = cudf::make_chunked_pack({}, 1 * 1024 * 1024, mr()); + auto chunked_pack = cudf::chunked_pack::create({}, 1 * 1024 * 1024, mr()); rmm::device_buffer buff(1 * 1024 * 1024, cudf::get_default_stream(), mr()); cudf::device_span bounce_buff(static_cast(buff.data()), buff.size()); EXPECT_EQ(chunked_pack->has_next(), false); // empty input table @@ -2341,7 +2341,7 @@ TEST_F(ContiguousSplitTableCornerCases, ExhaustedHasNextFalse) cudf::table_view t({a}); rmm::device_buffer buff(1 * 1024 * 1024, cudf::get_default_stream(), mr()); cudf::device_span bounce_buff(static_cast(buff.data()), buff.size()); - auto chunked_pack = cudf::make_chunked_pack(t, buff.size(), mr()); + auto chunked_pack = cudf::chunked_pack::create(t, buff.size(), mr()); EXPECT_EQ(chunked_pack->has_next(), true); std::size_t copied = chunked_pack->next(bounce_buff); EXPECT_EQ(copied, chunked_pack->get_total_contiguous_size()); From 8f9dbbd9accf489eb29ec8b1cd8c60419b2104eb Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Thu, 11 May 2023 20:54:53 -0500 Subject: [PATCH 10/31] Update due to code review comments --- cpp/src/copying/contiguous_split.cu | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index eb08cbb867f..8f9f17e1755 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -42,7 +42,6 @@ #include #include #include -#include #include #include @@ -723,7 +722,7 @@ std::tuple build_output_column_metadata( src.type(), col_size, null_count, data_offset, bitmask_offset, src.num_children()); ++current_info; - return std::make_tuple(col_size, data_offset, bitmask_offset, null_count); + return {col_size, data_offset, bitmask_offset, null_count}; } /** From e2ca459faa94f9564cbf325e425a4e7708e1f35f Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Fri, 12 May 2023 13:55:45 -0500 Subject: [PATCH 11/31] Const fixes as suggested in review --- cpp/src/copying/contiguous_split.cu | 89 ++++++++++++++--------------- 1 file changed, 44 insertions(+), 45 deletions(-) diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 8f9f17e1755..55aee27d6db 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -833,8 +833,8 @@ struct buf_size_functor { * The key is simply the partition index. */ struct split_key_functor { - int num_src_bufs; - int operator() __device__(int buf_index) { return buf_index / num_src_bufs; } + int const num_src_bufs; + int operator() __device__(int buf_index) const { return buf_index / num_src_bufs; } }; /** @@ -914,7 +914,7 @@ struct size_of_helper { * structs) return 0. 
*/ struct num_batches_func { - thrust::pair const* batches; + thrust::pair const* const batches; __device__ std::size_t operator()(size_type i) const { return thrust::get<0>(batches[i]); } }; @@ -922,12 +922,12 @@ struct num_batches_func { * @brief Get the size in bytes of a batch described by `dst_buf_info`. */ struct batch_byte_size_function { - size_type num_batches; - dst_buf_info const* infos; + size_type const num_batches; + dst_buf_info const* const infos; __device__ std::size_t operator()(size_type i) const { if (i == num_batches) { return 0; } - auto& buf = *(infos + i); + auto const& buf = *(infos + i); std::size_t const bytes = static_cast(buf.num_elements) * static_cast(buf.element_size); return util::round_up_unsafe(bytes, split_align); @@ -938,8 +938,8 @@ struct batch_byte_size_function { * @brief Get the input buffer index given the output buffer index. */ struct out_to_in_index_function { - offset_type const* batch_offsets; - int num_bufs; + offset_type const* const batch_offsets; + int const num_bufs; __device__ int operator()(size_type i) const { return static_cast( @@ -1018,10 +1018,10 @@ struct packed_partition_buf_size_and_dst_buf_info { std::size_t num_bufs, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) - : stream(stream) + : stream(stream), + buf_sizes_size{cudf::util::round_up_safe(num_partitions * sizeof(std::size_t), split_align)}, + dst_buf_info_size{cudf::util::round_up_safe(num_bufs * sizeof(dst_buf_info), split_align)} { - buf_sizes_size = cudf::util::round_up_safe(num_partitions * sizeof(std::size_t), split_align); - dst_buf_info_size = cudf::util::round_up_safe(num_bufs * sizeof(dst_buf_info), split_align); // host-side h_buf_sizes_and_dst_info = std::vector(buf_sizes_size + dst_buf_info_size); h_buf_sizes = reinterpret_cast(h_buf_sizes_and_dst_info.data()); @@ -1047,11 +1047,11 @@ struct packed_partition_buf_size_and_dst_buf_info { stream.value())); } - rmm::cuda_stream_view stream; + rmm::cuda_stream_view const stream; // buffer sizes and destination info (used in batched copies) - std::size_t buf_sizes_size; - std::size_t dst_buf_info_size; + std::size_t const buf_sizes_size; + std::size_t const dst_buf_info_size; std::vector h_buf_sizes_and_dst_info; std::size_t* h_buf_sizes; @@ -1071,12 +1071,10 @@ struct packed_src_and_dst_pointers { cudf::size_type num_src_bufs, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) - : stream(stream) + : stream(stream), + src_bufs_size{cudf::util::round_up_safe(num_src_bufs * sizeof(uint8_t*), split_align)}, + dst_bufs_size{cudf::util::round_up_safe(num_partitions * sizeof(uint8_t*), split_align)} { - src_bufs_size = cudf::util::round_up_safe(num_src_bufs * sizeof(uint8_t*), split_align); - - dst_bufs_size = cudf::util::round_up_safe(num_partitions * sizeof(uint8_t*), split_align); - // host-side h_src_and_dst_buffers = std::vector(src_bufs_size + dst_bufs_size); h_src_bufs = reinterpret_cast(h_src_and_dst_buffers.data()); @@ -1101,12 +1099,12 @@ struct packed_src_and_dst_pointers { stream.value())); } - const rmm::cuda_stream_view stream; + rmm::cuda_stream_view const stream; + std::size_t const src_bufs_size; + std::size_t const dst_bufs_size; std::vector h_src_and_dst_buffers; rmm::device_buffer d_src_and_dst_buffers; - std::size_t src_bufs_size; - std::size_t dst_bufs_size; const uint8_t** h_src_bufs; const uint8_t** d_src_bufs; uint8_t** h_dst_bufs; @@ -1181,17 +1179,18 @@ std::unique_ptr compute_splits( std::make_unique( input, splits, num_partitions, num_src_bufs, num_bufs, 
stream, mr); - auto d_dst_buf_info = partition_buf_size_and_dst_buf_info->d_dst_buf_info; - auto h_buf_sizes = partition_buf_size_and_dst_buf_info->h_buf_sizes; - auto d_buf_sizes = partition_buf_size_and_dst_buf_info->d_buf_sizes; + auto const d_dst_buf_info = partition_buf_size_and_dst_buf_info->d_dst_buf_info; + auto const h_buf_sizes = partition_buf_size_and_dst_buf_info->h_buf_sizes; + auto const d_buf_sizes = partition_buf_size_and_dst_buf_info->d_buf_sizes; - auto split_indices_and_src_buf_info = + auto const split_indices_and_src_buf_info = packed_split_indices_and_src_buf_info(input, splits, num_partitions, num_src_bufs, stream, mr); - auto const d_src_buf_info = split_indices_and_src_buf_info.d_src_buf_info; - auto offset_stack_partition_size = split_indices_and_src_buf_info.offset_stack_partition_size; - auto d_offset_stack = split_indices_and_src_buf_info.d_offset_stack; - auto d_indices = split_indices_and_src_buf_info.d_indices; + auto const d_src_buf_info = split_indices_and_src_buf_info.d_src_buf_info; + auto const offset_stack_partition_size = + split_indices_and_src_buf_info.offset_stack_partition_size; + auto const d_offset_stack = split_indices_and_src_buf_info.d_offset_stack; + auto const d_indices = split_indices_and_src_buf_info.d_indices; // compute sizes of each column in each partition, including alignment. thrust::transform( @@ -1275,7 +1274,7 @@ std::unique_ptr compute_splits( // compute total size of each partition // key is the split index { - auto keys = cudf::detail::make_counting_transform_iterator( + auto const keys = cudf::detail::make_counting_transform_iterator( 0, split_key_functor{static_cast(num_src_bufs)}); auto values = cudf::detail::make_counting_transform_iterator(0, buf_size_functor{d_dst_buf_info}); @@ -1290,7 +1289,7 @@ std::unique_ptr compute_splits( // compute start offset for each output buffer for each split { - auto keys = cudf::detail::make_counting_transform_iterator( + auto const keys = cudf::detail::make_counting_transform_iterator( 0, split_key_functor{static_cast(num_src_bufs)}); auto values = cudf::detail::make_counting_transform_iterator(0, buf_size_functor{d_dst_buf_info}); @@ -1323,8 +1322,8 @@ std::unique_ptr compute_splits( struct chunk_iteration_state { chunk_iteration_state(rmm::device_uvector _d_batched_dst_buf_info, rmm::device_uvector _d_batch_offsets, - std::vector _h_num_buffs_per_iteration, - std::vector _h_size_of_buffs_per_iteration, + std::vector&& _h_num_buffs_per_iteration, + std::vector&& _h_size_of_buffs_per_iteration, std::size_t total_size) : num_iterations(_h_num_buffs_per_iteration.size()), current_iteration(0), @@ -1364,15 +1363,15 @@ struct chunk_iteration_state { bool has_more_copies() const { return current_iteration < num_iterations; } rmm::device_uvector d_batched_dst_buf_info; - rmm::device_uvector d_batch_offsets; - std::size_t total_size; - int num_iterations; + rmm::device_uvector const d_batch_offsets; + std::size_t const total_size; + int const num_iterations; int current_iteration; private: std::size_t starting_batch; - std::vector h_num_buffs_per_iteration; - std::vector h_size_of_buffs_per_iteration; + std::vector const h_num_buffs_per_iteration; + std::vector const h_size_of_buffs_per_iteration; }; std::unique_ptr make_chunk_iteration_state( @@ -1387,7 +1386,7 @@ std::unique_ptr make_chunk_iteration_state( { rmm::device_uvector d_batch_offsets(num_bufs + 1, stream, mr); - auto buf_count_iter = cudf::detail::make_counting_transform_iterator( + auto const buf_count_iter = 
cudf::detail::make_counting_transform_iterator( 0, [num_bufs, num_batches = num_batches_func{batches.begin()}] __device__(size_type i) { return i == num_bufs ? 0 : num_batches(i); }); @@ -1405,7 +1404,7 @@ std::unique_ptr make_chunk_iteration_state( auto out_to_in_index = out_to_in_index_function{d_batch_offsets.begin(), num_bufs}; - auto iter = thrust::make_counting_iterator(0); + auto const iter = thrust::make_counting_iterator(0); // load up the batches as d_dst_buf_info rmm::device_uvector d_batched_dst_buf_info(num_batches, stream, mr); @@ -1472,7 +1471,7 @@ std::unique_ptr make_chunk_iteration_state( std::vector h_offsets(num_batches + 1); { rmm::device_uvector offsets(h_offsets.size(), stream, mr); - auto batch_byte_size_iter = cudf::detail::make_counting_transform_iterator( + auto const batch_byte_size_iter = cudf::detail::make_counting_transform_iterator( 0, batch_byte_size_function{num_batches, d_batched_dst_buf_info.begin()}); thrust::exclusive_scan(rmm::exec_policy(stream, mr), @@ -1544,7 +1543,7 @@ std::unique_ptr make_chunk_iteration_state( // we want to update the offset of batches for every iteration, except the first one (because // offsets in the first iteration are all 0 based) auto num_batches_in_first_iteration = num_batches_per_iteration[0]; - auto iter = thrust::make_counting_iterator(num_batches_in_first_iteration); + auto const iter = thrust::make_counting_iterator(num_batches_in_first_iteration); auto num_iterations = accum_size_per_iteration.size(); thrust::for_each( rmm::exec_policy(stream, mr), @@ -1784,7 +1783,7 @@ struct contiguous_split_state { // postprocess valid_counts: apply the valid counts computed by copy_data for each // batch back to the original dst_buf_infos - auto keys = cudf::detail::make_counting_transform_iterator( + auto const keys = cudf::detail::make_counting_transform_iterator( 0, out_to_in_index_function{chunk_iter_state->d_batch_offsets.begin(), (int)num_bufs}); auto values = thrust::make_transform_iterator( @@ -1858,7 +1857,7 @@ struct contiguous_split_state { // build the empty results std::vector result; result.reserve(num_partitions); - auto iter = thrust::make_counting_iterator(0); + auto const iter = thrust::make_counting_iterator(0); std::transform(iter, iter + num_partitions, std::back_inserter(result), From 394ab62280d2a47bbd6acef4f1f2c7fb8b4a6ad3 Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Fri, 12 May 2023 17:19:38 -0500 Subject: [PATCH 12/31] Const changes --- cpp/src/copying/contiguous_split.cu | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 55aee27d6db..807e1b14114 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -957,10 +957,9 @@ struct packed_split_indices_and_src_buf_info { cudf::size_type num_src_bufs, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) + : indices_size(cudf::util::round_up_safe((num_partitions + 1) * sizeof(size_type), split_align)), + src_buf_info_size(cudf::util::round_up_safe(num_src_bufs * sizeof(src_buf_info), split_align)) { - indices_size = cudf::util::round_up_safe((num_partitions + 1) * sizeof(size_type), split_align); - src_buf_info_size = cudf::util::round_up_safe(num_src_bufs * sizeof(src_buf_info), split_align); - // host-side h_indices_and_source_info = std::vector(indices_size + src_buf_info_size); h_indices = reinterpret_cast(h_indices_and_source_info.data()); @@ -993,8 +992,8 @@ struct 
packed_split_indices_and_src_buf_info { d_indices, h_indices, indices_size + src_buf_info_size, cudaMemcpyDefault, stream.value())); } - size_type indices_size; - std::size_t src_buf_info_size; + size_type const indices_size; + std::size_t const src_buf_info_size; std::size_t offset_stack_size; std::vector h_indices_and_source_info; @@ -1105,8 +1104,8 @@ struct packed_src_and_dst_pointers { std::vector h_src_and_dst_buffers; rmm::device_buffer d_src_and_dst_buffers; - const uint8_t** h_src_bufs; - const uint8_t** d_src_bufs; + uint8_t const** h_src_bufs; + uint8_t const** d_src_bufs; uint8_t** h_dst_bufs; uint8_t** d_dst_bufs; }; From c4864f1ecf728c542559fbae9dc1e88fb431a380 Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Fri, 12 May 2023 17:25:35 -0500 Subject: [PATCH 13/31] Style fixes --- cpp/src/copying/contiguous_split.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 807e1b14114..4cc30cc6529 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -957,7 +957,8 @@ struct packed_split_indices_and_src_buf_info { cudf::size_type num_src_bufs, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) - : indices_size(cudf::util::round_up_safe((num_partitions + 1) * sizeof(size_type), split_align)), + : indices_size( + cudf::util::round_up_safe((num_partitions + 1) * sizeof(size_type), split_align)), src_buf_info_size(cudf::util::round_up_safe(num_src_bufs * sizeof(src_buf_info), split_align)) { // host-side From be5fb0194eb7f1a31fd970c0ccaa61a6d1df1a4f Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Mon, 15 May 2023 19:23:20 -0500 Subject: [PATCH 14/31] Remove header that is not neededfrom split_tests.cpp --- cpp/tests/copying/split_tests.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/tests/copying/split_tests.cpp b/cpp/tests/copying/split_tests.cpp index 73da4510dfc..f6e492fa3e2 100644 --- a/cpp/tests/copying/split_tests.cpp +++ b/cpp/tests/copying/split_tests.cpp @@ -31,7 +31,6 @@ #include #include -#include #include #include From e0686ff15a7b56567872a24c6645f09e228edeb3 Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Mon, 15 May 2023 19:24:33 -0500 Subject: [PATCH 15/31] Changed to contiguous_split.hpp/.cu per review --- cpp/include/cudf/contiguous_split.hpp | 19 +++++++++---------- cpp/src/copying/contiguous_split.cu | 17 +++++++++-------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/cpp/include/cudf/contiguous_split.hpp b/cpp/include/cudf/contiguous_split.hpp index 7296b6905a3..2756a8963cc 100644 --- a/cpp/include/cudf/contiguous_split.hpp +++ b/cpp/include/cudf/contiguous_split.hpp @@ -144,16 +144,15 @@ struct contiguous_split_state; * The caller has two methods it can use to carry out the chunked_pack: has_next and next. * Here is an example: * + * @code{.pseudo} * // Create a table_view * cudf::table_view tv = ...; * - * // Choose a memory resource. This memory resource is used for scratch/thrust temporary + * // Choose a memory resource (optional). This memory resource is used for scratch/thrust temporary * // data. In memory constrained cases, this can be used to set aside scratch memory * // for `chunked_pack` at the beginning of a program. * auto mr = rmm::mr::get_current_device_resource(); * - * auto stream = cudf::get_default_stream(); - * * // Define a buffer size for each chunk: the larger the buffer is, the more SMs can be * // occupied by this algorithm. 
* // @@ -165,7 +164,7 @@ struct contiguous_split_state; * // * std::size_t user_buffer_size = 128*1024*1024; * - * auto chunked_packer = cudf::chunked_pack::create(tv, user_buffer_size, stream, mr); + * auto chunked_packer = cudf::chunked_pack::create(tv, user_buffer_size, mr); * * std::size_t host_offset = 0; * auto host_buffer = ...; // obtain a host buffer you would like to copy to @@ -187,6 +186,7 @@ struct contiguous_split_state; * * host_offset += bytes_copied; * } + * @endcode */ class chunked_pack { public: @@ -196,13 +196,12 @@ class chunked_pack { * @param input source `table_view` to pack * @param user_buffer_size buffer size (in bytes) that will be passed on `next`. Must be * at least 1MB - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr RMM memory resource to be used for temporary and scratch allocations only + * @param mr An optional memory resource to be used for temporary and scratch allocations only */ - explicit chunked_pack(cudf::table_view const& input, - std::size_t user_buffer_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); + explicit chunked_pack( + cudf::table_view const& input, + std::size_t user_buffer_size, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Destructor that will be implemented as default, required because diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index cfdef822303..1e366a800eb 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -1111,10 +1111,6 @@ struct packed_src_and_dst_pointers { uint8_t** d_dst_bufs; }; -}; // anonymous namespace - -namespace detail { - /** * @brief Create an instance of `packed_src_and_dst_pointers` populating destination * partitition buffers (if any) from `out_buffers`. In the chunked_pack case @@ -1308,6 +1304,7 @@ std::unique_ptr compute_splits( return partition_buf_size_and_dst_buf_info; } + /** * @brief Struct containing information about the actual batches we will send to the * `copy_partitions` kernel and the number of iterations we need to carry out this copy. @@ -1637,6 +1634,10 @@ bool check_inputs(cudf::table_view const& input, std::vector const& s return input.column(0).size() == 0; } +}; // anonymous namespace + +namespace detail { + /** * @brief A helper struct containing the state of contiguous_split, whether the caller * is using the single-pass contiguous_split or chunked_pack. @@ -1887,7 +1888,7 @@ struct contiguous_split_state { auto& h_dst_buf_info = partition_buf_size_and_dst_buf_info->h_dst_buf_info; auto cur_dst_buf_info = h_dst_buf_info; - metadata_builder mb{input.num_columns()}; + detail::metadata_builder mb{input.num_columns()}; populate_metadata(input.begin(), input.end(), cur_dst_buf_info, mb); @@ -1907,7 +1908,7 @@ struct contiguous_split_state { auto& h_dst_bufs = src_and_dst_pointers->h_dst_bufs; auto cur_dst_buf_info = h_dst_buf_info; - metadata_builder mb(input.num_columns()); + detail::metadata_builder mb(input.num_columns()); for (std::size_t idx = 0; idx < num_partitions; idx++) { // traverse the buffers and build the columns. 
@@ -1996,10 +1997,10 @@ std::vector contiguous_split(cudf::table_view const& input, chunked_pack::chunked_pack(cudf::table_view const& input, std::size_t user_buffer_size, - rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - state = std::make_unique(input, user_buffer_size, stream, mr); + state = std::make_unique( + input, user_buffer_size, cudf::get_default_stream(), mr); } // required for the unique_ptr to work with a non-complete type (contiguous_split_state) From 2a822e6c29381b6e90d2316602205df3509deb88 Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Mon, 15 May 2023 19:43:46 -0500 Subject: [PATCH 16/31] Remove stream from API --- cpp/src/copying/contiguous_split.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 1e366a800eb..eb13900f6d5 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -2029,7 +2029,7 @@ std::unique_ptr chunked_pack::create(cudf::table_view const& input { CUDF_EXPECTS(user_buffer_size >= desired_batch_size, "The output buffer size must be at least 1MB in size"); - return std::make_unique(input, user_buffer_size, cudf::get_default_stream(), mr); + return std::make_unique(input, user_buffer_size, mr); } }; // namespace cudf From 2ebb68b3f8a04e6398493ea765f382c149a24edb Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Tue, 16 May 2023 09:35:27 -0500 Subject: [PATCH 17/31] Update return documentation Co-authored-by: David Wendt <45795991+davidwendt@users.noreply.github.com> --- cpp/src/copying/contiguous_split.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index eb13900f6d5..a74ebcc07b1 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -679,7 +679,7 @@ std::pair setup_source_buf_info(InputIter begin, * count information. 
The null count should be taken * from `src` because this case is restricted to a single partition * (no splits) - * @returns a std::tuple containing: + * @returns a std::tuple containing: * column size, data offset, bitmask offset, and null count */ template From 0e10fdc901594154b47da2f87e453b1095f1427c Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Tue, 16 May 2023 13:16:44 -0500 Subject: [PATCH 18/31] Initializer list + const fixes --- cpp/src/copying/contiguous_split.cu | 83 ++++++++++++++--------------- 1 file changed, 40 insertions(+), 43 deletions(-) diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index a74ebcc07b1..053a2cc3e05 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -959,14 +959,14 @@ struct packed_split_indices_and_src_buf_info { rmm::mr::device_memory_resource* mr) : indices_size( cudf::util::round_up_safe((num_partitions + 1) * sizeof(size_type), split_align)), - src_buf_info_size(cudf::util::round_up_safe(num_src_bufs * sizeof(src_buf_info), split_align)) + src_buf_info_size( + cudf::util::round_up_safe(num_src_bufs * sizeof(src_buf_info), split_align)), + // host-side + h_indices_and_source_info(indices_size + src_buf_info_size), + h_indices{reinterpret_cast(h_indices_and_source_info.data())}, + h_src_buf_info{ + reinterpret_cast(h_indices_and_source_info.data() + indices_size)} { - // host-side - h_indices_and_source_info = std::vector(indices_size + src_buf_info_size); - h_indices = reinterpret_cast(h_indices_and_source_info.data()); - h_src_buf_info = - reinterpret_cast(h_indices_and_source_info.data() + indices_size); - // compute splits -> indices. // these are row numbers per split h_indices[0] = 0; @@ -1000,8 +1000,8 @@ struct packed_split_indices_and_src_buf_info { std::vector h_indices_and_source_info; rmm::device_buffer d_indices_and_source_info; - size_type* h_indices; - src_buf_info* h_src_buf_info; + size_type* const h_indices; + src_buf_info* const h_src_buf_info; int offset_stack_partition_size; size_type* d_indices; @@ -1020,21 +1020,19 @@ struct packed_partition_buf_size_and_dst_buf_info { rmm::mr::device_memory_resource* mr) : stream(stream), buf_sizes_size{cudf::util::round_up_safe(num_partitions * sizeof(std::size_t), split_align)}, - dst_buf_info_size{cudf::util::round_up_safe(num_bufs * sizeof(dst_buf_info), split_align)} + dst_buf_info_size{cudf::util::round_up_safe(num_bufs * sizeof(dst_buf_info), split_align)}, + // host-side + h_buf_sizes_and_dst_info(buf_sizes_size + dst_buf_info_size), + h_buf_sizes{reinterpret_cast(h_buf_sizes_and_dst_info.data())}, + h_dst_buf_info{ + reinterpret_cast(h_buf_sizes_and_dst_info.data() + buf_sizes_size)}, + // device-side + d_buf_sizes_and_dst_info(buf_sizes_size + dst_buf_info_size, stream, mr), + d_buf_sizes{reinterpret_cast(d_buf_sizes_and_dst_info.data())}, + //// destination buffer info + d_dst_buf_info{reinterpret_cast( + static_cast(d_buf_sizes_and_dst_info.data()) + buf_sizes_size)} { - // host-side - h_buf_sizes_and_dst_info = std::vector(buf_sizes_size + dst_buf_info_size); - h_buf_sizes = reinterpret_cast(h_buf_sizes_and_dst_info.data()); - h_dst_buf_info = - reinterpret_cast(h_buf_sizes_and_dst_info.data() + buf_sizes_size); - - // device-side - d_buf_sizes_and_dst_info = rmm::device_buffer(buf_sizes_size + dst_buf_info_size, stream, mr); - d_buf_sizes = reinterpret_cast(d_buf_sizes_and_dst_info.data()); - - //// destination buffer info - d_dst_buf_info = reinterpret_cast( - 
static_cast(d_buf_sizes_and_dst_info.data()) + buf_sizes_size); } void copy_to_host() @@ -1054,12 +1052,12 @@ struct packed_partition_buf_size_and_dst_buf_info { std::size_t const dst_buf_info_size; std::vector h_buf_sizes_and_dst_info; - std::size_t* h_buf_sizes; - dst_buf_info* h_dst_buf_info; + std::size_t* const h_buf_sizes; + dst_buf_info* const h_dst_buf_info; rmm::device_buffer d_buf_sizes_and_dst_info; - std::size_t* d_buf_sizes; - dst_buf_info* d_dst_buf_info; + std::size_t* const d_buf_sizes; + dst_buf_info* const d_dst_buf_info; }; // Packed block of memory 3: @@ -1073,19 +1071,17 @@ struct packed_src_and_dst_pointers { rmm::mr::device_memory_resource* mr) : stream(stream), src_bufs_size{cudf::util::round_up_safe(num_src_bufs * sizeof(uint8_t*), split_align)}, - dst_bufs_size{cudf::util::round_up_safe(num_partitions * sizeof(uint8_t*), split_align)} + dst_bufs_size{cudf::util::round_up_safe(num_partitions * sizeof(uint8_t*), split_align)}, + // host-side + h_src_and_dst_buffers(src_bufs_size + dst_bufs_size), + h_src_bufs{reinterpret_cast(h_src_and_dst_buffers.data())}, + h_dst_bufs{reinterpret_cast(h_src_and_dst_buffers.data() + src_bufs_size)}, + // device-side + d_src_and_dst_buffers{rmm::device_buffer(src_bufs_size + dst_bufs_size, stream, mr)}, + d_src_bufs{reinterpret_cast(d_src_and_dst_buffers.data())}, + d_dst_bufs{reinterpret_cast( + reinterpret_cast(d_src_and_dst_buffers.data()) + src_bufs_size)} { - // host-side - h_src_and_dst_buffers = std::vector(src_bufs_size + dst_bufs_size); - h_src_bufs = reinterpret_cast(h_src_and_dst_buffers.data()); - h_dst_bufs = reinterpret_cast(h_src_and_dst_buffers.data() + src_bufs_size); - - // device-side - d_src_and_dst_buffers = rmm::device_buffer(src_bufs_size + dst_bufs_size, stream, mr); - d_src_bufs = reinterpret_cast(d_src_and_dst_buffers.data()); - d_dst_bufs = reinterpret_cast( - reinterpret_cast(d_src_and_dst_buffers.data()) + src_bufs_size); - // setup src buffers setup_src_buf_data(input.begin(), input.end(), h_src_bufs); } @@ -1104,11 +1100,12 @@ struct packed_src_and_dst_pointers { std::size_t const dst_bufs_size; std::vector h_src_and_dst_buffers; + uint8_t const** const h_src_bufs; + uint8_t** const h_dst_bufs; + rmm::device_buffer d_src_and_dst_buffers; - uint8_t const** h_src_bufs; - uint8_t const** d_src_bufs; - uint8_t** h_dst_bufs; - uint8_t** d_dst_bufs; + uint8_t const** const d_src_bufs; + uint8_t** const d_dst_bufs; }; /** From 8c344a4beb320d37ad1c69cd9af9beb9f699d404 Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Tue, 16 May 2023 13:41:20 -0500 Subject: [PATCH 19/31] Rely on free standing column_views for edge case + remove extra slash in comment --- cpp/src/copying/contiguous_split.cu | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 053a2cc3e05..ce7b7bf0ad1 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -1029,7 +1029,7 @@ struct packed_partition_buf_size_and_dst_buf_info { // device-side d_buf_sizes_and_dst_info(buf_sizes_size + dst_buf_info_size, stream, mr), d_buf_sizes{reinterpret_cast(d_buf_sizes_and_dst_info.data())}, - //// destination buffer info + // destination buffer info d_dst_buf_info{reinterpret_cast( static_cast(d_buf_sizes_and_dst_info.data()) + buf_sizes_size)} { @@ -1838,18 +1838,13 @@ struct contiguous_split_state { std::vector make_empty_packed_table() { // sanitize the inputs (to handle corner cases like sliced tables) - 
std::vector> empty_columns; - empty_columns.reserve(input.num_columns()); - std::transform( - input.begin(), input.end(), std::back_inserter(empty_columns), [](column_view const& col) { - return cudf::empty_like(col); - }); std::vector empty_column_views; empty_column_views.reserve(input.num_columns()); - std::transform(empty_columns.begin(), - empty_columns.end(), + std::transform(input.begin(), + input.end(), std::back_inserter(empty_column_views), - [](std::unique_ptr const& col) { return col->view(); }); + [](column_view const& col) { return cudf::empty_like(col)->view(); }); + table_view empty_inputs(empty_column_views); // build the empty results From 7a73d57ad4330a052a21ae9bb3d5cccba4f569bb Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Tue, 16 May 2023 23:03:35 -0500 Subject: [PATCH 20/31] Remove extra header in split_tests.cpp --- cpp/tests/copying/split_tests.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/tests/copying/split_tests.cpp b/cpp/tests/copying/split_tests.cpp index f6e492fa3e2..c9a53d6ebe0 100644 --- a/cpp/tests/copying/split_tests.cpp +++ b/cpp/tests/copying/split_tests.cpp @@ -28,7 +28,6 @@ #include #include #include -#include #include From dae5ceb1cf8c8345139ab3c62317da4dfd2706f5 Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Wed, 17 May 2023 08:26:03 -0500 Subject: [PATCH 21/31] Introduce temp_mr. Comment edits and some more initializer list and constness issues --- cpp/include/cudf/contiguous_split.hpp | 24 ++-- cpp/src/copying/contiguous_split.cu | 191 ++++++++++++++++---------- 2 files changed, 129 insertions(+), 86 deletions(-) diff --git a/cpp/include/cudf/contiguous_split.hpp b/cpp/include/cudf/contiguous_split.hpp index 2756a8963cc..ce995a5c32a 100644 --- a/cpp/include/cudf/contiguous_split.hpp +++ b/cpp/include/cudf/contiguous_split.hpp @@ -141,8 +141,8 @@ struct contiguous_split_state; * all thrust and scratch memory allocations are using the passed-in memory resource exclusively, * not a per-device memory resource. * - * The caller has two methods it can use to carry out the chunked_pack: has_next and next. - * Here is an example: + * This class defines two methods that must be used in concert to carry out the chunked_pack: + * has_next and next. Here is an example: * * @code{.pseudo} * // Create a table_view @@ -156,7 +156,7 @@ struct contiguous_split_state; * // Define a buffer size for each chunk: the larger the buffer is, the more SMs can be * // occupied by this algorithm. * // - * // Internally, the GPU unit-of-work is a 1MB batch. When we instantiate `cudf::chunked_pack`, + * // Internally, the GPU unit of work is a 1MB batch. When we instantiate `cudf::chunked_pack`, * // all the 1MB batches for the source table_view are computed up front. Additionally, * // chunked_pack calculates the number of iterations that are required to go through all those * // batches given a `user_buffer_size` buffer. The number of 1MB batches in each iteration (chunk) @@ -196,15 +196,16 @@ class chunked_pack { * @param input source `table_view` to pack * @param user_buffer_size buffer size (in bytes) that will be passed on `next`. 
Must be * at least 1MB - * @param mr An optional memory resource to be used for temporary and scratch allocations only + * @param temp_mr An optional memory resource to be used for temporary and scratch allocations + * only */ explicit chunked_pack( cudf::table_view const& input, std::size_t user_buffer_size, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::mr::device_memory_resource* temp_mr = rmm::mr::get_current_device_resource()); /** - * @brief Destructor that will be implemented as default, required because + * @brief Destructor that will be implemented as default. Declared with definition here because * contiguous_split_state is incomplete at this stage. */ ~chunked_pack(); @@ -224,7 +225,7 @@ class chunked_pack { [[nodiscard]] bool has_next() const; /** - * @brief Packs the next chunk into `user_buffer`. This should be call as long as + * @brief Packs the next chunk into `user_buffer`. This should be called as long as * `has_next` returns true. If `next` is called when `has_next` is false, an exception * is thrown. * @@ -255,12 +256,13 @@ class chunked_pack { * @param input source `table_view` to pack * @param user_buffer_size buffer size (in bytes) that will be passed on `next`. Must be * at least 1MB - * @param mr RMM memory resource to be used for temporary and scratch allocations only + * @param temp_mr RMM memory resource to be used for temporary and scratch allocations only * @return a unique_ptr of chunked_pack */ - [[nodiscard]] static std::unique_ptr create(cudf::table_view const& input, - std::size_t user_buffer_size, - rmm::mr::device_memory_resource* mr); + [[nodiscard]] static std::unique_ptr create( + cudf::table_view const& input, + std::size_t user_buffer_size, + rmm::mr::device_memory_resource* temp_mr); private: // internal state of contiguous split diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index ce7b7bf0ad1..d982c11aee8 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -956,7 +956,7 @@ struct packed_split_indices_and_src_buf_info { std::size_t num_partitions, cudf::size_type num_src_bufs, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::mr::device_memory_resource* temp_mr) : indices_size( cudf::util::round_up_safe((num_partitions + 1) * sizeof(size_type), split_align)), src_buf_info_size( @@ -981,7 +981,7 @@ struct packed_split_indices_and_src_buf_info { // device-side // gpu-only : stack space needed for nested list offset calculation d_indices_and_source_info = - rmm::device_buffer(indices_size + src_buf_info_size + offset_stack_size, stream, mr); + rmm::device_buffer(indices_size + src_buf_info_size + offset_stack_size, stream, temp_mr); d_indices = reinterpret_cast(d_indices_and_source_info.data()); d_src_buf_info = reinterpret_cast( reinterpret_cast(d_indices_and_source_info.data()) + indices_size); @@ -1017,7 +1017,7 @@ struct packed_partition_buf_size_and_dst_buf_info { cudf::size_type num_src_bufs, std::size_t num_bufs, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::mr::device_memory_resource* temp_mr) : stream(stream), buf_sizes_size{cudf::util::round_up_safe(num_partitions * sizeof(std::size_t), split_align)}, dst_buf_info_size{cudf::util::round_up_safe(num_bufs * sizeof(dst_buf_info), split_align)}, @@ -1027,7 +1027,7 @@ struct packed_partition_buf_size_and_dst_buf_info { h_dst_buf_info{ reinterpret_cast(h_buf_sizes_and_dst_info.data() + buf_sizes_size)}, // device-side - 
d_buf_sizes_and_dst_info(buf_sizes_size + dst_buf_info_size, stream, mr), + d_buf_sizes_and_dst_info(buf_sizes_size + dst_buf_info_size, stream, temp_mr), d_buf_sizes{reinterpret_cast(d_buf_sizes_and_dst_info.data())}, // destination buffer info d_dst_buf_info{reinterpret_cast( @@ -1068,7 +1068,7 @@ struct packed_src_and_dst_pointers { std::size_t num_partitions, cudf::size_type num_src_bufs, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::mr::device_memory_resource* temp_mr) : stream(stream), src_bufs_size{cudf::util::round_up_safe(num_src_bufs * sizeof(uint8_t*), split_align)}, dst_bufs_size{cudf::util::round_up_safe(num_partitions * sizeof(uint8_t*), split_align)}, @@ -1077,7 +1077,7 @@ struct packed_src_and_dst_pointers { h_src_bufs{reinterpret_cast(h_src_and_dst_buffers.data())}, h_dst_bufs{reinterpret_cast(h_src_and_dst_buffers.data() + src_bufs_size)}, // device-side - d_src_and_dst_buffers{rmm::device_buffer(src_bufs_size + dst_bufs_size, stream, mr)}, + d_src_and_dst_buffers{rmm::device_buffer(src_bufs_size + dst_bufs_size, stream, temp_mr)}, d_src_bufs{reinterpret_cast(d_src_and_dst_buffers.data())}, d_dst_bufs{reinterpret_cast( reinterpret_cast(d_src_and_dst_buffers.data()) + src_bufs_size)} @@ -1119,7 +1119,7 @@ struct packed_src_and_dst_pointers { * @param num_src_bufs number of buffers for the source columns including children * @param out_buffers the destination buffers per partition if in the non-chunked case * @param stream Optional CUDA stream on which to execute kernels - * @param mr RMM memory resource + * @param temp_mr A memory resource for temporary and scratch space * * @returns new unique pointer to packed_src_and_dst_pointers */ @@ -1129,10 +1129,10 @@ std::unique_ptr setup_src_and_dst_pointers( cudf::size_type num_src_bufs, std::vector& out_buffers, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::mr::device_memory_resource* temp_mr) { - auto src_and_dst_pointers = - std::make_unique(input, num_partitions, num_src_bufs, stream, mr); + auto src_and_dst_pointers = std::make_unique( + input, num_partitions, num_src_bufs, stream, temp_mr); std::transform( out_buffers.begin(), out_buffers.end(), src_and_dst_pointers->h_dst_bufs, [](auto& buf) { @@ -1155,7 +1155,7 @@ std::unique_ptr setup_src_and_dst_pointers( * @param num_src_bufs number of buffers for the source columns including children * @param num_bufs num_src_bufs times the number of partitions * @param stream Optional CUDA stream on which to execute kernels - * @param mr RMM memory resource + * @param temp_mr A memory resource for temporary and scratch space * * @returns new unique pointer to `packed_partition_buf_size_and_dst_buf_info` */ @@ -1166,18 +1166,18 @@ std::unique_ptr compute_splits( cudf::size_type num_src_bufs, std::size_t num_bufs, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::mr::device_memory_resource* temp_mr) { auto partition_buf_size_and_dst_buf_info = std::make_unique( - input, splits, num_partitions, num_src_bufs, num_bufs, stream, mr); + input, splits, num_partitions, num_src_bufs, num_bufs, stream, temp_mr); auto const d_dst_buf_info = partition_buf_size_and_dst_buf_info->d_dst_buf_info; auto const h_buf_sizes = partition_buf_size_and_dst_buf_info->h_buf_sizes; auto const d_buf_sizes = partition_buf_size_and_dst_buf_info->d_buf_sizes; - auto const split_indices_and_src_buf_info = - packed_split_indices_and_src_buf_info(input, splits, num_partitions, num_src_bufs, stream, mr); + auto const 
split_indices_and_src_buf_info = packed_split_indices_and_src_buf_info( + input, splits, num_partitions, num_src_bufs, stream, temp_mr); auto const d_src_buf_info = split_indices_and_src_buf_info.d_src_buf_info; auto const offset_stack_partition_size = @@ -1187,7 +1187,7 @@ std::unique_ptr<packed_partition_buf_size_and_dst_buf_info> compute_splits( // compute sizes of each column in each partition, including alignment. thrust::transform( - rmm::exec_policy(stream, mr), + rmm::exec_policy(stream, temp_mr), thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_bufs), d_dst_buf_info, @@ -1272,7 +1272,7 @@ std::unique_ptr<packed_partition_buf_size_and_dst_buf_info> compute_splits( auto values = cudf::detail::make_counting_transform_iterator(0, buf_size_functor{d_dst_buf_info}); - thrust::reduce_by_key(rmm::exec_policy(stream, mr), + thrust::reduce_by_key(rmm::exec_policy(stream, temp_mr), keys, keys + num_bufs, values, @@ -1287,7 +1287,7 @@ std::unique_ptr<packed_partition_buf_size_and_dst_buf_info> compute_splits( auto values = cudf::detail::make_counting_transform_iterator(0, buf_size_functor{d_dst_buf_info}); - thrust::exclusive_scan_by_key(rmm::exec_policy(stream, mr), + thrust::exclusive_scan_by_key(rmm::exec_policy(stream, temp_mr), keys, keys + num_bufs, values, @@ -1307,7 +1307,7 @@ std::unique_ptr<packed_partition_buf_size_and_dst_buf_info> compute_splits( * `copy_partitions` kernel and the number of iterations we need to carry out this copy. * * For the non-chunked contiguous_split case, this contains the batched dst_buf_infos and the - * number of iterations are going to be 1, since the non-chunked case is single pass. + * number of iterations is going to be 1 since the non-chunked case is single pass. * * For the chunked_pack case, this also contains the batched dst_buf_infos for all * iterations in addition to helping keep the state about what batches have been copied so far @@ -1320,8 +1320,8 @@ struct chunk_iteration_state { std::vector<std::size_t>&& _h_size_of_buffs_per_iteration, std::size_t total_size) : num_iterations(_h_num_buffs_per_iteration.size()), - current_iteration(0), - starting_batch(0), + current_iteration{0}, + starting_batch{0}, d_batched_dst_buf_info(std::move(_d_batched_dst_buf_info)), d_batch_offsets(std::move(_d_batch_offsets)), h_num_buffs_per_iteration(std::move(_h_num_buffs_per_iteration)), @@ -1330,6 +1330,16 @@ struct chunk_iteration_state { { } + static std::unique_ptr<chunk_iteration_state> create( + rmm::device_uvector<thrust::pair<std::size_t, std::size_t>> const& batches, + int num_bufs, + dst_buf_info* d_orig_dst_buf_info, + std::size_t const* const h_buf_sizes, + std::size_t num_partitions, + std::size_t user_buffer_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* temp_mr); + /** * @brief As of the time of the call, return the starting 1MB batch index, and the * number of batches to copy. @@ -1341,9 +1351,17 @@ struct chunk_iteration_state { CUDF_EXPECTS(current_iteration < num_iterations, "current_iteration cannot exceed num_iterations"); auto count_for_current = h_num_buffs_per_iteration[current_iteration]; - return std::make_pair(starting_batch, count_for_current); + return {starting_batch, count_for_current}; } + /** + * @brief Advance the iteration state if there are iterations left, updating the + * starting batch and returning the number of bytes that were copied in the iteration + * we just finished. + * @throws cudf::logic_error If the state was at the last iteration before entering + * this function.
+ * @return size in bytes that were copied in the finished iteration + */ std::size_t advance_iteration() { CUDF_EXPECTS(current_iteration < num_iterations, @@ -1354,6 +1372,9 @@ struct chunk_iteration_state { return bytes_copied; } + /** + * Returns true if there are iterations left. + */ bool has_more_copies() const { return current_iteration < num_iterations; } rmm::device_uvector d_batched_dst_buf_info; @@ -1368,7 +1389,7 @@ struct chunk_iteration_state { std::vector const h_size_of_buffs_per_iteration; }; -std::unique_ptr make_chunk_iteration_state( +std::unique_ptr chunk_iteration_state::create( rmm::device_uvector> const& batches, int num_bufs, dst_buf_info* d_orig_dst_buf_info, @@ -1376,16 +1397,16 @@ std::unique_ptr make_chunk_iteration_state( std::size_t num_partitions, std::size_t user_buffer_size, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + rmm::mr::device_memory_resource* temp_mr) { - rmm::device_uvector d_batch_offsets(num_bufs + 1, stream, mr); + rmm::device_uvector d_batch_offsets(num_bufs + 1, stream, temp_mr); auto const buf_count_iter = cudf::detail::make_counting_transform_iterator( 0, [num_bufs, num_batches = num_batches_func{batches.begin()}] __device__(size_type i) { return i == num_bufs ? 0 : num_batches(i); }); - thrust::exclusive_scan(rmm::exec_policy(stream, mr), + thrust::exclusive_scan(rmm::exec_policy(stream, temp_mr), buf_count_iter, buf_count_iter + num_bufs + 1, d_batch_offsets.begin(), @@ -1394,17 +1415,17 @@ std::unique_ptr make_chunk_iteration_state( auto const num_batches_iter = cudf::detail::make_counting_transform_iterator(0, num_batches_func{batches.begin()}); size_type const num_batches = thrust::reduce( - rmm::exec_policy(stream, mr), num_batches_iter, num_batches_iter + batches.size()); + rmm::exec_policy(stream, temp_mr), num_batches_iter, num_batches_iter + batches.size()); auto out_to_in_index = out_to_in_index_function{d_batch_offsets.begin(), num_bufs}; auto const iter = thrust::make_counting_iterator(0); // load up the batches as d_dst_buf_info - rmm::device_uvector d_batched_dst_buf_info(num_batches, stream, mr); + rmm::device_uvector d_batched_dst_buf_info(num_batches, stream, temp_mr); thrust::for_each( - rmm::exec_policy(stream, mr), + rmm::exec_policy(stream, temp_mr), iter, iter + num_batches, [d_orig_dst_buf_info, @@ -1464,11 +1485,11 @@ std::unique_ptr make_chunk_iteration_state( // copy the batch offsets back to host std::vector h_offsets(num_batches + 1); { - rmm::device_uvector offsets(h_offsets.size(), stream, mr); + rmm::device_uvector offsets(h_offsets.size(), stream, temp_mr); auto const batch_byte_size_iter = cudf::detail::make_counting_transform_iterator( 0, batch_byte_size_function{num_batches, d_batched_dst_buf_info.begin()}); - thrust::exclusive_scan(rmm::exec_policy(stream, mr), + thrust::exclusive_scan(rmm::exec_policy(stream, temp_mr), batch_byte_size_iter, batch_byte_size_iter + num_batches + 1, offsets.begin()); @@ -1526,7 +1547,7 @@ std::unique_ptr make_chunk_iteration_state( // apply changed offset { rmm::device_uvector d_accum_size_per_iteration( - accum_size_per_iteration.size(), stream, mr); + accum_size_per_iteration.size(), stream, temp_mr); CUDF_CUDA_TRY(cudaMemcpyAsync(d_accum_size_per_iteration.data(), accum_size_per_iteration.data(), @@ -1540,7 +1561,7 @@ std::unique_ptr make_chunk_iteration_state( auto const iter = thrust::make_counting_iterator(num_batches_in_first_iteration); auto num_iterations = accum_size_per_iteration.size(); thrust::for_each( - rmm::exec_policy(stream, 
mr), + rmm::exec_policy(stream, temp_mr), iter, iter + num_batches - num_batches_in_first_iteration, [num_iterations, @@ -1640,10 +1661,10 @@ namespace detail { * is using the single-pass contiguous_split or chunked_pack. * * It exposes an iterator-like pattern where contiguous_split_state::has_next() - * return true when there is work to be done, and false otherwise. + * returns true when there is work to be done, and false otherwise. * * contiguous_split_state::contiguous_split() performs a single-pass contiguous_split - * and is only valid iff contiguous_split_state is instantiated with 0 for the user_buffer_size. + * and is valid iff contiguous_split_state is instantiated with 0 for the user_buffer_size. * * contiguous_split_state::contiguous_split_chunk(device_span) is only valid when * user_buffer_size > 0. It should be called as long as has_next() returns true. The @@ -1656,16 +1677,18 @@ struct contiguous_split_state { contiguous_split_state(cudf::table_view const& input, std::size_t user_buffer_size, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - : contiguous_split_state(input, {}, user_buffer_size, stream, mr) + rmm::mr::device_memory_resource* mr, + rmm::mr::device_memory_resource* temp_mr) + : contiguous_split_state(input, {}, user_buffer_size, stream, mr, temp_mr) { } contiguous_split_state(cudf::table_view const& input, std::vector const& splits, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - : contiguous_split_state(input, splits, 0, stream, mr) + rmm::mr::device_memory_resource* mr, + rmm::mr::device_memory_resource* temp_mr) + : contiguous_split_state(input, splits, 0, stream, mr, temp_mr) { } @@ -1673,14 +1696,18 @@ struct contiguous_split_state { std::vector const& splits, std::size_t user_buffer_size, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - : input(input), user_buffer_size(user_buffer_size), stream(stream), mr(mr) + rmm::mr::device_memory_resource* mr, + rmm::mr::device_memory_resource* temp_mr) + : input(input), + user_buffer_size(user_buffer_size), + stream(stream), + mr(mr), + temp_mr(temp_mr), + is_empty{check_inputs(input, splits)}, + num_partitions{splits.size() + 1}, + num_src_bufs{count_src_bufs(input.begin(), input.end())}, + num_bufs{num_src_bufs * num_partitions} { - is_empty = check_inputs(input, splits); - num_partitions = splits.size() + 1; - num_src_bufs = count_src_bufs(input.begin(), input.end()); - num_bufs = num_src_bufs * num_partitions; - // if the table we are about to contig split is empty, we have special // handling where metadata is produced and a 0-byte contiguous buffer // is the result. @@ -1689,8 +1716,8 @@ struct contiguous_split_state { // First pass over the source tables to generate a `dst_buf_info` per split and column buffer // (`num_bufs`). After this, contiguous_split uses `dst_buf_info` to further subdivide the work // into 1MB batches in `compute_batches` - partition_buf_size_and_dst_buf_info = - std::move(compute_splits(input, splits, num_partitions, num_src_bufs, num_bufs, stream, mr)); + partition_buf_size_and_dst_buf_info = std::move( + compute_splits(input, splits, num_partitions, num_src_bufs, num_bufs, stream, temp_mr)); // Second pass: uses `dst_buf_info` to break down the work into 1MB batches. 
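// Each 1MB batch later maps to one block of the copy kernel; subdividing the // partition buffers this way drives up the number of blocks and keeps // occupancy high even when there are few splits or columns.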
compute_batches(); @@ -1707,8 +1734,8 @@ struct contiguous_split_state { }); } - src_and_dst_pointers = std::move( - setup_src_and_dst_pointers(input, num_partitions, num_src_bufs, out_buffers, stream, mr)); + src_and_dst_pointers = std::move(setup_src_and_dst_pointers( + input, num_partitions, num_src_bufs, out_buffers, stream, temp_mr)); } bool has_next() const { return !is_empty && chunk_iter_state->has_more_copies(); } @@ -1720,16 +1747,16 @@ struct contiguous_split_state { void compute_batches() { - // Since we parallelize at one block per copy, we are vulnerable to situations where we + // Since we parallelize at one block per copy, performance is vulnerable to situations where we // have small numbers of copies to do (a combination of small numbers of splits and/or columns), // so we will take the actual set of outgoing source/destination buffers and further partition // them into much smaller batches in order to drive up the number of blocks and overall // occupancy. - rmm::device_uvector> batches(num_bufs, stream, mr); + rmm::device_uvector> batches(num_bufs, stream, temp_mr); auto d_dst_buf_info = partition_buf_size_and_dst_buf_info->d_dst_buf_info; auto h_buf_sizes = partition_buf_size_and_dst_buf_info->h_buf_sizes; thrust::transform( - rmm::exec_policy(stream, mr), + rmm::exec_policy(stream, temp_mr), d_dst_buf_info, d_dst_buf_info + num_bufs, batches.begin(), @@ -1753,8 +1780,14 @@ struct contiguous_split_state { return {num_batches, desired_batch_size}; }); - chunk_iter_state = make_chunk_iteration_state( - batches, num_bufs, d_dst_buf_info, h_buf_sizes, num_partitions, user_buffer_size, stream, mr); + chunk_iter_state = chunk_iteration_state::create(batches, + num_bufs, + d_dst_buf_info, + h_buf_sizes, + num_partitions, + user_buffer_size, + stream, + temp_mr); } std::vector contiguous_split() @@ -1788,7 +1821,7 @@ struct contiguous_split_state { chunk_iter_state->d_batched_dst_buf_info.begin(), [] __device__(dst_buf_info const& info) { return info.valid_count; }); - thrust::reduce_by_key(rmm::exec_policy(stream, mr), + thrust::reduce_by_key(rmm::exec_policy(stream, temp_mr), keys, keys + num_batches_total, values, @@ -1925,29 +1958,31 @@ struct contiguous_split_state { } cudf::table_view const input; - rmm::cuda_stream_view stream; - rmm::mr::device_memory_resource* mr; + std::size_t const user_buffer_size; + rmm::cuda_stream_view const stream; + rmm::mr::device_memory_resource* const mr; + rmm::mr::device_memory_resource* const temp_mr; - std::size_t num_partitions; + // whether the table was empty to begin with (0 rows or 0 columns) and should be metadata-only + bool const is_empty; - // number of source buffers including children * number of splits - std::size_t num_bufs; + std::size_t const num_partitions; // number of source buffers including children - size_type num_src_bufs; + size_type const num_src_bufs; + + // number of source buffers including children * number of splits + std::size_t const num_bufs; std::unique_ptr partition_buf_size_and_dst_buf_info; std::unique_ptr src_and_dst_pointers; - // whether the table was empty to begin with (0 rows or 0 columns) and should be metadata-only - bool is_empty; - // // State around the chunked pattern // - // chunked_pack will 1 or more "chunks" to iterate on, defined in chunk_iter_state + // chunked_pack will have 1 or more "chunks" to iterate on, defined in chunk_iter_state // contiguous_split will have a single "chunk" in chunk_iter_state, so no iteration. 
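// (has_next() consults this state, and each contiguous_split_chunk() call // copies one iteration's worth of batches before calling advance_iteration())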
std::unique_ptr chunk_iter_state; @@ -1964,8 +1999,6 @@ struct contiguous_split_state { // each buffer. // std::vector out_buffers; - - std::size_t user_buffer_size; }; std::vector contiguous_split(cudf::table_view const& input, @@ -1973,7 +2006,10 @@ std::vector contiguous_split(cudf::table_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto state = contiguous_split_state(input, splits, stream, mr); + // `temp_mr` is the same as `mr` for contiguous_split as it allocates all + // of its memory from the default memory resource in cuDF + auto temp_mr = mr; + auto state = contiguous_split_state(input, splits, stream, mr, temp_mr); return state.contiguous_split(); } @@ -1989,10 +2025,14 @@ std::vector contiguous_split(cudf::table_view const& input, chunked_pack::chunked_pack(cudf::table_view const& input, std::size_t user_buffer_size, - rmm::mr::device_memory_resource* mr) + rmm::mr::device_memory_resource* temp_mr) { + CUDF_EXPECTS(user_buffer_size >= desired_batch_size, + "The output buffer size must be at least 1MB in size"); + // We pass `nullptr` for the first `mr` in `contiguous_split_state` to indicate + // that it does not allocate any user-bound data for the `chunked_pack` case. state = std::make_unique( - input, user_buffer_size, cudf::get_default_stream(), mr); + input, user_buffer_size, cudf::get_default_stream(), nullptr, temp_mr); } // required for the unique_ptr to work with a non-complete type (contiguous_split_state) @@ -2017,11 +2057,12 @@ std::unique_ptr> chunked_pack::build_metadata() const std::unique_ptr chunked_pack::create(cudf::table_view const& input, std::size_t user_buffer_size, - rmm::mr::device_memory_resource* mr) + rmm::mr::device_memory_resource* temp_mr) { - CUDF_EXPECTS(user_buffer_size >= desired_batch_size, - "The output buffer size must be at least 1MB in size"); - return std::make_unique(input, user_buffer_size, mr); + // `temp_mr` could be a special memory resource to be used in situations when + // GPU memory is low and we want scratch and temporary allocations to happen from + // a small reserved pool of memory. + return std::make_unique(input, user_buffer_size, temp_mr); } }; // namespace cudf From ff6ce21196af5ad479fe7d154ed899574220c938 Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Wed, 17 May 2023 08:45:04 -0500 Subject: [PATCH 22/31] bif_shift -> bit_shift --- cpp/src/copying/contiguous_split.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index d982c11aee8..1d60522f384 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -124,7 +124,7 @@ struct dst_buf_info { * Copies a single partition of a source column buffer to a destination buffer. Shifts * element values by value_shift in the case of a buffer of offsets (value_shift will * only ever be > 0 in that case). Shifts elements bitwise by bit_shift in the case of - * a validity buffer (bif_shift will only ever be > 0 in that case). This function assumes + * a validity buffer (bit_shift will only ever be > 0 in that case). This function assumes * value_shift and bit_shift will never be > 0 at the same time. 
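* For example (illustrative): when a partition begins at row 3 of a validity * buffer, bit_shift is 3 and each destination bitmask word is assembled from * two adjacent source words so that row 3 lands on bit 0 of the output.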
* * This function expects: From b47f2c761bbf396dc833e660172ab0a1f40514da Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Wed, 17 May 2023 12:52:09 -0500 Subject: [PATCH 23/31] Leverage make_device_uvector_async --- cpp/src/copying/contiguous_split.cu | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 1d60522f384..54e8b3b3894 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -1546,14 +1547,8 @@ std::unique_ptr chunk_iteration_state::create( // apply changed offset { - rmm::device_uvector d_accum_size_per_iteration( - accum_size_per_iteration.size(), stream, temp_mr); - - CUDF_CUDA_TRY(cudaMemcpyAsync(d_accum_size_per_iteration.data(), - accum_size_per_iteration.data(), - accum_size_per_iteration.size() * sizeof(std::size_t), - cudaMemcpyDefault, - stream.value())); + auto d_accum_size_per_iteration = + cudf::detail::make_device_uvector_async(accum_size_per_iteration, stream, temp_mr); // we want to update the offset of batches for every iteration, except the first one (because // offsets in the first iteration are all 0 based) From b1fba4e397d0b867dd5dfcbfaf2320f072cf540f Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Wed, 17 May 2023 12:56:58 -0500 Subject: [PATCH 24/31] Remove extra brackets and use range-based for loop Co-authored-by: David Wendt <45795991+davidwendt@users.noreply.github.com> --- cpp/src/copying/contiguous_split.cu | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 1d60522f384..94606b1603f 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -1639,15 +1639,12 @@ bool check_inputs(cudf::table_view const& input, std::vector const& s CUDF_EXPECTS(splits.back() <= input.column(0).size(), "splits can't exceed size of input columns"); } - { - size_type begin = 0; - for (std::size_t i = 0; i < splits.size(); i++) { - size_type end = splits[i]; - CUDF_EXPECTS(begin >= 0, "Starting index cannot be negative."); - CUDF_EXPECTS(end >= begin, "End index cannot be smaller than the starting index."); - CUDF_EXPECTS(end <= input.column(0).size(), "Slice range out of bounds."); - begin = end; - } + size_type begin = 0; + for (auto end : splits) { + CUDF_EXPECTS(begin >= 0, "Starting index cannot be negative."); + CUDF_EXPECTS(end >= begin, "End index cannot be smaller than the starting index."); + CUDF_EXPECTS(end <= input.column(0).size(), "Slice range out of bounds."); + begin = end; } return input.column(0).size() == 0; } From ab98d70bce02b59dab1f38a8da8162a999a70167 Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Wed, 17 May 2023 13:53:36 -0500 Subject: [PATCH 25/31] Remove extra cast and use .front --- cpp/src/copying/contiguous_split.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index b7582787ae5..8854fba8520 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -1883,7 +1883,7 @@ struct contiguous_split_state { return packed_table{ empty_inputs, packed_columns{std::make_unique>(pack_metadata( - empty_inputs, static_cast(nullptr), 0)), + empty_inputs, nullptr, 0)), std::make_unique()}}; }); @@ -1899,7 +1899,7 @@ struct contiguous_split_state 
{ if (is_empty) { // this is a bit ugly, but it was done to re-use make_empty_packed_table between the // regular contiguous_split and chunked_pack cases. - auto empty_packed_tables = std::move(make_empty_packed_table()[0]); + auto empty_packed_tables = std::move(make_empty_packed_table().front()); return std::move(empty_packed_tables.data.metadata); } From 8ec29fa0e0f72d96dc902b2cc6f715cd2dedfaa6 Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Wed, 17 May 2023 13:55:08 -0500 Subject: [PATCH 26/31] Fix styles --- cpp/src/copying/contiguous_split.cu | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 8854fba8520..c8625191571 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -1880,11 +1880,10 @@ struct contiguous_split_state { iter + num_partitions, std::back_inserter(result), [&empty_inputs](int partition_index) { - return packed_table{ - empty_inputs, - packed_columns{std::make_unique>(pack_metadata( - empty_inputs, nullptr, 0)), - std::make_unique()}}; + return packed_table{empty_inputs, + packed_columns{std::make_unique>( + pack_metadata(empty_inputs, nullptr, 0)), + std::make_unique()}}; }); return result; From d959039fa3b06d94d3895770699f23d3d1e062fd Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Thu, 18 May 2023 10:07:23 -0500 Subject: [PATCH 27/31] Move temp_mr comment to chunked_pack::create and make sure temp_mr defaults to the default memory resource --- cpp/include/cudf/contiguous_split.hpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cpp/include/cudf/contiguous_split.hpp b/cpp/include/cudf/contiguous_split.hpp index ce995a5c32a..1a4664d8d58 100644 --- a/cpp/include/cudf/contiguous_split.hpp +++ b/cpp/include/cudf/contiguous_split.hpp @@ -251,6 +251,11 @@ class chunked_pack { * "input", where a buffer of `user_buffer_size` is filled with chunks of the * overall operation. This operation can be used in cases where GPU memory is constrained. * + * The memory resource (`temp_mr`) could be a special memory resource to be used in + * situations when GPU memory is low and we want scratch and temporary allocations to + * happen from a small reserved pool of memory. Note that it defaults to the regular cuDF + * per-device resource. + * * @throws cudf::logic_error When user_buffer_size is less than 1MB * * @param input source `table_view` to pack @@ -262,7 +267,7 @@ class chunked_pack { [[nodiscard]] static std::unique_ptr create( cudf::table_view const& input, std::size_t user_buffer_size, - rmm::mr::device_memory_resource* temp_mr); + rmm::mr::device_memory_resource* temp_mr = rmm::mr::get_current_device_resource()); private: // internal state of contiguous split From e16306cbf58bc6d9556d7c1285d850c166414d18 Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Thu, 18 May 2023 10:29:33 -0500 Subject: [PATCH 28/31] Make contiguous_split_state constructor private and along with it other private functions. 
Make compute_batches a free function --- cpp/include/cudf/contiguous_split.hpp | 2 +- cpp/src/copying/contiguous_split.cu | 278 ++++++++++++++------------ 2 files changed, 153 insertions(+), 127 deletions(-) diff --git a/cpp/include/cudf/contiguous_split.hpp b/cpp/include/cudf/contiguous_split.hpp index 1a4664d8d58..83c852cd918 100644 --- a/cpp/include/cudf/contiguous_split.hpp +++ b/cpp/include/cudf/contiguous_split.hpp @@ -251,7 +251,7 @@ class chunked_pack { * "input", where a buffer of `user_buffer_size` is filled with chunks of the * overall operation. This operation can be used in cases where GPU memory is constrained. * - * The memory resource (`temp_mr`) could be a special memory resource to be used in + * The memory resource (`temp_mr`) could be a special memory resource to be used in * situations when GPU memory is low and we want scratch and temporary allocations to * happen from a small reserved pool of memory. Note that it defaults to the regular cuDF * per-device resource. diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index c8625191571..37fd6e04c36 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -1116,7 +1116,7 @@ struct packed_src_and_dst_pointers { * to the `copy_partitions` kernel. * * @param input source table view - * @param num_partitions the number of partitions create (1 meaning no splits) + * @param num_partitions the number of partitions (1 meaning no splits) * @param num_src_bufs number of buffers for the source columns including children * @param out_buffers the destination buffers per partition if in the non-chunked case * @param stream Optional CUDA stream on which to execute kernels @@ -1592,6 +1592,73 @@ std::unique_ptr chunk_iteration_state::create( } } +/** + * @brief Create an instance of `chunk_iteration_state` containing 1MB batches of work + * that are further grouped into chunks or iterations. + * + * This function handles both the `chunked_pack` case: when `user_buffer_size` is non-zero, + * and the single-shot `contiguous_split` case. + * + * @param num_bufs num_src_bufs times the number of partitions + * @param d_dst_buf_info dst_buf_info per partition produced in `compute_splits` + * @param h_buf_sizes size in bytes of a partition (accessible from host) + * @param num_partitions the number of partitions (1 meaning no splits) + * @param user_buffer_size if non-zero, it is the size in bytes that 1MB batches should be + * grouped in, as different iterations. + * @param stream Optional CUDA stream on which to execute kernels + * @param temp_mr A memory resource for temporary and scratch space + * + * @returns new unique pointer to `chunk_iteration_state` + */ +std::unique_ptr compute_batches(int num_bufs, + dst_buf_info* const d_dst_buf_info, + std::size_t const* const h_buf_sizes, + std::size_t num_partitions, + std::size_t user_buffer_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* temp_mr) +{ + // Since we parallelize at one block per copy, performance is vulnerable to situations where we + // have small numbers of copies to do (a combination of small numbers of splits and/or columns), + // so we will take the actual set of outgoing source/destination buffers and further partition + // them into much smaller batches in order to drive up the number of blocks and overall + // occupancy. 
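+  // For example, with a 1MB desired_batch_size a 10MB partition buffer is +  // subdivided into ceil(10MB / 1MB) = 10 batches, while a 100-byte buffer +  // still contributes a single batch.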
+ rmm::device_uvector> batches(num_bufs, stream, temp_mr); + thrust::transform( + rmm::exec_policy(stream, temp_mr), + d_dst_buf_info, + d_dst_buf_info + num_bufs, + batches.begin(), + [desired_batch_size = desired_batch_size] __device__( + dst_buf_info const& buf) -> thrust::pair { + // Total bytes for this incoming partition + std::size_t const bytes = + static_cast(buf.num_elements) * static_cast(buf.element_size); + + // This clause handles nested data types (e.g. list or string) that store no data in the row + // columns, only in their children. + if (bytes == 0) { return {1, 0}; } + + // The number of batches we want to subdivide this buffer into + std::size_t const num_batches = std::max( + std::size_t{1}, util::round_up_unsafe(bytes, desired_batch_size) / desired_batch_size); + + // NOTE: leaving batch size as a separate parameter for future tuning + // possibilities, even though in the current implementation it will be a + // constant. + return {num_batches, desired_batch_size}; + }); + + return chunk_iteration_state::create(batches, + num_bufs, + d_dst_buf_info, + h_buf_sizes, + num_partitions, + user_buffer_size, + stream, + temp_mr); +} + void copy_data(int num_batches_to_copy, int starting_batch, uint8_t const** d_src_bufs, @@ -1684,52 +1751,6 @@ struct contiguous_split_state { { } - contiguous_split_state(cudf::table_view const& input, - std::vector const& splits, - std::size_t user_buffer_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr, - rmm::mr::device_memory_resource* temp_mr) - : input(input), - user_buffer_size(user_buffer_size), - stream(stream), - mr(mr), - temp_mr(temp_mr), - is_empty{check_inputs(input, splits)}, - num_partitions{splits.size() + 1}, - num_src_bufs{count_src_bufs(input.begin(), input.end())}, - num_bufs{num_src_bufs * num_partitions} - { - // if the table we are about to contig split is empty, we have special - // handling where metadata is produced and a 0-byte contiguous buffer - // is the result. - if (is_empty) { return; } - - // First pass over the source tables to generate a `dst_buf_info` per split and column buffer - // (`num_bufs`). After this, contiguous_split uses `dst_buf_info` to further subdivide the work - // into 1MB batches in `compute_batches` - partition_buf_size_and_dst_buf_info = std::move( - compute_splits(input, splits, num_partitions, num_src_bufs, num_bufs, stream, temp_mr)); - - // Second pass: uses `dst_buf_info` to break down the work into 1MB batches. - compute_batches(); - - // allocate output partition buffers, in the non-chunked case - if (user_buffer_size == 0) { - out_buffers.reserve(num_partitions); - auto h_buf_sizes = partition_buf_size_and_dst_buf_info->h_buf_sizes; - std::transform(h_buf_sizes, - h_buf_sizes + num_partitions, - std::back_inserter(out_buffers), - [stream = stream, mr = mr](std::size_t bytes) { - return rmm::device_buffer{bytes, stream, mr}; - }); - } - - src_and_dst_pointers = std::move(setup_src_and_dst_pointers( - input, num_partitions, num_src_bufs, out_buffers, stream, temp_mr)); - } - bool has_next() const { return !is_empty && chunk_iter_state->has_more_copies(); } std::size_t get_total_contiguous_size() const @@ -1737,51 +1758,6 @@ struct contiguous_split_state { return is_empty ? 
0 : chunk_iter_state->total_size; } - void compute_batches() - { - // Since we parallelize at one block per copy, performance is vulnerable to situations where we - // have small numbers of copies to do (a combination of small numbers of splits and/or columns), - // so we will take the actual set of outgoing source/destination buffers and further partition - // them into much smaller batches in order to drive up the number of blocks and overall - // occupancy. - rmm::device_uvector> batches(num_bufs, stream, temp_mr); - auto d_dst_buf_info = partition_buf_size_and_dst_buf_info->d_dst_buf_info; - auto h_buf_sizes = partition_buf_size_and_dst_buf_info->h_buf_sizes; - thrust::transform( - rmm::exec_policy(stream, temp_mr), - d_dst_buf_info, - d_dst_buf_info + num_bufs, - batches.begin(), - [desired_batch_size = desired_batch_size] __device__( - dst_buf_info const& buf) -> thrust::pair { - // Total bytes for this incoming partition - std::size_t const bytes = - static_cast(buf.num_elements) * static_cast(buf.element_size); - - // This clause handles nested data types (e.g. list or string) that store no data in the row - // columns, only in their children. - if (bytes == 0) { return {1, 0}; } - - // The number of batches we want to subdivide this buffer into - std::size_t const num_batches = std::max( - std::size_t{1}, util::round_up_unsafe(bytes, desired_batch_size) / desired_batch_size); - - // NOTE: leaving batch size as a separate parameter for future tuning - // possibilities, even though in the current implementation it will be a - // constant. - return {num_batches, desired_batch_size}; - }); - - chunk_iter_state = chunk_iteration_state::create(batches, - num_bufs, - d_dst_buf_info, - h_buf_sizes, - num_partitions, - user_buffer_size, - stream, - temp_mr); - } - std::vector contiguous_split() { CUDF_EXPECTS(user_buffer_size == 0, "Cannot contiguous split with a user buffer"); @@ -1837,7 +1813,7 @@ struct contiguous_split_state { cudf::size_type contiguous_split_chunk(cudf::device_span const& user_buffer) { - CUDF_FUNC_RANGE() + CUDF_FUNC_RANGE(); CUDF_EXPECTS( user_buffer.size() == user_buffer_size, "Cannot use a device span smaller than the output buffer size configured at instantiation!"); @@ -1860,35 +1836,6 @@ struct contiguous_split_state { return chunk_iter_state->advance_iteration(); } - std::vector make_empty_packed_table() - { - // sanitize the inputs (to handle corner cases like sliced tables) - std::vector empty_column_views; - empty_column_views.reserve(input.num_columns()); - std::transform(input.begin(), - input.end(), - std::back_inserter(empty_column_views), - [](column_view const& col) { return cudf::empty_like(col)->view(); }); - - table_view empty_inputs(empty_column_views); - - // build the empty results - std::vector result; - result.reserve(num_partitions); - auto const iter = thrust::make_counting_iterator(0); - std::transform(iter, - iter + num_partitions, - std::back_inserter(result), - [&empty_inputs](int partition_index) { - return packed_table{empty_inputs, - packed_columns{std::make_unique>( - pack_metadata(empty_inputs, nullptr, 0)), - std::make_unique()}}; - }); - - return result; - } - std::unique_ptr> build_packed_column_metadata() { CUDF_EXPECTS(num_partitions == 1, "build_packed_column_metadata supported only without splits"); @@ -1911,6 +1858,59 @@ struct contiguous_split_state { return std::make_unique>(std::move(mb.build())); } + private: + contiguous_split_state(cudf::table_view const& input, + std::vector const& splits, + std::size_t 
user_buffer_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr, + rmm::mr::device_memory_resource* temp_mr) + : input(input), + user_buffer_size(user_buffer_size), + stream(stream), + mr(mr), + temp_mr(temp_mr), + is_empty{check_inputs(input, splits)}, + num_partitions{splits.size() + 1}, + num_src_bufs{count_src_bufs(input.begin(), input.end())}, + num_bufs{num_src_bufs * num_partitions} + { + // if the table we are about to contig split is empty, we have special + // handling where metadata is produced and a 0-byte contiguous buffer + // is the result. + if (is_empty) { return; } + + // First pass over the source tables to generate a `dst_buf_info` per split and column buffer + // (`num_bufs`). After this, contiguous_split uses `dst_buf_info` to further subdivide the work + // into 1MB batches in `compute_batches` + partition_buf_size_and_dst_buf_info = std::move( + compute_splits(input, splits, num_partitions, num_src_bufs, num_bufs, stream, temp_mr)); + + // Second pass: uses `dst_buf_info` to break down the work into 1MB batches. + chunk_iter_state = compute_batches(num_bufs, + partition_buf_size_and_dst_buf_info->d_dst_buf_info, + partition_buf_size_and_dst_buf_info->h_buf_sizes, + num_partitions, + user_buffer_size, + stream, + temp_mr); + + // allocate output partition buffers, in the non-chunked case + if (user_buffer_size == 0) { + out_buffers.reserve(num_partitions); + auto h_buf_sizes = partition_buf_size_and_dst_buf_info->h_buf_sizes; + std::transform(h_buf_sizes, + h_buf_sizes + num_partitions, + std::back_inserter(out_buffers), + [stream = stream, mr = mr](std::size_t bytes) { + return rmm::device_buffer{bytes, stream, mr}; + }); + } + + src_and_dst_pointers = std::move(setup_src_and_dst_pointers( + input, num_partitions, num_src_bufs, out_buffers, stream, temp_mr)); + } + std::vector make_packed_tables() { if (input.num_columns() == 0) { return std::vector(); } @@ -1948,6 +1948,35 @@ struct contiguous_split_state { return result; } + std::vector make_empty_packed_table() + { + // sanitize the inputs (to handle corner cases like sliced tables) + std::vector empty_column_views; + empty_column_views.reserve(input.num_columns()); + std::transform(input.begin(), + input.end(), + std::back_inserter(empty_column_views), + [](column_view const& col) { return cudf::empty_like(col)->view(); }); + + table_view empty_inputs(empty_column_views); + + // build the empty results + std::vector result; + result.reserve(num_partitions); + auto const iter = thrust::make_counting_iterator(0); + std::transform(iter, + iter + num_partitions, + std::back_inserter(result), + [&empty_inputs](int partition_index) { + return packed_table{empty_inputs, + packed_columns{std::make_unique>( + pack_metadata(empty_inputs, nullptr, 0)), + std::make_unique()}}; + }); + + return result; + } + cudf::table_view const input; std::size_t const user_buffer_size; rmm::cuda_stream_view const stream; @@ -2026,7 +2055,7 @@ chunked_pack::chunked_pack(cudf::table_view const& input, input, user_buffer_size, cudf::get_default_stream(), nullptr, temp_mr); } -// required for the unique_ptr to work with a non-complete type (contiguous_split_state) +// required for the unique_ptr to work with a incomplete type (contiguous_split_state) chunked_pack::~chunked_pack() = default; std::size_t chunked_pack::get_total_contiguous_size() const @@ -2050,9 +2079,6 @@ std::unique_ptr chunked_pack::create(cudf::table_view const& input std::size_t user_buffer_size, rmm::mr::device_memory_resource* temp_mr) { - // 
`temp_mr` could be a special memory resource to be used in situations when - // GPU memory is low and we want scratch and temporary allocations to happen from - // a small reserved pool of memory. return std::make_unique(input, user_buffer_size, temp_mr); } From b8dcff8566948804b6f5c3b46420db3dd909b0f0 Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Thu, 18 May 2023 10:49:27 -0500 Subject: [PATCH 29/31] Fix comment as per review --- cpp/src/copying/contiguous_split.cu | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 37fd6e04c36..060537ebb1b 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -2006,16 +2006,14 @@ struct contiguous_split_state { // contiguous_split will have a single "chunk" in chunk_iter_state, so no iteration. std::unique_ptr chunk_iter_state; - // Two modes are allowed: - // - user provided buffer: as the name implies, the user has provided a buffer that must be at - // least 1MB. - // contiguous_split will behave in a "chunked" mode in this scenario, as it will contiguously - // copy up until the user's buffer size limit, exposing a next() call for the user to invoke. - // Note that in this mode, contig split is not partitioning the original table, it is instead - // only placing cuDF buffers contiguously in the user's buffer. + // Two API usages are allowed: + // - `chunked_pack`: for this mode, the user will provide a buffer that must be at least 1MB. + // The behavior is "chunked" in that it will contiguously copy up until the user specified + // `user_buffer_size` limit, exposing a next() call for the user to invoke. Note that in this + // mode, no partitioning occurs, hence the name "pack". // - // - single shot contiguous_split (default): when the user doesn't provide their own buffer, - // contiguous_split will allocate a buffer per partition and will place contiguous results in + // - `contiguous_split` (default): when the user doesn't provide their own buffer, + // `contiguous_split` will allocate a buffer per partition and will place contiguous results in // each buffer. 
// std::vector out_buffers; From a41db753b51eb6081b0d08188e5f781a025a461f Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Thu, 18 May 2023 11:23:08 -0500 Subject: [PATCH 30/31] Add doxygen coments in contiguous_split_state and chunk_iteration_state --- cpp/src/copying/contiguous_split.cu | 52 +++++++++++++++++------------ 1 file changed, 30 insertions(+), 22 deletions(-) diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 060537ebb1b..8f5a34fb8d9 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -1378,16 +1378,18 @@ struct chunk_iteration_state { */ bool has_more_copies() const { return current_iteration < num_iterations; } - rmm::device_uvector d_batched_dst_buf_info; - rmm::device_uvector const d_batch_offsets; - std::size_t const total_size; - int const num_iterations; - int current_iteration; + rmm::device_uvector d_batched_dst_buf_info; ///< dst_buf_info per 1MB batch + rmm::device_uvector const + d_batch_offsets; ///< Offset within a batch per dst_buf_info + std::size_t const total_size; ///< The aggregate size of all iterations + int const num_iterations; ///< The total number of iterations + int current_iteration; ///< Marks the current iteration being worked on private: - std::size_t starting_batch; - std::vector const h_num_buffs_per_iteration; - std::vector const h_size_of_buffs_per_iteration; + std::size_t starting_batch; ///< Starting batch index for the current iteration + std::vector const h_num_buffs_per_iteration; ///< The count of batches per iteration + std::vector const + h_size_of_buffs_per_iteration; ///< The size in bytes per iteration }; std::unique_ptr chunk_iteration_state::create( @@ -1977,26 +1979,30 @@ struct contiguous_split_state { return result; } - cudf::table_view const input; - std::size_t const user_buffer_size; + cudf::table_view const input; ///< The input table_view to operate on + std::size_t const user_buffer_size; ///< The size of the user buffer for the chunked_pack case rmm::cuda_stream_view const stream; - rmm::mr::device_memory_resource* const mr; - rmm::mr::device_memory_resource* const temp_mr; + rmm::mr::device_memory_resource* const mr; ///< The memory resource for any data returned + + // this resource defaults to `mr` for the contiguous_split case, but it can be useful for the + // `chunked_pack` case to allocate scratch/temp memory in a pool + rmm::mr::device_memory_resource* const temp_mr; ///< The memory resource for scratch/temp space // whether the table was empty to begin with (0 rows or 0 columns) and should be metadata-only - bool const is_empty; + bool const is_empty; ///< True if the source table has 0 rows or 0 columns - std::size_t const num_partitions; + // This can be 1 if `contiguous_split` is just packing and not splitting + std::size_t const num_partitions; ///< The number of partitions to produce - // number of source buffers including children - size_type const num_src_bufs; + size_type const num_src_bufs; ///< Number of source buffers including children - // number of source buffers including children * number of splits - std::size_t const num_bufs; + std::size_t const num_bufs; ///< Number of source buffers including children * number of splits - std::unique_ptr partition_buf_size_and_dst_buf_info; + std::unique_ptr + partition_buf_size_and_dst_buf_info; ///< Per-partition buffer size and destination buffer info - std::unique_ptr src_and_dst_pointers; + std::unique_ptr + src_and_dst_pointers; ///< Src. and dst. 
pointers for `copy_partition` // // State around the chunked pattern @@ -2004,7 +2010,8 @@ struct contiguous_split_state { // chunked_pack will have 1 or more "chunks" to iterate on, defined in chunk_iter_state // contiguous_split will have a single "chunk" in chunk_iter_state, so no iteration. - std::unique_ptr chunk_iter_state; + std::unique_ptr + chunk_iter_state; ///< State object for chunk iteration state // Two API usages are allowed: // - `chunked_pack`: for this mode, the user will provide a buffer that must be at least 1MB. @@ -2016,7 +2023,8 @@ struct contiguous_split_state { // `contiguous_split` will allocate a buffer per partition and will place contiguous results in // each buffer. // - std::vector out_buffers; + std::vector + out_buffers; ///< Buffers allocated for a regular `contiguous_split` }; std::vector contiguous_split(cudf::table_view const& input, From bf208b4a455e3ae2d4baf3099fa0b6cb7afcab6a Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Thu, 18 May 2023 11:30:41 -0500 Subject: [PATCH 31/31] Make packed_split_indices_and_src_buf_info not explicit --- cpp/src/copying/contiguous_split.cu | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 8f5a34fb8d9..2b885650526 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -952,12 +952,12 @@ struct out_to_in_index_function { // packed block of memory 1: split indices and src_buf_info structs struct packed_split_indices_and_src_buf_info { - explicit packed_split_indices_and_src_buf_info(cudf::table_view const& input, - std::vector const& splits, - std::size_t num_partitions, - cudf::size_type num_src_bufs, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* temp_mr) + packed_split_indices_and_src_buf_info(cudf::table_view const& input, + std::vector const& splits, + std::size_t num_partitions, + cudf::size_type num_src_bufs, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* temp_mr) : indices_size( cudf::util::round_up_safe((num_partitions + 1) * sizeof(size_type), split_align)), src_buf_info_size(